harsh-dev committed on
Commit
4225666
·
1 Parent(s): c44ea2b

docker deployment

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .dockerignore +60 -0
  2. .env.example +3 -0
  3. .gitattributes +2 -34
  4. .gitignore +57 -0
  5. CODEBASE_DOCUMENTATION.md +673 -0
  6. College_Overview2.md +29 -0
  7. DOCUMENTATION_PLAN.md +277 -0
  8. Dockerfile +66 -0
  9. LOCAL_MODEL_TRUNCATION_FIX.md +136 -0
  10. MARKDOWN_FIX_SUMMARY.md +118 -0
  11. README.md +65 -10
  12. WHY_LOCAL_NOT_WORKING.md +112 -0
  13. app/__init__.py +0 -0
  14. app/api/__init__.py +5 -0
  15. app/api/dependencies.py +41 -0
  16. app/api/routes/__init__.py +5 -0
  17. app/api/routes/rag.py +186 -0
  18. app/api/routes/settings.py +186 -0
  19. app/api/routes/vector_store.py +311 -0
  20. app/api/schemas/__init__.py +1 -0
  21. app/api/schemas/requests.py +71 -0
  22. app/api/schemas/settings.py +54 -0
  23. app/api/schemas/tests.py +30 -0
  24. app/core/__init__.py +0 -0
  25. app/core/config.py +55 -0
  26. app/core/paths.py +10 -0
  27. app/main.py +21 -0
  28. app/models/__init__.py +0 -0
  29. app/prompts/__init__.py +1 -0
  30. app/prompts/system_prompts.py +112 -0
  31. app/services/__init__.py +2 -0
  32. app/services/classifier_service.py +337 -0
  33. app/services/document_loader.py +34 -0
  34. app/services/file_service.py +198 -0
  35. app/services/filter-demo +197 -0
  36. app/services/filter_classifier copy.py +334 -0
  37. app/services/filter_classifier.py +529 -0
  38. app/services/hybrid_retrieval.py +354 -0
  39. app/services/ingestion_service.py +95 -0
  40. app/services/rag_service.py +483 -0
  41. app/services/text_splitter.py +266 -0
  42. app/services/vector_store.py +67 -0
  43. app/utils/__init__.py +5 -0
  44. app/utils/constants.py +264 -0
  45. app/utils/document_helpers.py +231 -0
  46. app/utils/embeddings.py +11 -0
  47. app/utils/llm_models.py +20 -0
  48. app/utils/model_factory.py +164 -0
  49. app/utils/preprocessing.py +107 -0
  50. app/utils/tests.py +0 -0
.dockerignore ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ─── Python ───────────────────────────────────────────────────────────────────
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.egg-info/
6
+ .eggs/
7
+
8
+ # ─── Virtual environments ─────────────────────────────────────────────────────
9
+ .venv/
10
+ venv/
11
+ env/
12
+
13
+ # ─── Environment / secrets ────────────────────────────────────────────────────
14
+ .env
15
+ .env.*
16
+
17
+ # ─── Git ──────────────────────────────────────────────────────────────────────
18
+ .git/
19
+ .gitignore
20
+
21
+ # ─── Large LLM model files (not needed — Gemini-only mode) ───────────────────
22
+ ml_models/llm/
23
+ ml_models/embeddings/bge-small/
24
+
25
+ # ─── Dev/test files not needed in production ──────────────────────────────────
26
+ tests/
27
+ docs/
28
+ results/
29
+ temp/
30
+ old/
31
+ scripts/
32
+ dump/
33
+
34
+ # ─── Root-level scratch/demo scripts ─────────────────────────────────────────
35
+ bm25.py
36
+ cfs.py
37
+ classifier-demo.py
38
+ fileService.py
39
+ hybrid_search.py
40
+ rewrite_query.py
41
+ testSearch.py
42
+ test_json_spliting.py
43
+ test_markdown_splitter.py
44
+
45
+ # ─── Large PDF files ──────────────────────────────────────────────────────────
46
+ *.pdf
47
+
48
+ # ─── Documentation ────────────────────────────────────────────────────────────
49
+ *.md
50
+ !README.md
51
+
52
+ # ─── IDE / OS ─────────────────────────────────────────────────────────────────
53
+ .vscode/
54
+ .idea/
55
+ *.swp
56
+ .DS_Store
57
+ Thumbs.db
58
+
59
+ # ─── Second requirements file (unused) ───────────────────────────────────────
60
+ req.txt
.env.example ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ GOOGLE_API_KEY=
2
+ LLM_PROVIDER=gemini # or "local"
3
+ ENABLE_FALLBACK=true
.gitattributes CHANGED
@@ -1,38 +1,6 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
  *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  *.gguf filter=lfs diff=lfs merge=lfs -text
37
  *.sqlite3 filter=lfs diff=lfs merge=lfs -text
38
  *.pdf filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  *.safetensors filter=lfs diff=lfs merge=lfs -text
2
+ *.pkl filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
4
  *.gguf filter=lfs diff=lfs merge=lfs -text
5
  *.sqlite3 filter=lfs diff=lfs merge=lfs -text
6
  *.pdf filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /models
2
+ .venv
3
+ cache
4
+ __pycache__
5
+ .env
6
+ Qwen2.5-0.5B-Instruct-Q4_K_M.gguf
7
+
8
+ # Python
9
+ __pycache__/
10
+ *.py[cod]
11
+ *$py.class
12
+ .venv/
13
+ venv/
14
+ *.egg-info/
15
+
16
+ # Environment
17
+ .env
18
+
19
+ # Data (runtime files)
20
+ data/
21
+ ml_models/
22
+
23
+ # IDE
24
+ .vscode/
25
+ .idea/
26
+ *.swp
27
+
28
+ # Temporary files
29
+ temp/
30
+ *.tmp
31
+
32
+ # OS
33
+ .DS_Store
34
+ Thumbs.db
35
+
36
+ # ML model files (large binary files)
37
+ ml_models/**/*.gguf
38
+ ml_models/**/*.bin
39
+ ml_models/**/*.safetensors
40
+
41
+ # Keep directory structure
42
+ !ml_models/.gitkeep
43
+ !ml_models/llm/.gitkeep
44
+
45
+
46
+ # Exceptions: re-include the paths below so they stay tracked despite the ignore rules above
47
+ !data/
48
+ !data/vector_stores/
49
+ !data/vector_stores/classifier_test_1/
50
+ !data/vector_stores/classifier_test_1/**
51
+
52
+ !ml_models/
53
+ !ml_models/classifier/
54
+ !ml_models/classifier/chatbot_classifier.pkl
55
+ !ml_models/embeddings/mdbr-leaf-mt/
56
+ !ml_models/embeddings/mdbr-leaf-mt/**
57
+ !data/classifier_test_1.json
CODEBASE_DOCUMENTATION.md ADDED
@@ -0,0 +1,673 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # VGEC RAG Chatbot — Codebase Documentation
2
+
3
+ > **Generated:** 2026-03-25
4
+ > **Version:** 1.0.0
5
+ > **Scope:** Full system — ingestion, retrieval, classification, API, evaluation
6
+
7
+ ---
8
+
9
+ ## Table of Contents
10
+
11
+ 1. [Project Overview](#1-project-overview)
12
+ 2. [System Architecture](#2-system-architecture)
13
+ 3. [Schema & Data Model](#3-schema--data-model)
14
+ 4. [Retrieval Pipeline](#4-retrieval-pipeline)
15
+ 5. [Key Classes & Modules](#5-key-classes--modules)
16
+ 6. [Evaluation & Metrics](#6-evaluation--metrics)
17
+ 7. [Known Limitations](#7-known-limitations)
18
+ 8. [File Structure](#8-file-structure)
19
+
20
+ ---
21
+
22
+ ## 1. Project Overview
23
+
24
+ ### Purpose
25
+
26
+ **VGEC RAG Chatbot** is a Retrieval-Augmented Generation (RAG) chatbot for **Vishwakarma Government Engineering College (VGEC), Chandkheda, Gujarat**. It allows students, faculty, and visitors to query structured information about the institution — departments, faculty, syllabus, labs, intake capacity, and more — through natural language.
27
+
28
+ ### Domain
29
+
30
+ - **Institution:** VGEC (Government Engineering College, Gujarat)
31
+ - **Data Coverage:** Department-level information for multiple disciplines (Computer Engineering, Civil, Electrical, IT, ECE, etc.)
32
+ - **Topics:** Faculty lists, lab facilities, syllabus details, HOD info, research activities, intake capacity, achievements
33
+
34
+ ### Tech Stack
35
+
36
+ | Layer | Technology |
37
+ |---|---|
38
+ | **API Framework** | FastAPI |
39
+ | **Vector Database** | ChromaDB (persistent, local) |
40
+ | **Embeddings** | Google `gemini-embedding-001` (via `langchain-google-genai`) |
41
+ | **LLM (Cloud)** | Google Gemini `gemini-2.5-flash-lite` |
42
+ | **LLM (Local)** | `EXAONE-3.5-2.4B-Instruct-Q4_K_M.gguf` via `llama-cpp-python` |
43
+ | **NLP / Preprocessing** | spaCy (`en_core_web_sm`), NLTK (PorterStemmer) |
44
+ | **Classifier** | Scikit-learn `LogisticRegression` + `SentenceTransformer` (`MongoDB/mdbr-leaf-mt`) |
45
+ | **BM25** | `langchain-community` `BM25Retriever` |
46
+ | **Chunking** | LangChain `RecursiveCharacterTextSplitter` |
47
+ | **Config** | Pydantic `BaseSettings` (`.env`-backed) |
48
+
49
+ ### Key Features Implemented
50
+
51
+ - ✅ Structured JSON ingestion with intent-aware chunking
52
+ - ✅ Hybrid retrieval: BM25 + vector search fused via Reciprocal Rank Fusion (RRF)
53
+ - ✅ Intent/metadata classification with confidence-gated ChromaDB filters
54
+ - ✅ Abbreviation expansion (`CE` → `Computer Engineering`, etc.)
55
+ - ✅ Multi-turn conversation history support
56
+ - ✅ Dual LLM backend with automatic fallback (Gemini ↔ Local)
57
+ - ✅ Full CRUD REST API for vector store management
58
+ - ✅ Offline evaluation endpoint (MRR, hit rate, noise rate)
59
+ - ✅ Classifier accuracy evaluation endpoint
60
+
61
+ ---
62
+
63
+ ## 2. System Architecture
64
+
65
+ ### Component Diagram
66
+
67
+ ```
68
+ ┌──────────────────────────┐
69
+ │ FastAPI App │
70
+ │ /api/v1/rag /vector │
71
+ └──────────┬───────────────┘
72
+ │ DI (lru_cache)
73
+ ┌──────────▼───────────────┐
74
+ │ RAGService │
75
+ │ (core orchestrator) │
76
+ └──┬───────────┬────────────┘
77
+ │ │
78
+ ┌─────────────▼──┐ ┌───▼──────────────────┐
79
+ │ IngestionService│ │ HybridRetrievalService│
80
+ │ (write path) │ │ (read path) │
81
+ └──────┬──────── ┘ └───┬──────────┬─────── ┘
82
+ │ │ │
83
+ ┌──────────▼──┐ ┌──────────▼──┐ ┌────▼──────────┐
84
+ │ FileService │ │ ClassifierSvc│ │ VectorStore │
85
+ │ (file +meta) │ │(clf predict) │ │ (ChromaDB) │
86
+ └──────────────┘ └─────────────┘ └───────────────┘
87
+ ```
88
+
89
+ ### Data Flow
90
+
91
+ #### Ingestion Path
92
+
93
+ ```
94
+ File Upload (PDF/MD/TXT/JSON)
95
+
96
+
97
+ FileService.read_file() ← type-aware loading (PyMuPDF for PDF)
98
+ │ returns: Document + metadata
99
+
100
+ FileService.write_file() ← persist copy to data/documents/
101
+
102
+
103
+ IngestionService.handle_*_docs() ← route by file extension
104
+
105
+ ├─ JSON → handle_json_docs() ← intent-aware chunks (list / detail / count)
106
+ └─ text → handle_text_docs() ← RecursiveCharacterTextSplitter + normalize()
107
+
108
+
109
+ VectorStore.add_documents() ← embed + upsert into ChromaDB
110
+
111
+
112
+ FileService.patch_metadata() ← update ingestion record JSON (chunk count, timing, size)
113
+ ```
114
+
115
+ #### Query Path
116
+
117
+ ```
118
+ User Question
119
+
120
+
121
+ preprocess_query() ← normalize(): tokenize + strip blanks (no stopword removal or lemmatization yet)
122
+
123
+
124
+ HybridRetrievalService.retrieve()
125
+
126
+ ├─ clf.expand_abbreviations() ← CE → Computer Engineering
127
+ ├─ clf.predict_with_filter() ← LogReg predict → Chroma $and/$or filter
128
+ ├─ _vector_rank() ← ChromaDB similarity_search_with_score (k=15)
129
+ ├─ _bm25_rank() ← BM25 over the vector candidate pool
130
+ ├─ _reciprocal_rank_fusion() ← weighted RRF merge
131
+ ├─ metadata score boosting ← multiply fused scores for confident matches
132
+ └─ _apply_title_boost() ← per-query-word title match bonus
133
+
134
+
135
+ get_references_v2() ← filter by threshold, build context string
136
+
137
+
138
+ LLM.invoke(prompt) ← Gemini or local LlamaCpp
139
+
140
+
141
+ Return: { answer, references, context, threshold_used, k_used }
142
+ ```
143
+
144
+ ### External Dependencies
145
+
146
+ | Dependency | Role | Provider |
147
+ |---|---|---|
148
+ | ChromaDB | Persistent vector store | Local disk |
149
+ | Google Gemini API | Embeddings + LLM generation | Google Cloud |
150
+ | LlamaCpp (GGUF model) | Local LLM fallback | Local CPU |
151
+ | Sentence Transformers | Classifier feature extraction | HuggingFace Hub |
152
+ | spaCy `en_core_web_sm` | POS tagging / lemmatization | Local |
153
+
154
+ ---
155
+
156
+ ## 3. Schema & Data Model
157
+
158
+ ### Source JSON Format
159
+
160
+ Source data files (e.g. `computer_eng.json`) follow this schema:
161
+
162
+ ```json
163
+ {
164
+ "id": "computer-engineering-department",
165
+ "name": "Computer Engineering Department",
166
+ "source": "https://www.vgecg.ac.in/department.php?dept=3",
167
+ "category": "computer_eng",
168
+ "type": "department",
169
+ "created_date": "2026-02-19",
170
+ "content": {
171
+ "<topic_key>": {
172
+ "list": ["item 1", "item 2", "..."],
173
+ "details": "Paragraph describing the topic."
174
+ }
175
+ }
176
+ }
177
+ ```
178
+
179
+ **Top-level fields:**
180
+
181
+ | Field | Type | Description |
182
+ |---|---|---|
183
+ | `id` | string | Unique document identifier |
184
+ | `name` | string | Human-readable institution/department name |
185
+ | `source` | string | Authoritative URL |
186
+ | `category` | string | Department slug (e.g. `computer_eng`) |
187
+ | `type` | string | Document type (e.g. `department`) |
188
+ | `created_date` | string (ISO) | Data creation date |
189
+ | `content` | object | Topic map; each key = a topic |
190
+
191
+ ### Chunk Metadata Schema (stored in ChromaDB)
192
+
193
+ Every vector chunk stored in Chroma carries the following metadata:
194
+
195
+ | Field | Type | Source |
196
+ |---|---|---|
197
+ | `id` | string (UUID) | Auto-generated |
198
+ | `title` | string | Document name / topic key |
199
+ | `source` | string | Source URL |
200
+ | `source_file` | string | Filename (e.g. `computer_eng.json`) |
201
+ | `type` | string | Taxonomy level 1 (e.g. `department`) |
202
+ | `category` | string | Taxonomy level 2 (e.g. `computer_eng`) |
203
+ | `topic` | string | Taxonomy level 3 (e.g. `faculty`) |
204
+ | `intent` | string | Chunk intent: `list`, `detail`, or `count` |
205
+ | `chunk_index` | int | Sequential index within file |
206
+ | `created_date` | string (ISO) | Ingestion timestamp |
207
+ | `updated_at` | string (ISO) | Last modification timestamp |
208
+ | `ext` | string | Source file extension (`json`, `pdf`, `md`, `txt`) |
209
+
210
+ ### Hierarchical Taxonomy
211
+
212
+ The classifier's predictions and the ChromaDB filters operate on a 3-level hierarchy (`type` → `category` → `topic`), with an `intent` attached to each topic:
213
+
214
+ ```
215
+ type
216
+ └── category
217
+ └── topic
218
+ └── intent (list | detail | count)
219
+ ```
220
+
221
+ **Example mapping (Computer Engineering):**
222
+
223
+ ```
224
+ type: "department"
225
+ └── category: "computer_eng"
226
+ ├── topic: "faculty" → intent: list | detail
227
+ ├── topic: "lab" → intent: list | detail
228
+ ├── topic: "syllabus" → intent: list | detail
229
+ ├── topic: "hod" → intent: list | detail
230
+ ├── topic: "intake" → intent: list | detail
231
+ ├── topic: "research" → intent: list | detail
232
+ └── topic: "achievements"
233
+ ```
234
+
235
+ ### Document Chunking Strategy
236
+
237
+ **JSON documents** use a hand-crafted, intent-aware strategy in `IngestionService.handle_json_docs()`:
238
+
239
+ | Intent | Chunk Content | Metadata |
240
+ |---|---|---|
241
+ | `list` | Numbered list: `1. item\n2. item\n...` | `intent=list` |
242
+ | `count` | `"Total <topic>: N"` (auto-generated) | `intent=count` |
243
+ | `detail` | Raw paragraph text | `intent=detail` |
244
+
245
+ **Text/PDF/Markdown documents** use `RecursiveCharacterTextSplitter`:
246
+ - Default: `chunk_size=500`, `chunk_overlap=100`
247
+ - Separator priority: `\n\n` → `\n` → ` ` → (character)
248
+ - Markdown variant respects `---` section delimiters
249
+ - Content is passed through `normalize()` (tokenize + strip blanks) before storage
250
+
251
+ ---
252
+
253
+ ## 4. Retrieval Pipeline
254
+
255
+ ### Query Processing Flow
256
+
257
+ ```python
258
+ # Step 1: Normalize input
259
+ question = preprocess_query(question)
260
+ # → currently normalize() only: tokenize + strip blanks (no POS filter, lemmatization, or stopword removal)
261
+
262
+ # Step 2: Expand abbreviations
263
+ processed_query = clf.expand_abbreviations(query)
264
+ # → "CE dept" → "computer engineering department"
265
+
266
+ # Step 3: Classify intent/metadata
267
+ filters = clf.predict_with_filter([processed_query])
268
+ # → {"$and": [{"type": "department"}, {"intent": "list"}, {"$or": [...]}]}
269
+
270
+ # Step 4: Vector search with optional filter
271
+ raw_results = chroma.similarity_search_with_score(query, k=15, filter=filters)
272
+ # Fallback: if filtered results empty, retry without filter
273
+
274
+ # Step 5: BM25 re-rank over vector candidates
275
+ bm25_results = BM25Retriever.from_documents(candidate_docs)
276
+
277
+ # Step 6: RRF fusion
278
+ fused_score(d) = bm25_weight * 1/(rrf_k + rank_bm25)
279
+ + vector_weight * 1/(rrf_k + rank_vec)
280
+
281
+ # Step 7: Metadata confidence boosting
282
+ if doc.metadata[field] == predicted_val and conf > 0.90:
283
+ result.fused_score *= boost_factor # 1.10–1.20
284
+
285
+ # Step 8: Title word boost
286
+ for word in query_words:
287
+ if word in doc.title:
288
+ result.fused_score += title_boost_per_word # 0.004
289
+
290
+ # Step 9: Threshold filter + sort + top-k
291
+ results = [r for r in results if r.fused_score >= threshold]
292
+ ```
293
+
294
+ ### Classifier Thresholds
295
+
296
+ The `Classifier` uses two separate threshold tables:
297
+
298
+ **Prediction threshold** — below this, the field is set to `None` (not used at all):
299
+
300
+ | Field | Threshold |
301
+ |---|---|
302
+ | `type` | 0.40 |
303
+ | `category` | 0.40 |
304
+ | `topic` | 0.50 |
305
+ | `intent` | 0.60 |
306
+
307
+ **Filter threshold** — at or above this, the field is included in the ChromaDB filter (`type` as a hard `$and` anchor; `category` and `topic` as soft `$or` hints):
308
+
309
+ | Field | Threshold |
310
+ |---|---|
311
+ | `type` | 0.65 |
312
+ | `category` | 0.65 |
313
+ | `topic` | 0.70 |
314
+
315
+ ### Filter Construction Logic (`_build_filter`)
316
+
317
+ ```python
318
+ # Gate: if type confidence < 0.65 → return None (full scan)
319
+ # Hard anchors (always included if type passes):
320
+ # - type == predicted_type
321
+ # - intent == predicted_intent (special: "count" expands to count OR detail)
322
+ # Soft hints (combined as $or):
323
+ # - category == predicted_category (if conf >= 0.65, else "general")
324
+ # - topic == predicted_topic (if conf >= 0.70, else "general")
325
+ ```
326
+
327
+ ### Hybrid Retrieval Config (Defaults)
328
+
329
+ | Parameter | `hybrid_query` | `search_docs` |
330
+ |---|---|---|
331
+ | `candidate_k` | 15 | 15 |
332
+ | `top_k` (final) | `settings.similarity_top_k` (8) | k (param) |
333
+ | `bm25_weight` | 0.45 | 0.70 |
334
+ | `vector_weight` | 0.55 | 0.30 |
335
+ | `rrf_k` | 20 | 20 |
336
+ | `bm25_k1` | 1.2 | 1.5 |
337
+ | `bm25_b` | 0.9 | 0.75 |
338
+ | `title_boost_per_word` | 0.004 | 0.004 |
339
+ | `score_threshold` | 0.4 | 0.4 |
340
+
341
+ > **Note:** `search_docs` is BM25-heavy (0.70) since it is used for keyword-oriented document browsing, while `hybrid_query` slightly favours vector similarity (0.55) for semantic QA.
342
+
343
+ ---
344
+
345
+ ## 5. Key Classes & Modules
346
+
347
+ ### Services (`app/services/`)
348
+
349
+ #### `RAGService`
350
+
351
+ Main orchestrator. Singleton via `lru_cache` in `dependencies.py`.
352
+
353
+ | Method | Description |
354
+ |---|---|
355
+ | `query()` | Semantic-only QA (vector search → LLM) |
356
+ | `hybrid_query()` | Hybrid QA (BM25 + vector → RRF → LLM) |
357
+ | `search_docs()` | BM25-heavy document search, no LLM |
358
+ | `ingest_documents()` | Ingest a file path into the vector store |
359
+ | `get_filenames()` | Return all tracked file metadata records |
360
+ | `test_queries()` | Batch retrieval evaluation (MRR, precision, noise) |
361
+ | `test_classifier()` | Batch classifier accuracy evaluation |
362
+ | `delete_database()` | Drop the entire ChromaDB collection |
363
+
364
+ #### `HybridRetrievalService`
365
+
366
+ Stateless per-request service created inline by `RAGService`.
367
+
368
+ | Method | Description |
369
+ |---|---|
370
+ | `retrieve(query)` | Full hybrid retrieval pipeline; returns `List[RetrievalResult]` |
371
+ | `_vector_rank()` | Chroma similarity search + classifier filter |
372
+ | `_bm25_rank()` | BM25 over candidate pool |
373
+ | `_reciprocal_rank_fusion()` | Merge both ranked lists via RRF |
374
+ | `_apply_title_boost()` | Word-level title match score bonus |
375
+
376
+ **`RetrievalResult` dataclass:**
377
+
378
+ ```python
379
+ @dataclass
380
+ class RetrievalResult:
381
+ document: Document
382
+ fused_score: float
383
+ bm25_rank: Optional[int]
384
+ vector_rank: Optional[int]
385
+ title_boost: float
386
+ ```
387
+
388
+ #### `Classifier`
389
+
390
+ Loaded at startup from a pickled pipeline (`chatbot_classifier.pkl`).
391
+
392
+ | Method | Description |
393
+ |---|---|
394
+ | `predict(queries)` | Returns list of `{type, category, topic, intent, *_conf}` dicts |
395
+ | `predict_with_filter(queries)` | Returns a ChromaDB-compatible filter dict or `None` |
396
+ | `expand_abbreviations(text)` | Regex-based abbreviation expansion |
397
+ | `get_features(queries)` | Build `[SentenceTransformer embedding | TF-IDF]` feature matrix |
398
+ | `train_models(df)` | Train 4 LogisticRegression classifiers (offline use) |
399
+
400
+ #### `IngestionService`
401
+
402
+ | Method | Description |
403
+ |---|---|
404
+ | `ingest(file_path)` | Load + chunk a file; returns `List[Document]` |
405
+ | `handle_json_docs()` | Intent-aware chunking for structured JSON data |
406
+ | `handle_text_docs()` | Recursive character splitting for unstructured text |
407
+ | `get_records()` | Delegate to `FileService.get_records()` |
408
+ | `delete_record(filename)` | Remove a file's metadata record |
409
+ | `path_record(path, metadata)` | Patch ingestion stats after indexing |
410
+
411
+ #### `FileService`
412
+
413
+ | Method | Description |
414
+ |---|---|
415
+ | `read_file(path)` | Load file content; dispatches by extension |
416
+ | `write_file(path, content, metadata)` | Persist file to `data/documents/` |
417
+ | `patch_metadata(path, metadata)` | Merge new fields into existing record |
418
+ | `get_records()` | Return all ingestion records dict |
419
+ | `delete_record(filename)` | Remove a record from `<collection>.json` |
420
+
421
+ #### `VectorStore`
422
+
423
+ Thin wrapper around `langchain_chroma.Chroma`.
424
+
425
+ | Method | Description |
426
+ |---|---|
427
+ | `get()` | Retrieve all documents |
428
+ | `get_by_id(ids)` | Retrieve specific documents by ID |
429
+ | `add_documents(docs)` | Embed + insert, skipping empty chunks |
430
+ | `update_document(id, doc)` | Delete then re-insert with same ID |
431
+ | `delete(ids)` | Remove documents by ID list |
432
+ | `similarity_search_with_score()` | Wrapped Chroma search |
433
+
434
+ ### Utilities (`app/utils/`)
435
+
436
+ #### `preprocessing.py`
437
+
438
+ | Function | Description |
439
+ |---|---|
440
+ | `preprocess(text)` | spaCy POS filter + lemmatize + stopword removal → joined string |
441
+ | `normalize(text)` | Tokenize + strip blanks (lightweight, no POS) |
442
+ | `preprocess_query(query)` | Applies `normalize()` to user queries |
443
+ | `preprocess_documents(docs)` | Applies `preprocess()` to a document list in-place |
444
+ | `preprocess_filename(path)` | Sanitize filename (remove special chars, lowercase) |
445
+
446
+ #### `document_helpers.py`
447
+
448
+ | Function | Description |
449
+ |---|---|
450
+ | `get_references_v2(docs, threshold)` | Convert `RetrievalResult` list → references dict + context string |
451
+ | `get_references(docs, threshold)` | Same for raw `(Document, distance)` tuples (used by `query()`) |
452
+ | `build_metadata(path)` | Parse YAML frontmatter from `.md`/`.txt` files |
453
+ | `create_documents(chunks, ...)` | Attach standard metadata (UUID, timestamps, indices) to chunks |
454
+ | `create_documents_from_text(text)` | Full pipeline: frontmatter parse → split → metadata attach |
455
+ | `clean_metadata(metadata)` | Serialize datetime, coerce non-allowed types to string |
456
+
457
+ #### `model_factory.py`
458
+
459
+ | Function | Description |
460
+ |---|---|
461
+ | `get_embedding_model()` | Returns `GoogleGenerativeAIEmbeddings` |
462
+ | `get_gemini_model()` | Returns `ChatGoogleGenerativeAI` |
463
+ | `get_local_model()` | Returns `ChatLlamaCpp` (GGUF, CPU inference) |
464
+ | `get_llm_model(provider)` | Dispatches to Gemini or Local with fallback logic |
465
+
466
+ ### API Routes (`app/api/routes/`)
467
+
468
+ #### `rag.py` — prefix `/api/v1/rag`
469
+
470
+ | Method | Endpoint | Description |
471
+ |---|---|---|
472
+ | GET | `/` | Health check |
473
+ | POST | `/` | Semantic query |
474
+ | POST | `/hybrid_query` | Hybrid RAG query (primary endpoint) |
475
+ | POST | `/similarity_search` | Hybrid retrieval, no LLM response |
476
+ | POST | `/search` | BM25-heavy document search |
477
+ | POST | `/test` | Batch retrieval evaluation |
478
+ | POST | `/test_classifier` | Classifier accuracy evaluation |
479
+ | GET | `/test_classifier_dataset` | Run built-in test dataset, cache result |
480
+
481
+ #### `vector_store.py` — prefix `/api/v1/vector`
482
+
483
+ | Method | Endpoint | Description |
484
+ |---|---|---|
485
+ | GET | `/` | List all documents (paginated, filterable) |
486
+ | GET | `/filenames` | List ingested file records |
487
+ | GET | `/{id}` | Get single document by ChromaDB ID |
488
+ | POST | `/` | Upload + ingest file |
489
+ | PUT | `/{id}` | Update document content/metadata |
490
+ | DELETE | `/ids` | Bulk delete by ID list |
491
+ | DELETE | `/{id}` | Delete single document |
492
+ | DELETE | `/` | Filter-based delete (filename/source/contains) |
493
+
494
+ ### Configuration (`app/core/config.py`)
495
+
496
+ All settings are read from `.env` via Pydantic `BaseSettings`:
497
+
498
+ ```python
499
+ class Settings(BaseSettings):
500
+ # Paths
501
+ collection_name: str = "classifier_test_1"
502
+ persist_directory: str = "./data/vector_stores/classifier_test_1"
503
+
504
+ # Chunking
505
+ chunk_size: int = 500
506
+ chunk_overlap: int = 100
507
+
508
+ # Retrieval
509
+ similarity_top_k: int = 8
510
+ similarity_threshold: float = 0.4
511
+
512
+ # LLM Provider
513
+ llm_provider: Literal["gemini", "local"] = "local"
514
+ enable_fallback: bool = True
515
+
516
+ # Models
517
+ embedding_model_name: str = "models/gemini-embedding-001"
518
+ gemini_model_name: str = "gemini-2.5-flash-lite"
519
+ local_model_name: str = "EXAONE-3.5-2.4B-Instruct-Q4_K_M.gguf"
520
+
521
+ # Generation
522
+ max_output_tokens: int = 2048
523
+ local_max_tokens: int = 512
524
+
525
+ # Auth
526
+ google_api_key: str # required — must be in .env
527
+ ```
528
+
529
+ ---
530
+
531
+ ## 6. Evaluation & Metrics
532
+
533
+ ### Retrieval Evaluation (`test_queries` / `POST /api/v1/rag/test`)
534
+
535
+ Tests each (question, expected_document, expected_chunk_index) triple against `hybrid_query`:
536
+
537
+ | Metric | Formula | Interpretation |
538
+ |---|---|---|
539
+ | **Hit Rate** | `hits / total` | % of questions where the exact chunk was retrieved |
540
+ | **Top-1 Hit Rate** | `rank==1 hits / total` | % of questions where exact chunk was top result |
541
+ | **MRR** | `mean(1/rank)` | Mean Reciprocal Rank; higher = correct result ranked earlier |
542
+ | **Doc Precision** | `correct_source_chunks / all_chunks` | How many retrieved chunks came from the right document |
543
+ | **Doc Recall** | `1 if any correct_source_chunk else 0` | Did we retrieve at least one chunk from the right document? |
544
+ | **Doc Noise** | `wrong_source_chunks / all_chunks` | Proportion of off-topic chunks in the result set |
545
+ | **Error Rate** | `1 - hit_rate` | Miss rate for exact chunk retrieval |
546
+
547
+ **Test Input Schema:**
548
+
549
+ ```python
550
+ class TestRequestSchema(BaseModel):
551
+ tests: List[Test] # question + document + chunk_index
552
+ k: int = 5
553
+ threshold: float = 0.4
554
+ ```
555
+
556
+ ### Classifier Evaluation (`test_classifier` / `POST /api/v1/rag/test_classifier`)
557
+
558
+ Evaluates predictions for all 4 classification fields (`type`, `category`, `topic`, `intent`):
559
+
560
+ | Metric | Notes |
561
+ |---|---|
562
+ | **Accuracy** | `sklearn.accuracy_score` |
563
+ | **Precision (macro)** | `zero_division=0` |
564
+ | **Recall (macro)** | `zero_division=0` |
565
+ | **F1 Macro** | Unweighted average across classes |
566
+ | **F1 Weighted** | Class-frequency weighted |
567
+ | **Classification Report** | Full per-class breakdown (`output_dict=True`) |
568
+
569
+ A bundled test dataset is stored in `app/utils/tests.py` as `classifier_test_dataset` and can be executed via `GET /api/v1/rag/test_classifier_dataset`. Results are **memoized** on the `RAGService.evaluation` dict for the lifetime of the server process.
570
+
571
+ ---
572
+
573
+ ## 7. Known Limitations
574
+
575
+ ### Technical Debt
576
+
577
+ - **`preprocess_query` is incomplete.** The function body contains an LLM-powered query-rewriting block that is commented out. Currently it only calls `normalize()` (tokenize only), so no stopword removal or lemmatization is applied to user queries (only to stored documents).
578
+ - **`search_docs` does not honour `filename` as a metadata filter in Chroma.** The filter is applied in Python post-retrieval, which is inefficient for large collections.
579
+ - **Count intent is synthetic.** The `"Total <topic>: N"` chunk is auto-generated during ingestion rather than taken from the source document. If the source data changes, stale count chunks can remain indexed.
580
+ - **`VectorStore.get_dict()` has a `print(type(rows))`** debug statement left in production code.
581
+ - **`FileService.__init__` docstring** has a stray extra backtick (`` ` ``) in the class docstring text.
582
+
583
+ ### Planned but Unimplemented
584
+
585
+ - **Query rewriting via local LLM** — skeleton is commented out in `preprocess_query()`.
586
+ - **Semantic caching** — no query result memoization at the API layer.
587
+ - **Re-ranker** — no cross-encoder re-ranking step; relies only on RRF + boosting.
588
+ - **`topic` field is not included in the ChromaDB hard filter** — only `type` + `intent` are hard-anchored; `category` and `topic` are soft `$or` hints.
589
+
590
+ ### Performance Bottlenecks
591
+
592
+ - **Local LLM (LlamaCpp)** is CPU-only with `n_ctx=8096` and `n_threads=4`. Response latency is high (~10–30s) on low-RAM systems.
593
+ - **Classifier uses `SentenceTransformer` + `TF-IDF` features** — inference runs on every request with no caching of query embeddings.
594
+ - **BM25 corpus is rebuilt from scratch per request** — `BM25Retriever.from_documents()` is called inside `_bm25_rank()` each time.
595
+ - **`classifier_test_dataset` in `app/utils/tests.py`** lives in a very large file (1.8MB) that is loaded at import time.
596
+ - **The memoized evaluation** in `rag_service.evaluation` is not thread-safe, and because it lives in process memory it is not shared between workers if the server runs with multiple worker processes.
597
+
598
+ ---
599
+
600
+ ## 8. File Structure
601
+
602
+ ```
603
+ VGEC-RAG-Chatbot/
604
+
605
+ ├── app/ # Application package
606
+ │ ├── main.py # FastAPI app, router mounting, CORS middleware
607
+ │ ├── core/
608
+ │ │ ├── config.py # Pydantic Settings (all tuneable params)
609
+ │ │ └── paths.py # Path constants helper
610
+ │ │
611
+ │ ├── api/
612
+ │ │ ├── dependencies.py # lru_cache singleton for RAGService
613
+ │ │ ├── routes/
614
+ │ │ │ ├── rag.py # /rag endpoints (query, test, classifier)
615
+ │ │ │ ├── vector_store.py # /vector endpoints (CRUD for ChromaDB)
616
+ │ │ │ └── settings.py # /settings endpoints
617
+ │ │ └── schemas/
618
+ │ │ ├── requests.py # RAGRequest, PaginationParams, etc.
619
+ │ │ └── tests.py # TestRequestSchema, TestClassifierReqSchema
620
+ │ │
621
+ │ ├── services/
622
+ │ │ ├── rag_service.py # RAGService (main orchestrator)
623
+ │ │ ├── hybrid_retrieval.py # HybridRetrievalService + RRF logic
624
+ │ │ ├── classifier_service.py # Classifier class + singleton clf
625
+ │ │ ├── ingestion_service.py # IngestionService (chunking pipeline)
626
+ │ │ ├── file_service.py # FileService (file I/O + metadata JSON)
627
+ │ │ ├── vector_store.py # VectorStore (thin ChromaDB wrapper)
628
+ │ │ ├── text_splitter.py # TextSplitter (RecursiveCharacter + variants)
629
+ │ │ └── document_loader.py # (legacy loader, not in primary path)
630
+ │ │
631
+ │ ├── utils/
632
+ │ │ ├── preprocessing.py # preprocess(), normalize(), preprocess_query()
633
+ │ │ ├── document_helpers.py # get_references_v2(), build_metadata(), create_documents()
634
+ │ │ ├── model_factory.py # get_llm_model(), get_embedding_model()
635
+ │ │ ├── constants.py # stopwords list, short_words_mappings
636
+ │ │ ├── embeddings.py # (thin embedding util)
637
+ │ │ ├── llm_models.py # (thin LLM util)
638
+ │ │ └── tests.py # classifier_test_dataset (large, 1.8MB)
639
+ │ │
640
+ │ └── prompts/
641
+ │ └── __init__.py # SYSTEM_PROMPT, wrap_exaone()
642
+
643
+ ├── ml_models/
644
+ │ ├── classifier/
645
+ │ │ └── chatbot_classifier.pkl # Pickled pipeline (models, tfidf, label encoders, etc.)
646
+ │ ├── embeddings/ # (Local embedding model weights, if any)
647
+ │ └── llm/
648
+ │ └── EXAONE-3.5-2.4B-*.gguf # Local LLM weights
649
+
650
+ ├── data/
651
+ │ ├── department_data/ # Source JSON files per department
652
+ │ │ ├── computer_eng.json
653
+ │ │ ├── civil.json
654
+ │ │ └── ...
655
+ │ ├── documents/ # Persistent copies of ingested files
656
+ │ ├── vector_stores/
657
+ │ │ └── classifier_test_1/ # ChromaDB persist directory
658
+ │ ├── classifier_test_1.json # Ingestion metadata registry (FileService records)
659
+ │ └── other_data/ # Misc data files
660
+
661
+ ├── temp/ # Staging area for uploaded files (auto-cleared)
662
+ ├── scripts/ # Offline scripts (training, testing)
663
+ ├── tests/ # Test files
664
+
665
+ ├── requirements.txt # Pinned production dependencies
666
+ ├── .env # Runtime secrets (google_api_key, etc.)
667
+ ├── .env.example # Template for .env
668
+ └── CODEBASE_DOCUMENTATION.md # This file
669
+ ```
670
+
671
+ ---
672
+
673
+ *End of documentation.*
College_Overview2.md ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Vishwakarma Government Engineering College
3
+ source_url: https://www.vgecg.ac.in/index.php
4
+ domain: https://www.vgecg.ac.in
5
+ pathname: /index.php
6
+ visited: 2026-02-15T12:55:40.751Z
7
+ topic: College Overview
8
+ ---
9
+
10
+ # College Statistics
11
+
12
+ Description:
13
+ This page provides some key statistics about Vishwakarma Government Engineering College.
14
+
15
+ - **Publication:** 48046+
16
+ - **Research Labs:** 13+
17
+ - **Courses:** 12+
18
+ - **Highest Package (Lacs):** 23+
19
+
20
+ Source:
21
+ - https://www.vgecg.ac.in/index.php
22
+
23
+ Keywords:
24
+ - college statistics
25
+ - VGEC overview
26
+ - publications
27
+ - research labs
28
+ - courses
29
+ - placements
DOCUMENTATION_PLAN.md ADDED
@@ -0,0 +1,277 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # VGEC RAG Chatbot — Software Documentation Plan
2
+ > Based on IEEE/Industry Standard | Updated: 2026-03-25
3
+ > Reference: `CODEBASE_DOCUMENTATION.md` covers most of Phase 5 already — reuse it.
4
+
5
+ ---
6
+
7
+ ## DIAGRAMS FIRST — Priority Order
8
+
9
+ > Do all diagrams before writing any prose. Diagrams take the most time and are referenced throughout.
10
+
11
+ | # | Diagram | Phase Used In | Tool | Status |
12
+ |---|---|---|---|---|
13
+ | 1 | High-Level Architecture (Component Diagram) | Phase 5 | Draw.io / Mermaid | [ ] |
14
+ | 2 | Data Flow — Query Path | Phase 5 | Draw.io (DFD Level 2) | [ ] |
15
+ | 3 | Data Flow — Ingestion Path | Phase 5 | Draw.io (DFD Level 2) | [ ] |
16
+ | 4 | Hierarchical Taxonomy Tree (Type→Category→Topic) | Phase 5 | Tree diagram / Mermaid | [ ] |
17
+ | 5 | Filter Decision Flowchart (Strict→Partial→Fallback) | Phase 5 | Flowchart / Draw.io | [ ] |
18
+ | 6 | Hybrid Retrieval Sequence (Vector→BM25→RRF→Boost) | Phase 5 | Sequence diagram / Flow | [ ] |
19
+ | 7 | Use Case Diagram (Student, Faculty, Admin actors) | Phase 4 | Draw.io / PlantUML | [ ] |
20
+ | 8 | System Context Diagram / Level 0 DFD | Phase 2 | Draw.io | [ ] |
21
+ | 9 | Class Diagram (simplified — RAGService + helpers) | Phase 6 | Draw.io / UML | [ ] |
22
+ | 10 | Activity Diagram — Chunking Process | Phase 6 | Activity flow / Draw.io | [ ] |
23
+ | 11 | MRR Bar Chart — Your RAG vs Traditional | Phase 7 | matplotlib / Excel | [ ] |
24
+ | 12 | Noise Rate Bar Chart — Comparison | Phase 7 | matplotlib / Excel | [ ] |
25
+ | 13 | Classifier Confusion Matrix (per field) | Phase 7 | Seaborn heatmap | [ ] |
26
+ | 14 | Deployment Diagram (Express → FastAPI → ChromaDB) | Phase 8 | Draw.io | [ ] |
27
+ | 15 | Future Roadmap / Gantt-style Timeline | Phase 9 | Draw.io / simple table | [ ] |
28
+
29
+ ---
30
+
31
+ ## Phase 1 — Front Matter
32
+ **Est. time: 1–2 hrs | No diagrams needed**
33
+
34
+ - [ ] Title Page
35
+ - Project: VGEC RAG Chatbot
36
+ - Subtitle: Retrieval-Augmented Generation System for Academic Queries
37
+ - Name, Roll No., Department, Submission Date
38
+ - Guide name, College name
39
+ - [ ] Abstract (150–200 words)
40
+ - Problem: Students struggle to find accurate VGEC info scattered across the website
41
+ - Solution: RAG-based chatbot with hierarchical classification + hybrid retrieval
42
+ - Key results: MRR, noise reduction *(fill placeholders after deployment)*
43
+ - Tech: FastAPI, ChromaDB, Gemini, Logistic Regression classifier
44
+ - [ ] Table of Contents *(auto-generate at end — structure it now)*
45
+ - [ ] List of Figures *(auto-generate at end)*
46
+ - [ ] List of Abbreviations
47
+ - RAG, BM25, RRF, LLM, MRR, API, VGEC, HOD, etc.
48
+
49
+ ---
50
+
51
+ ## Phase 2 — Introduction
52
+ **Est. time: 2–3 hrs | Diagrams needed: System Context Diagram (Diagram #8)**
53
+
54
+ - [ ] 2.1 Background
55
+ - Current state: Static website, PDFs, manual queries to admin office
56
+ - Pain points: Information scattered, no natural language interface
57
+ - [ ] 2.2 Problem Statement
58
+ - Lack of intelligent query system for institutional data
59
+ - Need for domain-specific (VGEC) accurate retrieval
60
+ - [ ] 2.3 Objectives
61
+ - Build RAG pipeline with MRR > 0.75
62
+ - Implement metadata classification for pre-filtering
63
+ - Provide REST API for frontend integration
64
+ - Deploy with a secure Express gateway
65
+ - [ ] 2.4 Scope
66
+ - **In scope:** Department data (faculty, labs, syllabus, HOD, intake), REST API, classification, evaluation
67
+ - **Out of scope:** Real-time website scraping, admissions processing, multimedia
68
+
69
+ > **Reuse from:** `CODEBASE_DOCUMENTATION.md` Section 1 (Project Overview)
70
+
71
+ ---
72
+
73
+ ## Phase 3 — Literature Review / Related Work
74
+ **Est. time: 2–3 hrs | Diagrams needed: Evolution timeline (simple horizontal flow)**
75
+
76
+ - [ ] 3.1 Traditional Chatbots
77
+ - Rule-based (ALICE, ELIZA) — rigid, no context
78
+ - Keyword matching chatbots — no semantic understanding
79
+ - [ ] 3.2 Modern RAG Systems
80
+ - OpenAI GPT-4 + vector DB (generic, not domain-specific)
81
+ - LlamaIndex / LangChain baseline RAG — no metadata filtering
82
+ - [ ] 3.3 Hybrid Search Systems
83
+ - Elasticsearch (BM25 only), Cohere (vector only)
84
+ - RRF as the standard fusion method (reference paper)
85
+ - [ ] 3.4 Your Differentiation
86
+ - Hierarchical classifier (Type→Category→Topic→Intent) for pre-filtering
87
+ - Hybrid retrieval (BM25 + Vector + RRF) vs pure semantic search
88
+ - Domain-specific ingestion strategy (intent-aware JSON chunking)
89
+
90
+ ---
91
+
92
+ ## Phase 4 — System Analysis & Requirements
93
+ **Est. time: 3–4 hrs | Diagrams needed: Use Case Diagram (#7), Level 1 DFD**
94
+
95
+ - [ ] 4.1 Functional Requirements
96
+ - FR1: Ingest structured JSON and unstructured documents (PDF, MD, TXT)
97
+ - FR2: Classify queries into metadata filters (type, category, topic, intent)
98
+ - FR3: Retrieve relevant chunks with configurable similarity threshold
99
+ - FR4: Generate contextual answers using Gemini or local LLM
100
+ - FR5: Provide CRUD operations on vector store via REST API
101
+ - FR6: Rate-limit and authenticate requests via Express gateway
102
+ - [ ] 4.2 Non-Functional Requirements
103
+ - Performance: <5s response (cloud), <30s (local LLM)
104
+ - Accuracy: MRR >0.75
105
+ - Security: Admin routes protected by JWT, Python API never publicly exposed
106
+ - Scalability: Support 10,000+ chunks in ChromaDB
107
+ - [ ] 4.3 Use Case Diagram *(Diagram #7)*
108
+ - Actors: Student, Faculty, Admin
109
+ - Student use cases: Submit query, View answer, View references
110
+ - Admin use cases: Ingest document, Delete document, Run evaluation, Change settings
111
+ - [ ] 4.4 Level 1 DFD
112
+ - Major processes: Ingest, Classify, Retrieve, Generate, Evaluate
113
+
114
+ ---
115
+
116
+ ## Phase 5 — System Design
117
+ **Est. time: 4–6 hrs | MOST MARKS, MOST DIAGRAMS**
118
+ **Diagrams needed: #1, #2, #3, #4, #5, #6**
119
+
120
+ > **Reuse heavily from:** `CODEBASE_DOCUMENTATION.md` Sections 2, 3, 4
121
+
122
+ - [ ] 5.1 Architecture Design
123
+ - [ ] High-Level Component Diagram *(Diagram #1)*
124
+ - [ ] Data Flow — Ingestion Path *(Diagram #3)*
125
+ - [ ] Data Flow — Query Path *(Diagram #2)*
126
+ - [ ] Technology Stack Table (already in CODEBASE_DOCUMENTATION.md Section 1)
127
+ - [ ] 5.2 Database Design
128
+ - [ ] Vector DB Metadata Schema (field table — already in CODEBASE_DOCUMENTATION.md Section 3)
129
+ - [ ] Source JSON Schema (already documented)
130
+ - [ ] File Tracking Registry Schema (FileService JSON records)
131
+ - [ ] 5.3 Algorithm Design
132
+ - [ ] Hierarchical Taxonomy Tree *(Diagram #4)* (Type → Category → Topic → Intent)
133
+ - [ ] Filter Decision Flowchart *(Diagram #5)* (confidence thresholds → Strict/Partial/Fallback)
134
+ - [ ] Hybrid Retrieval Sequence *(Diagram #6)* (Vector → BM25 → RRF formula → Boost → Threshold)
135
+ - [ ] Chunking Strategy (JSON intent-aware vs RecursiveCharacterTextSplitter)
136
+ - [ ] RRF Formula — document with the actual equation:
137
+ ```
138
+ score(d) = bm25_weight * 1/(rrf_k + rank_bm25)
139
+ + vector_weight * 1/(rrf_k + rank_vec)
140
+ ```
141
+ - [ ] 5.4 Interface Design
142
+ - [ ] API Endpoint Table — /rag and /vector routes (already in CODEBASE_DOCUMENTATION.md Section 5)
143
+ - [ ] Request/Response JSON examples (sample curl or Postman output)
144
+ - [ ] Express Gateway design (rate limit + auth + concurrency queue)
145
+
146
+ ---
147
+
148
+ ## Phase 6 — Implementation
149
+ **Est. time: 2–3 hrs | Diagrams needed: Directory tree, Class Diagram (#9), Activity Diagram (#10)**
150
+
151
+ > **Reuse heavily from:** `CODEBASE_DOCUMENTATION.md` Section 5 and Section 8
152
+
153
+ - [ ] 6.1 Directory Structure (already in CODEBASE_DOCUMENTATION.md Section 8)
154
+ - [ ] 6.2 Module Descriptions (already in CODEBASE_DOCUMENTATION.md Section 5)
155
+ - [ ] 6.3 Key Code Snippets *(do NOT paste full files — only algorithm excerpts)*
156
+ - [ ] Filter construction logic (`_build_filter` method)
157
+ - [ ] RRF scoring loop
158
+ - [ ] Intent-aware JSON chunking (`handle_json_docs`)
159
+ - [ ] Classifier prediction + threshold gating
160
+ - [ ] 6.4 Configuration
161
+ - [ ] `.env` variables table (already in CODEBASE_DOCUMENTATION.md Section 5)
162
+ - [ ] Hyperparameter table (BM25 weights, thresholds, chunk size)
163
+ - [ ] 6.5 Express Gateway Implementation
164
+ - [ ] Rate limiting configuration
165
+ - [ ] JWT auth middleware snippet
166
+ - [ ] Concurrency queue (`p-limit`) snippet
167
+
168
+ ---
169
+
170
+ ## Phase 7 — Testing & Evaluation
171
+ **Est. time: 3–4 hrs | Diagrams needed: #11 (MRR bar chart), #12 (noise chart), #13 (confusion matrix)**
172
+ > ⚠️ PLACEHOLDER — fill real numbers and screenshots AFTER deployment
173
+
174
+ - [ ] 7.1 Test Plan
175
+ - [ ] Unit tests: Classifier accuracy per field (run `/test_classifier_dataset`)
176
+ - [ ] Integration tests: End-to-end hybrid query
177
+ - [ ] Performance: Measure average latency (cloud vs local)
178
+ - [ ] 7.2 Results
179
+ - [ ] Comparison Table: Traditional pure-vector RAG vs Your Hybrid RAG
180
+ - Metrics: MRR, Hit Rate, Top-1 Hit Rate, Noise Rate, Latency
181
+ - [ ] MRR Bar Chart by query intent type *(Diagram #11)*
182
+ - [ ] Noise Rate comparison *(Diagram #12)*
183
+ - [ ] Classifier Confusion Matrix per field *(Diagram #13)*
184
+ - [ ] 7.3 Sample Query Demonstrations
185
+ - Choose 3–5 representative queries, show:
186
+ - Input question
187
+ - Classifier output (type, category, topic, intent + confidences)
188
+ - Retrieved chunks with scores
189
+ - Final LLM answer
190
+
191
+ ---
192
+
193
+ ## Phase 8 — Deployment
194
+ **Est. time: 1–2 hrs | Diagrams needed: Deployment diagram (#14)**
195
+ > ⚠️ PLACEHOLDER — fill AFTER actual deployment
196
+
197
+ - [ ] 8.1 System Requirements
198
+ - Hardware: 8GB RAM, 4-core CPU (local LLM) OR Google API key (Gemini)
199
+ - Software: Python 3.9+, Node.js 18+, ChromaDB
200
+ - [ ] 8.2 Deployment Architecture *(Diagram #14)*
201
+ - Frontend → Express Gateway → FastAPI → ChromaDB
202
+ - [ ] 8.3 Installation Steps
203
+ - Clone → `pip install -r requirements.txt` → Set `.env` → Run ingestion → Start API
204
+ - Express: `npm install` → Set `.env` → `node server.js`
205
+ - [ ] 8.4 Screenshots *(fill after deployment)*
206
+ - [ ] Swagger UI (`/docs`)
207
+ - [ ] Sample chatbot interaction
208
+ - [ ] Admin panel
209
+ - [ ] Classification test panel
210
+
211
+ ---
212
+
213
+ ## Phase 9 — Future Scope & Conclusion
214
+ **Est. time: 1–2 hrs | Diagrams needed: Roadmap (#15)**
215
+
216
+ - [ ] 9.1 Future Enhancements
217
+ - Dynamic LLM switching via admin UI (ModelManager architecture)
218
+ - Cross-encoder re-ranking step (after resource becomes available)
219
+ - Query result caching layer
220
+ - Automated metadata prediction during ingestion (classifier-assisted)
221
+ - Website scraping for real-time data updates
222
+ - [ ] 9.2 Known Limitations (already in CODEBASE_DOCUMENTATION.md Section 7)
223
+ - Local LLM latency (CPU-bound, no GPU)
224
+ - BM25 corpus rebuilt per request
225
+ - No real-time data — static knowledge base
226
+ - [ ] 9.3 Conclusion
227
+ - Successfully built domain-specific RAG with hybrid retrieval
228
+ - Hierarchical classification reduces noise and improves precision
229
+ - Secure deployment with Express gateway protects the inference server
230
+
231
+ ---
232
+
233
+ ## Phase 10 — References & Appendices
234
+ **Est. time: 1–2 hrs | No diagrams needed**
235
+
236
+ - [ ] 10.1 References
237
+ - LangChain documentation
238
+ - ChromaDB documentation
239
+ - Original RRF paper (Cormack et al., 2009)
240
+ - Gemini API documentation
241
+ - VGEC official website (data source)
242
+ - BM25 (Robertson & Zaragoza, 2009)
243
+ - Sentence Transformers (Reimers & Gurevych, 2019)
244
+ - [ ] 10.2 Appendix A — MASTER_INDEX full taxonomy
245
+ - [ ] 10.3 Appendix B — Full API documentation (export from Swagger `/docs`)
246
+ - [ ] 10.4 Appendix C — Sample classifier training data
247
+ - [ ] 10.5 Appendix D — Sample department JSON format
248
+
249
+ ---
250
+
251
+ ## Execution Timeline
252
+
253
+ | Phase | When | Priority |
254
+ |---|---|---|
255
+ | **All Diagrams** | Start NOW (before writing prose) | 🔴 Critical |
256
+ | Phase 1–3 (Intro, Lit Review) | Day 1 | Must have |
257
+ | Phase 4–5 (Design) | Day 2–3 | 🔴 Critical — most marks |
258
+ | Phase 6 (Implementation) | Day 4 | Must have |
259
+ | Phase 7 (Testing) | After deployment — Day 5 | 🔴 Critical — proof |
260
+ | Phase 8 (Deployment) | After deployment | Must have |
261
+ | Phase 9–10 (Future, Refs) | Day 6 | Finish strong |
262
+ | Final PDF export + proofread | Last | Required |
263
+
264
+ ---
265
+
266
+ ## Reuse Map — What's Already Written
267
+
268
+ | Documentation Section | Already in |
269
+ |---|---|
270
+ | System Architecture (components, data flow) | `CODEBASE_DOCUMENTATION.md` Section 2 |
271
+ | Tech Stack Table | `CODEBASE_DOCUMENTATION.md` Section 1 |
272
+ | Metadata Schema / Taxonomy | `CODEBASE_DOCUMENTATION.md` Section 3 |
273
+ | Retrieval Pipeline steps | `CODEBASE_DOCUMENTATION.md` Section 4 |
274
+ | All class/method descriptions | `CODEBASE_DOCUMENTATION.md` Section 5 |
275
+ | Metrics definitions | `CODEBASE_DOCUMENTATION.md` Section 6 |
276
+ | Known Limitations | `CODEBASE_DOCUMENTATION.md` Section 7 |
277
+ | File Structure Tree | `CODEBASE_DOCUMENTATION.md` Section 8 |
Dockerfile ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ─────────────────────────────────────────────────────────────────────────────
2
+ # VGEC RAG Chatbot — Dockerfile for Hugging Face Spaces
3
+ # ─────────────────────────────────────────────────────────────────────────────
4
+ # HF Spaces requirements:
5
+ # • Port MUST be 7860
6
+ # • GOOGLE_API_KEY must be set as a Space Secret in HF UI
7
+ # ─────────────────────────────────────────────────────────────────────────────
8
+
9
+ FROM python:3.11-slim
10
+
11
+ # ── System dependencies ───────────────────────────────────────────────────────
12
+ # build-essential → needed by chromadb (hnswlib C extension)
13
+ # libgomp1 → needed by sentence-transformers / scikit-learn OpenMP
14
+ # git → needed by some pip packages that install from git
15
+ RUN apt-get update && apt-get install -y --no-install-recommends \
16
+ build-essential \
17
+ libgomp1 \
18
+ git \
19
+ && rm -rf /var/lib/apt/lists/*
20
+
21
+ # ── Working directory ─────────────────────────────────────────────────────────
22
+ WORKDIR /app
23
+
24
+ # ── Python dependencies ───────────────────────────────────────────────────────
25
+ # Copy requirements first so Docker caches this layer separately from source code.
26
+ # Any requirements change rebuilds from here; source code changes don't.
27
+ COPY requirements.txt .
28
+
29
+ # Install CPU-only PyTorch FIRST (prevents pip from pulling 2+ GB GPU wheels
30
+ # when sentence-transformers later requests torch as a dependency).
31
+ RUN pip install --no-cache-dir \
32
+ torch==2.2.2 \
33
+ --index-url https://download.pytorch.org/whl/cpu
34
+
35
+ # Install the rest of the requirements.
36
+ # llama-cpp-python is intentionally excluded — Gemini-only deployment.
37
+ RUN pip install --no-cache-dir -r requirements.txt
38
+
39
+ # Download the spaCy English model at build time so it's baked into the image.
40
+ RUN python -m spacy download en_core_web_sm
41
+
42
+ # ── Application source ────────────────────────────────────────────────────────
43
+ COPY . .
44
+
45
+ # ── Environment variables ─────────────────────────────────────────────────────
46
+ # Tell Python not to buffer stdout/stderr (so logs appear in real time on HF).
47
+ ENV PYTHONUNBUFFERED=1
48
+ ENV PYTHONDONTWRITEBYTECODE=1
49
+
50
+ # LLM mode — overrides the config.py default; HF Spaces will use Gemini API.
51
+ # GOOGLE_API_KEY is NOT set here — it must be added as a HF Space Secret.
52
+ ENV LLM_PROVIDER=gemini
53
+ ENV ENABLE_FALLBACK=false
54
+
55
+ # Point sentence-transformers cache inside /app so it's predictable.
56
+ ENV SENTENCE_TRANSFORMERS_HOME=/app/ml_models/embeddings
57
+ ENV HF_HOME=/app/.cache/huggingface
58
+
59
+ # ── Port ──────────────────────────────────────────────────────────────────────
60
+ # HF Spaces requires exactly port 7860.
61
+ EXPOSE 7860
62
+
63
+ # ── Startup ───────────────────────────────────────────────────────────────────
64
+ # No --reload (dev-only flag).
65
+ # --workers 1 keeps RAM usage predictable on the free tier (2 vCPU, 16 GB RAM).
66
+ CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1"]
LOCAL_MODEL_TRUNCATION_FIX.md ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Local Model Truncation Fix
2
+
3
+ ## 🐛 Problem
4
+
5
+ The local model was cutting off responses mid-sentence, like:
6
+
7
+ ```
8
+ "...applications for various purposes such as"
9
+ ```
10
+
11
+ ## 🔍 Root Cause
12
+
13
+ The `ChatLlamaCpp` model configuration was **missing the `max_tokens` parameter**.
14
+
15
+ Without this parameter:
16
+
17
+ - The model defaults to a very low token generation limit
18
+ - Responses get truncated mid-sentence
19
+ - No warning or error is shown
20
+
21
+ ## ✅ Solution Applied
22
+
23
+ ### 1. Added `max_tokens` to Local Model Configuration
24
+
25
+ **File:** `app/utils/model_factory.py`
26
+
27
+ **Before:**
28
+
29
+ ```python
30
+ model = ChatLlamaCpp(
31
+ model_path=str(model_file),
32
+ n_ctx=4096,
33
+ n_batch=512,
34
+ n_threads=4,
35
+ temperature=0.05,
36
+ # ❌ Missing max_tokens!
37
+ )
38
+ ```
39
+
40
+ **After:**
41
+
42
+ ```python
43
+ model = ChatLlamaCpp(
44
+ model_path=str(model_file),
45
+ n_ctx=4096,
46
+ n_batch=512,
47
+ n_threads=4,
48
+ max_tokens=settings.local_max_tokens, # ✅ FIXED!
49
+ temperature=0.05,
50
+ )
51
+ ```
52
+
53
+ ### 2. Increased Gemini Token Limit
54
+
55
+ **Before:** `max_output_tokens=512` (too low)
56
+ **After:** `max_output_tokens=settings.max_output_tokens` (2048)
57
+
58
+ ### 3. Made Settings Configurable
59
+
60
+ **File:** `app/core/config.py`
61
+
62
+ Added:
63
+
64
+ ```python
65
+ # Generation Settings
66
+ max_output_tokens: int = 2048 # Max tokens for Gemini responses
67
+ local_max_tokens: int = 2048 # Max tokens for local model responses
68
+ ```
69
+
70
+ ## 📊 Impact
71
+
72
+ ### Before:
73
+
74
+ - **Gemini**: 512 max tokens (~350-400 words)
75
+ - **Local**: Unknown (probably ~100-200 tokens)
76
+ - **Result**: Truncated responses
77
+
78
+ ### After:
79
+
80
+ - **Gemini**: 2048 max tokens (~1400-1500 words)
81
+ - **Local**: 2048 max tokens (~1400-1500 words)
82
+ - **Result**: Complete, full responses ✅
83
+
84
+ ## 🎯 Expected Behavior Now
85
+
86
+ 1. **Local model should complete sentences** instead of cutting off
87
+ 2. **Responses can be up to ~1500 words** before hitting the limit
88
+ 3. **Both models have equal response length capacity**
89
+
90
+ ## ⚙️ How to Adjust
91
+
92
+ If you want even longer responses, edit `app/core/config.py`:
93
+
94
+ ```python
95
+ # For longer responses (up to ~3000 words)
96
+ max_output_tokens: int = 4096
97
+ local_max_tokens: int = 4096
98
+
99
+ # For shorter responses (to save processing time)
100
+ max_output_tokens: int = 1024
101
+ local_max_tokens: int = 1024
102
+ ```
103
+
104
+ ## 🧪 Test It
105
+
106
+ Try asking the same question again. The local model should now:
107
+
108
+ 1. ✅ Complete full sentences
109
+ 2. ✅ Provide detailed answers
110
+ 3. ✅ Not cut off mid-word
111
+
112
+ ## 📝 Additional Notes
113
+
114
+ ### Why 2048 tokens?
115
+
116
+ - Good balance between completeness and speed
117
+ - Covers most Q&A scenarios
118
+ - Prevents overly long responses
119
+
120
+ ### What is a "token"?
121
+
122
+ - A token ≈ 0.75 words on average
123
+ - 2048 tokens ≈ 1500 words
124
+ - 4096 tokens ≈ 3000 words
125
+
126
+ ### Parameters Explained:
127
+
128
+ - `n_ctx=4096`: Total context window (input + output)
129
+ - `max_tokens=2048`: Maximum output only
130
+ - This means: max ~2048 input + 2048 output = 4096 total
131
+
132
+ ### Other Fixes Applied:
133
+
134
+ - Added comments to all parameters for clarity
135
+ - Made token limits configurable via settings
136
+ - Ensured both models have consistent behavior
MARKDOWN_FIX_SUMMARY.md ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Markdown Splitting Fix - Summary
2
+
3
+ ## Problem
4
+
5
+ The markdown files with `---` section delimiters were being split at every `#` header, creating many small chunks with insufficient context.
6
+
7
+ ### Example Issue:
8
+
9
+ ```
10
+ # Faculty of the Information & Communication Technology Department
11
+ ```
12
+
13
+ This header alone was becoming a separate chunk because the default markdown splitter splits on headers.
14
+
15
+ ## Solution Implemented
16
+
17
+ ### 1. Created New Splitter Method: `for_markdown_with_sections()`
18
+
19
+ **Location:** `app/services/text_splitter.py`
20
+
21
+ **Custom Separators Priority:**
22
+
23
+ 1. `\n---\n` - Section delimiters (HIGHEST PRIORITY)
24
+ 2. `\n\n\n` - Triple newlines
25
+ 3. `\n\n` - Paragraphs
26
+ 4. `\n` - Single newlines
27
+ 5. `. ` - Sentences
28
+ 6. ` ` - Words
29
+ 7. `""` - Empty string, i.e. individual characters (last resort)
30
+
31
+ This ensures sections stay together and headers aren't split separately.
32
+
33
+ ### 2. Updated RAG Service
34
+
35
+ **Location:** `app/services/rag_service.py` (lines 77–82)
36
+
37
+ **Changed from:**
38
+
39
+ ```python
40
+ markdown_splitter = self.text_splitter.for_markdown(
41
+ chunk_size=chunk_size,
42
+ chunk_overlap=chunk_overlap
43
+ )
44
+ ```
45
+
46
+ **Changed to:**
47
+
48
+ ```python
49
+ markdown_splitter = TextSplitter.for_markdown_with_sections(
50
+ chunk_size=chunk_size,
51
+ chunk_overlap=chunk_overlap
52
+ )
53
+ ```
54
+
55
+ ### 3. Updated Document Helpers
56
+
57
+ **Location:** `app/utils/document_helpers.py` (lines 161–167)
58
+
59
+ Added auto-detection for markdown with sections:
60
+
61
+ ```python
62
+ # Use section-aware splitter if text contains markdown section delimiters
63
+ if "\n---\n" in text or text.startswith("---\n"):
64
+ splitter = TextSplitter.for_markdown_with_sections()
65
+ else:
66
+ splitter = TextSplitter()
67
+ ```
68
+
69
+ ## Expected Results
70
+
71
+ ### Before (with `for_markdown()`):
72
+
73
+ - **Many small chunks** - Headers split separately
74
+ - Example: "# Faculty..." becomes its own 50-character chunk
75
+ - Poor context for RAG retrieval
76
+
77
+ ### After (with `for_markdown_with_sections()`):
78
+
79
+ - **Fewer, more meaningful chunks** - Sections kept together
80
+ - Headers stay with their content
81
+ - Better context for RAG retrieval
82
+ - Reduced number of chunks overall
83
+
84
+ ## How to Use
85
+
86
+ ### For File Upload (Already Applied):
87
+
88
+ When you upload a `.md` file via the POST endpoint, it will automatically:
89
+
90
+ 1. Detect it's a markdown file
91
+ 2. Use `for_markdown_with_sections()` splitter
92
+ 3. Keep sections together
93
+
94
+ ### For Raw Text Upload:
95
+
96
+ When posting raw text with `---` delimiters:
97
+
98
+ 1. The system auto-detects section delimiters
99
+ 2. Applies the section-aware splitter
100
+ 3. Preserves semantic structure
101
+
102
+ ## Configuration
103
+
104
+ You can still adjust chunk size in `app/core/config.py`:
105
+
106
+ ```python
107
+ chunk_size: int = 768 # Adjust as needed
108
+ chunk_overlap: int = 200 # Adjust overlap
109
+ ```
110
+
111
+ ## Next Steps
112
+
113
+ Try uploading your markdown file again. You should see:
114
+
115
+ - ✅ Fewer total chunks
116
+ - ✅ Each chunk contains header + related content
117
+ - ✅ Better semantic coherence
118
+ - ✅ Improved RAG retrieval quality
README.md CHANGED
@@ -1,10 +1,65 @@
1
- ---
2
- title: Vgecbot
3
- emoji: 🦀
4
- colorFrom: pink
5
- colorTo: red
6
- sdk: docker
7
- pinned: false
8
- ---
9
-
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # RAG (Retrieval-Augmented Generation) Project
2
+
3
+ ## Services
4
+
5
+ ### Available Services
6
+
7
+ 1. **Document Loader** (`services.document_loader`)
8
+ - Load PDF documents
9
+ - Support for single and multiple file loading
10
+ - Lazy loading support
11
+
12
+ 2. **Vector Store** (`services.VectorStore`)
13
+ - Similarity search
14
+ - Document management (add, update, delete)
15
+ - Metadata filtering
16
+
17
+ 3. **Text Splitter** (`services.TextSplitter`) ✅
18
+ - Recursive character text splitting
19
+ - Language-specific splitting (20+ languages)
20
+ - See [docs/TEXT_SPLITTER.md](docs/TEXT_SPLITTER.md) for full documentation
21
+
22
+ 4. **RAG Service** (`services.RAGService`) ✅ **NEW**
23
+ - Integrates Document Loader, Text Splitter, Vector Store
24
+ - Powered by **Google Gemini** LLM
25
+ - Creates a complete RAG pipeline with retrieval & generation
26
+
27
+ ## Quick Start
28
+
29
+ ```python
30
+ from services import document_loader, TextSplitter, VectorStore
31
+ from libs import ROOT_PATH
32
+
33
+ # Load documents
34
+ pdf_path = ROOT_PATH / "document.pdf"
35
+ doc_obj = document_loader(filepath=pdf_path)
36
+ documents = doc_obj.load()
37
+
38
+ # Split into chunks
39
+ splitter = TextSplitter(chunk_size=1000, chunk_overlap=200)
40
+ chunks = splitter.split_documents(documents)
41
+
42
+ # Add to vector store
43
+ # vector_store.add_documents(chunks)
44
+ ```
45
+
46
+ ## Examples
47
+
48
+ Run the TextSplitter examples:
49
+
50
+ ```bash
51
+ python examples_text_splitter.py
52
+ ```
53
+
54
+ ## Tasks
55
+
56
+ - [x] Document Loader
57
+ - [ ] Multiple PDF loader
58
+ - [ ] if txt then txt loader
59
+ - [ ] preprocessing
60
+ - [ ] stop_words removal
61
+ - [ ] punctuations
62
+ - [ ] lowercasing
63
+ - [ ] lemmatization
64
+ - [x] Recursive TextSplitter ✅
65
+ - [ ] Assign Them Metadata properly!
WHY_LOCAL_NOT_WORKING.md ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Why Local Model Isn't Working - Diagnosis
2
+
3
+ ## 🐛 Problems Found:
4
+
5
+ ### 1. **LRU Cache Keeps Old Model** (PRIMARY ISSUE)
6
+
7
+ **File:** `app/api/dependencies.py` (line 13)
8
+
9
+ ```python
10
+ @lru_cache() # ❌ This caches the RAG service FOREVER
11
+ def get_rag_service() -> RAGService:
12
+ llm_model = get_llm_model() # Model initialized ONCE
13
+ ...
14
+ ```
15
+
16
+ **Impact:**
17
+
18
+ - Model is loaded when server FIRST starts
19
+ - Even if you change config, the OLD model stays in memory
20
+ - `@lru_cache()` never clears until server is fully restarted
21
+ - Auto-reload doesn't clear the cache!
22
+
23
+ ### 2. **Missing Fallback Trigger**
24
+
25
+ **File:** `app/utils/model_factory.py`
26
+
27
+ The fallback logic EXISTS but it's not being triggered because:
28
+
29
+ - The Gemini model initialization happens at startup (cached)
30
+ - The error happens during model.invoke() (at query time)
31
+ - But fallback only works during get_llm_model() (at init time)
32
+
33
+ ### 3. **Missing max_output_tokens** (FIXED)
34
+
35
+ You deleted it from config.py which caused AttributeError.
36
+ ✅ I restored it.
37
+
38
+ ## ✅ Solutions:
39
+
40
+ ### **Quick Fix: Full Server Restart**
41
+
42
+ Stop the server completely (Ctrl+C) and start it again:
43
+
44
+ ```bash
45
+ # Kill the server
46
+ Ctrl + C
47
+
48
+ # Restart
49
+ uvicorn main:app --reload
50
+ ```
51
+
52
+ This will clear the LRU cache and load the local model.
53
+
54
+ ### **Permanent Fix: Remove or Fix LRU Cache**
55
+
56
+ You have 2 options:
57
+
58
+ #### Option A: Remove LRU Cache (Simplest)
59
+
60
+ Models will be reinitialized on each request (slightly slower but settings-aware):
61
+
62
+ ```python
63
+ # Remove @lru_cache()
64
+ def get_rag_service() -> RAGService:
65
+ logger.info("Initializing RAG service...")
66
+ llm_model = get_llm_model()
67
+ ...
68
+ ```
69
+
70
+ #### Option B: Make Cache Settings-Aware
71
+
72
+ Cache based on current settings:
73
+
74
+ ```python
75
+ def get_rag_service_key():
76
+ return (settings.llm_provider, settings.gemini_model_name, settings.local_model_name)
77
+
78
+ @lru_cache(maxsize=2)
79
+ def _cached_llm_model(provider, gemini_name, local_name):
80
+ return get_llm_model(provider)
81
+
82
+ def get_rag_service() -> RAGService:
83
+ key = get_rag_service_key()
84
+ llm_model = _cached_llm_model(*key)
85
+ ...
86
+ ```
87
+
88
+ ### **Better Fix: Dynamic Model Loading**
89
+
90
+ Make the RAG service check settings on each request and switch models if needed.
91
+
92
+ ## 📋 Action Items:
93
+
94
+ 1. ✅ **Fixed:** Restored `max_output_tokens` in config.py
95
+ 2. ⚠️ **TODO:** Full server restart (Ctrl+C then restart)
96
+ 3. ⚠️ **TODO:** Test with local model
97
+ 4. ⚠️ **TODO:** Consider removing `@lru_cache()` from dependencies.py
98
+
99
+ ## What's Happening Now:
100
+
101
+ Right now, your server has:
102
+
103
+ - ✅ config.py says `llm_provider = "local"`
104
+ - ✅ max_output_tokens restored
105
+ - ❌ BUT old Gemini model still in memory (cached)
106
+ - ❌ Fallback can't help because model is already loaded
107
+
108
+ **The cached Gemini model is still being used for all requests!**
109
+
110
+ ## 🎯 Next Step:
111
+
112
+ **RESTART THE SERVER** (full stop + start, not just reload)
app/__init__.py ADDED
File without changes
app/api/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ # We removed the top-level imports of routes here to prevent circular dependencies.
2
+ # This file now only provides the base structure if needed.
3
+
4
+ # If you want to use the api_router elsewhere, import it and register routes
5
+ # in the file where you initialize the FastAPI app (main.py).
app/api/dependencies.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ """
3
+ FastAPI dependencies for dependency injection.
4
+ """
5
+ from functools import lru_cache
6
+ # from app.services.rag_service import RAGService # MOVED INSIDE FUNCTION TO PREVENT CIRCULAR IMPORT
7
+ from app.utils.model_factory import get_llm_model, get_embedding_model, get_local_model
8
+ from app.core.config import settings
9
+ import logging
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+ @lru_cache()
14
+ def get_rag_service():
15
+ from app.services.rag_service import RAGService
16
+ """
17
+ Get RAG service instance (singleton).
18
+
19
+ This is cached so the same instance is reused across requests.
20
+ Models are initialized once and reused.
21
+
22
+ Returns:
23
+ RAGService: Configured RAG service
24
+ """
25
+ logger.info("Initializing RAG service...")
26
+
27
+ # Initialize models
28
+ llm_model = get_llm_model()
29
+ embedding_model = get_embedding_model()
30
+
31
+ # Create RAG service
32
+ rag_service = RAGService(
33
+ model=llm_model,
34
+ collection_name=settings.collection_name,
35
+ persist_directory=settings.persist_directory,
36
+ embedding_model=embedding_model,
37
+ k=settings.similarity_top_k
38
+ )
39
+
40
+ logger.info("RAG service initialized successfully")
41
+ return rag_service
app/api/routes/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ from .rag import router as rag_router
2
+ from .vector_store import router as vector_router
3
+ from .settings import router as settings_router
4
+
5
+ __all__ = ["rag_router", "vector_router", "settings_router"]
app/api/routes/rag.py ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import APIRouter, Depends, HTTPException
2
+ from app.api.schemas.requests import RAGRequest
3
+ from app.api.dependencies import get_rag_service
4
+ from app.api.schemas.tests import TestRequestSchema, TestClassifierReqSchema
5
+ from app.services.rag_service import RAGService
6
+ from app.utils.tests import classifier_test_dataset
7
+ import logging
8
+
9
+ logger = logging.getLogger(__name__)
10
+ router = APIRouter()
11
+
12
+ @router.get("/")
13
+ def health_check():
14
+ """Health check endpoint."""
15
+ return {"status": "healthy", "service": "RAG"}
16
+
17
+ @router.post("/")
18
+ def query_rag(
19
+ request: RAGRequest,
20
+ rag_service: RAGService = Depends(get_rag_service) # ✅ Dependency injection!
21
+ ):
22
+ """
23
+ Query the RAG system with a question.
24
+
25
+ Args:
26
+ request: RAG request with question and history
27
+ rag_service: Injected RAG service instance
28
+
29
+ Returns:
30
+ Answer with references and metadata
31
+ """
32
+ try:
33
+ response = rag_service.query(
34
+ question=request.question,
35
+ history=request.history or [],
36
+ k=request.k,
37
+ threshold=request.threshold,
38
+ include_llm_response=request.include_llm_response
39
+ )
40
+ return response
41
+ except Exception as e:
42
+ logger.error(f"RAG query failed: {e}")
43
+ raise HTTPException(status_code=500, detail=str(e))
44
+
45
+ @router.post("/hybrid_query")
46
+ def hybrid_query(
47
+ request: RAGRequest,
48
+ rag_service: RAGService = Depends(get_rag_service) # ✅ Dependency injection!
49
+ ):
50
+ """
51
+ Query the RAG system using hybrid retrieval.
52
+
53
+ Args:
54
+ request: RAG request with question and history
55
+ rag_service: Injected RAG service instance
56
+
57
+ Returns:
58
+ Answer with references and metadata
59
+ """
60
+ try:
61
+ response = rag_service.hybrid_query(
62
+ question=request.question,
63
+ history=request.history or [],
64
+ k=request.k,
65
+ threshold=request.threshold,
66
+ include_llm_response=request.include_llm_response
67
+ )
68
+ return response
69
+ except Exception as e:
70
+ logger.error(f"RAG query failed: {e}")
71
+ raise HTTPException(status_code=500, detail=str(e))
72
+
73
+ @router.post("/test")
74
+ def test_queries(
75
+ request: TestRequestSchema,
76
+ query_delay: float = 1.0, # seconds between queries (Gemini 100 RPM limit)
77
+ rag_service: RAGService = Depends(get_rag_service)
78
+ ):
79
+ """
80
+ Run batch retrieval evaluation.
81
+ - query_delay: sleep between queries to respect Gemini embedding rate limit.
82
+ Free tier = 100 RPM → 1.0s delay safe for up to 150 queries (~2.5 min).
83
+ Set to 0.0 to disable (only if you have a paid API key).
84
+ """
85
+ try:
86
+ response = rag_service.test_queries(
87
+ tests=request,
88
+ query_delay=query_delay
89
+ )
90
+ return response
91
+ except Exception as e:
92
+ logger.error(f"Test Execution failed: {e}")
93
+ raise HTTPException(status_code=500, detail=str(e))
94
+
95
+
96
+ @router.post("/test_classifier")
97
+ def test_classifier(
98
+ request: TestClassifierReqSchema,
99
+ rag_service: RAGService = Depends(get_rag_service)
100
+ ):
101
+ try:
102
+ if(request.tests is None):
103
+ raise HTTPException(status_code=400, detail="No tests provided")
104
+ response = rag_service.test_classifier(
105
+ tests=request
106
+ )
107
+ return response
108
+ except Exception as e:
109
+ logger.error(f"Test classifier Execution failed: {e}")
110
+ raise HTTPException(status_code=500, detail=str(e))
111
+
112
+
113
+ @router.get("/test_classifier_dataset")
114
+ def test_classifier_dataset(
115
+ rag_service: RAGService = Depends(get_rag_service)
116
+ ):
117
+ try:
118
+ if(len(rag_service.evaluation.keys()) > 0):
119
+ return rag_service.evaluation
120
+
121
+ req = TestClassifierReqSchema(tests=classifier_test_dataset)
122
+ response = rag_service.test_classifier(
123
+ tests=req
124
+ )
125
+
126
+ rag_service.evaluation = response["evaluation"]
127
+ return rag_service.evaluation
128
+ except Exception as e:
129
+ logger.error(f"Test classifier Execution failed: {e}")
130
+ raise HTTPException(status_code=500, detail=str(e))
131
+
132
+
133
+ @router.post("/similarity_search")
134
+ def similarity_search(
135
+ request: RAGRequest,
136
+ rag_service: RAGService = Depends(get_rag_service) # ✅ Dependency injection!
137
+ ):
138
+ """
139
+ Run a hybrid retrieval query without generating an LLM answer.
140
+
141
+ Args:
142
+ request: RAG request with question and history
143
+ rag_service: Injected RAG service instance
144
+
145
+ Returns:
146
+ Answer with references and metadata
147
+ """
148
+ try:
149
+
150
+
151
+ response = rag_service.hybrid_query(
152
+ question=request.question,
153
+ history=request.history or [],
154
+ k=request.k,
155
+ threshold=request.threshold,
156
+ include_llm_response=False,
157
+ )
158
+ return response
159
+ except Exception as e:
160
+ logger.error(f"RAG query failed: {e}")
161
+ raise HTTPException(status_code=500, detail=str(e))
162
+
163
+ @router.post("/search")
164
+ def search(
165
+ request: RAGRequest,
166
+ rag_service: RAGService = Depends(get_rag_service) # ✅ Dependency injection!
167
+ ):
168
+ """
169
+ Search the indexed documents for a question.
170
+
171
+ Args:
172
+ request: RAG request; only `question` and `k` are used
173
+ rag_service: Injected RAG service instance
174
+
175
+ Returns:
176
+ Answer with references and metadata
177
+ """
178
+ try:
179
+ response = rag_service.search_docs(
180
+ question=request.question,
181
+ k=request.k
182
+ )
183
+ return response
184
+ except Exception as e:
185
+ logger.error(f"RAG query failed: {e}")
186
+ raise HTTPException(status_code=500, detail=str(e))
app/api/routes/settings.py ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import APIRouter, HTTPException, status
2
+ from app.core.config import settings
3
+ from app.api.schemas.settings import SettingsUpdate, SettingsResponse
4
+ import logging
5
+
6
+ logger = logging.getLogger(__name__)
7
+ router = APIRouter()
8
+
9
+
10
+ @router.get("/", response_model=SettingsResponse)
11
+ def get_settings():
12
+ """
13
+ Get current application settings.
14
+
15
+ Returns all configurable settings including RAG parameters,
16
+ model configuration, and API settings.
17
+ """
18
+ return SettingsResponse(
19
+ # Paths (read-only)
20
+ root_path=str(settings.root_path),
21
+ model_path=str(settings.model_path),
22
+ data_path=str(settings.data_path),
23
+
24
+ # API Settings
25
+ api_title=settings.api_title,
26
+ api_version=settings.api_version,
27
+ cors_origins=settings.cors_origins,
28
+
29
+ # RAG Settings
30
+ chunk_size=settings.chunk_size,
31
+ chunk_overlap=settings.chunk_overlap,
32
+ similarity_top_k=settings.similarity_top_k,
33
+ similarity_threshold=settings.similarity_threshold,
34
+ collection_name=settings.collection_name,
35
+ persist_directory=settings.persist_directory,
36
+
37
+ # Model Settings
38
+ llm_provider=settings.llm_provider,
39
+ enable_fallback=settings.enable_fallback,
40
+ embedding_model_name=settings.embedding_model_name,
41
+ gemini_model_name=settings.gemini_model_name,
42
+ local_model_name=settings.local_model_name,
43
+ )
44
+
45
+
46
+ @router.patch("/", response_model=SettingsResponse)
47
+ def update_settings(updates: SettingsUpdate):
48
+ """
49
+ Update application settings at runtime.
50
+
51
+ Only provided fields will be updated. Omitted fields remain unchanged.
52
+
53
+ **Note:** Changes are runtime-only and will be lost on server restart.
54
+ To persist changes, update the `.env` file.
55
+
56
+ **Warning:** Some changes (like CORS origins) may require server restart
57
+ to take full effect.
58
+ """
59
+ updated_fields = []
60
+
61
+ # Update RAG settings
62
+ if updates.chunk_size is not None:
63
+ settings.chunk_size = updates.chunk_size
64
+ updated_fields.append("chunk_size")
65
+
66
+ if updates.chunk_overlap is not None:
67
+ settings.chunk_overlap = updates.chunk_overlap
68
+ updated_fields.append("chunk_overlap")
69
+
70
+ if updates.similarity_top_k is not None:
71
+ settings.similarity_top_k = updates.similarity_top_k
72
+ updated_fields.append("similarity_top_k")
73
+
74
+ if updates.similarity_threshold is not None:
75
+ settings.similarity_threshold = updates.similarity_threshold
76
+ updated_fields.append("similarity_threshold")
77
+
78
+ # Update Model settings
79
+ if updates.llm_provider is not None:
80
+ settings.llm_provider = updates.llm_provider
81
+ updated_fields.append("llm_provider")
82
+ logger.info(f"LLM provider changed to: {updates.llm_provider}")
83
+
84
+ if updates.enable_fallback is not None:
85
+ settings.enable_fallback = updates.enable_fallback
86
+ updated_fields.append("enable_fallback")
87
+
88
+ if updates.gemini_model_name is not None:
89
+ settings.gemini_model_name = updates.gemini_model_name
90
+ updated_fields.append("gemini_model_name")
91
+
92
+ if updates.local_model_name is not None:
93
+ settings.local_model_name = updates.local_model_name
94
+ updated_fields.append("local_model_name")
95
+
96
+ # Update API settings
97
+ if updates.cors_origins is not None:
98
+ settings.cors_origins = updates.cors_origins
99
+ updated_fields.append("cors_origins")
100
+ logger.warning("CORS origins updated. Server restart may be required for full effect.")
101
+
102
+ logger.info(f"Settings updated: {', '.join(updated_fields)}")
103
+
104
+ # Return updated settings
105
+ return get_settings()
106
+
107
+
108
+ @router.post("/reset")
109
+ def reset_settings():
110
+ """
111
+ Reset all settings to defaults from .env file.
112
+
113
+ This reloads settings from the environment file and discards
114
+ any runtime changes.
115
+
116
+ **Warning:** This will restart the settings object and may cause
117
+ temporary service interruption.
118
+ """
119
+ try:
120
+ # Reload settings from .env
121
+ from app.core.config import Settings
122
+ new_settings = Settings()
123
+
124
+ # Update the global settings object
125
+ for key, value in new_settings.dict().items():
126
+ setattr(settings, key, value)
127
+
128
+ logger.info("Settings reset to defaults from .env")
129
+
130
+ return {
131
+ "message": "Settings reset to defaults",
132
+ "status": "success"
133
+ }
134
+ except Exception as e:
135
+ logger.error(f"Failed to reset settings: {e}")
136
+ raise HTTPException(
137
+ status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
138
+ detail=f"Failed to reset settings: {str(e)}"
139
+ )
140
+
141
+
142
+ @router.get("/rag")
143
+ def get_rag_settings():
144
+ """
145
+ Get only RAG-related settings.
146
+
147
+ Returns chunk sizes, similarity parameters, and vector store configuration.
148
+ """
149
+ return {
150
+ "chunk_size": settings.chunk_size,
151
+ "chunk_overlap": settings.chunk_overlap,
152
+ "similarity_top_k": settings.similarity_top_k,
153
+ "similarity_threshold": settings.similarity_threshold,
154
+ "collection_name": settings.collection_name,
155
+ "persist_directory": settings.persist_directory,
156
+ }
157
+
158
+
159
+ @router.get("/models")
160
+ def get_model_settings():
161
+ """
162
+ Get only model-related settings.
163
+
164
+ Returns LLM provider, model names, and fallback configuration.
165
+ """
166
+ return {
167
+ "llm_provider": settings.llm_provider,
168
+ "enable_fallback": settings.enable_fallback,
169
+ "embedding_model_name": settings.embedding_model_name,
170
+ "gemini_model_name": settings.gemini_model_name,
171
+ "local_model_name": settings.local_model_name,
172
+ }
173
+
174
+
175
+ @router.get("/api")
176
+ def get_api_settings():
177
+ """
178
+ Get only API-related settings.
179
+
180
+ Returns API metadata and CORS configuration.
181
+ """
182
+ return {
183
+ "api_title": settings.api_title,
184
+ "api_version": settings.api_version,
185
+ "cors_origins": settings.cors_origins,
186
+ }
app/api/routes/vector_store.py ADDED
@@ -0,0 +1,311 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import APIRouter, UploadFile, File, HTTPException, Depends
2
+ from langchain_core.documents import Document
3
+ from app.api.dependencies import get_rag_service
4
+ from app.core.config import settings
5
+ from app.api.schemas.requests import (
6
+ deleteDocs,
7
+ DocumentType,
8
+ PaginationParams,
9
+ DocumentFilters,
10
+ DeleteFilters,
11
+ )
12
+ from fastapi.responses import JSONResponse
13
+ from pathlib import Path
14
+ from typing import Optional, List, Dict, Any
15
+ from fastapi import Path as Params
16
+ from app.services.rag_service import RAGService
17
+ from app.services.ingestion_service import ingestion_service
18
+
19
+ import os
20
+ import shutil
21
+ import math
22
+
23
+ # Temp directory for uploaded files before ingestion
24
+ UPLOAD_TEMP_PATH = settings.root_path / "temp"
25
+
26
+ router = APIRouter()
27
+
28
+
29
+ # ---------------------------------------------------------------------------
30
+ # Shared helpers
31
+ # ---------------------------------------------------------------------------
32
+
33
+ def _unpack_chroma_result(result: dict):
34
+ """Unpack a raw ChromaDB result dict into parallel lists."""
35
+ ids = result.get("ids", [])
36
+ docs = result.get("documents", [])
37
+ metas = result.get("metadatas", [])
38
+ return ids, docs, metas
39
+
40
+
41
+ def _apply_doc_filters(
42
+ ids: List[str],
43
+ docs: List[str],
44
+ metas: List[Dict[str, Any]],
45
+ filters: DocumentFilters,
46
+ ) -> List[Dict]:
47
+ """Filter a Chroma result set by DocumentFilters and return shaped dicts."""
48
+ filtered = []
49
+ for i in range(len(ids)):
50
+ doc_text = docs[i]
51
+ meta = metas[i] if metas else {}
52
+
53
+ if filters.filename and meta.get("source_file") != filters.filename:
54
+ continue
55
+ if filters.source and meta.get("source") != filters.source:
56
+ continue
57
+ if filters.contains and filters.contains.lower() not in doc_text.lower():
58
+ continue
59
+
60
+ filtered.append({"id": ids[i], "content": doc_text, "metadata": meta})
61
+ return filtered
62
+
63
+
64
+ def _save_upload(file: UploadFile) -> Path:
65
+ """Save an uploaded file to the temp directory and return its path."""
66
+ UPLOAD_TEMP_PATH.mkdir(parents=True, exist_ok=True)
67
+ file_path = UPLOAD_TEMP_PATH / file.filename
68
+ with open(file_path, "wb") as buffer:
69
+ shutil.copyfileobj(file.file, buffer)
70
+ return file_path
71
+
72
+
73
+ # ---------------------------------------------------------------------------
74
+ # GET /filenames ← must be before GET /{id} to avoid route conflict
75
+ # ---------------------------------------------------------------------------
76
+
77
+ @router.get("/filenames")
78
+ def list_filenames(rag_service: RAGService = Depends(get_rag_service)):
79
+ """Return a list of unique ingested filenames."""
80
+ return rag_service.get_filenames()
81
+
82
+
83
+ # ---------------------------------------------------------------------------
84
+ # GET /
85
+ # ---------------------------------------------------------------------------
86
+
87
+ @router.get("/")
88
+ def list_documents(
89
+ params: PaginationParams = Depends(),
90
+ filters: DocumentFilters = Depends(),
91
+ rag_service: RAGService = Depends(get_rag_service),
92
+ ):
93
+ """
94
+ List all documents with pagination and optional filters.
95
+
96
+ Query Parameters:
97
+ - page: Page number (default: 1)
98
+ - limit: Items per page (default: 10, max: 100)
99
+ - order: Sort order – "asc" or "desc" (default: "desc")
100
+ - filename: Filter by source_file metadata
101
+ - source: Filter by source metadata path
102
+ - contains: Filter by text content (case-insensitive)
103
+ """
104
+ ids, docs, metas = [], [], []
105
+ if filters.contains:
106
+ documents = rag_service.search_docs(
107
+ question=filters.contains,
108
+ k=params.limit,
109
+ filename = filters.filename
110
+ )
111
+ filtered = documents
112
+ print(filtered)
113
+ else:
114
+ ids, docs, metas = _unpack_chroma_result(rag_service.database.get())
115
+ filtered = _apply_doc_filters(ids, docs, metas, filters)
116
+
117
+ # Sort by creation date
118
+ reverse = params.order == "desc"
119
+ filtered.sort(
120
+ key=lambda x: x.get("metadata", {}).get("creationdate", ""),
121
+ reverse=reverse,
122
+ )
123
+ # Paginate
124
+ total_docs = len(filtered)
125
+ total_pages = math.ceil(total_docs / params.limit) if total_docs > 0 else 0
126
+ start = (params.page - 1) * params.limit
127
+ paginated = filtered[start : start + params.limit]
128
+
129
+ return {
130
+ "page": params.page,
131
+ "limit": params.limit,
132
+ "total_docs": total_docs,
133
+ "total_pages": total_pages,
134
+ "order": params.order,
135
+ "data": paginated,
136
+ "status": 200,
137
+ }
138
+
139
+
140
+ # ---------------------------------------------------------------------------
141
+ # GET /{id}
142
+ # ---------------------------------------------------------------------------
143
+
144
+ @router.get("/{id}")
145
+ def get_document(
146
+ id: str = Params(...),
147
+ rag_service: RAGService = Depends(get_rag_service),
148
+ ):
149
+ """Fetch a single document by its ChromaDB ID."""
150
+ if not id:
151
+ raise HTTPException(status_code=400, detail="Document ID is required")
152
+
153
+ result = rag_service.database.get_by_id(ids=[id])
154
+ ids, docs, metas = _unpack_chroma_result(result)
155
+
156
+ data = [
157
+ {"id": ids[i], "document": docs[i], "metadata": metas[i] if metas else {}}
158
+ for i in range(len(ids))
159
+ ]
160
+ return {"data": data, "status": 200}
161
+
162
+
163
+ # ---------------------------------------------------------------------------
164
+ # POST / (file upload + ingestion)
165
+ # ---------------------------------------------------------------------------
166
+
167
+ SUPPORTED_EXTENSIONS = {".md", ".pdf", ".json", ".txt"}
168
+
169
+ @router.post("/")
170
+ def upload_document(
171
+ file: UploadFile = File(...),
172
+ title: Optional[str] = None,
173
+ rag_service: RAGService = Depends(get_rag_service),
174
+ ):
175
+ """
176
+ Upload and ingest a document file into the vector store.
177
+ Supported types: .md, .pdf, .json, .txt
178
+ """
179
+ file_path = _save_upload(file)
180
+ ext = file_path.suffix.lower()
181
+
182
+ if ext not in SUPPORTED_EXTENSIONS:
183
+ file_path.unlink(missing_ok=True)
184
+ raise HTTPException(status_code=400, detail=f"Unsupported file type: {ext}")
185
+
186
+ docs = rag_service.ingest_documents(file_path)
187
+ file_path.unlink(missing_ok=True)
188
+
189
+ if not docs:
190
+ raise HTTPException(status_code=400, detail="No content could be extracted from the file")
191
+
192
+ return JSONResponse({
193
+ "filename": file.filename,
194
+ "message": f"{ext} uploaded and ingested successfully",
195
+ "docs_added": len(docs),
196
+ "status": 200,
197
+ })
198
+
199
+
200
+ # ---------------------------------------------------------------------------
201
+ # PUT /{id}
202
+ # ---------------------------------------------------------------------------
203
+
204
+ @router.put("/{id}")
205
+ def update_document(
206
+ doc: DocumentType,
207
+ id: str = Params(...),
208
+ rag_service: RAGService = Depends(get_rag_service),
209
+ ):
210
+ """Update an existing document's content and metadata by ID."""
211
+ if not id:
212
+ raise HTTPException(status_code=400, detail="Document ID is required")
213
+
214
+ content = doc.document.strip()
215
+ if not content:
216
+ raise HTTPException(status_code=400, detail="Document content cannot be empty")
217
+
218
+ updated_document = Document(
219
+ page_content=content,
220
+ metadata={**doc.metadata, "id": id},
221
+ )
222
+ rag_service.database.update_document(id, updated_document)
223
+
224
+ return {"status": 200, "message": f"{id} updated successfully"}
225
+
226
+
227
+ # ---------------------------------------------------------------------------
228
+ # DELETE /ids (bulk delete by explicit ID list)
229
+ # ---------------------------------------------------------------------------
230
+
231
+ @router.delete("/ids")
232
+ def delete_documents_by_ids(
233
+ body: deleteDocs,
234
+ rag_service: RAGService = Depends(get_rag_service),
235
+ ):
236
+ """Delete multiple documents by providing an explicit list of IDs."""
237
+ result = rag_service.database.delete(body.docs)
238
+ return {"message": "Documents deleted successfully", "deleted": len(body.docs), "result": result}
239
+
240
+
241
+ # ---------------------------------------------------------------------------
242
+ # DELETE /{id} (single delete)
243
+ # ---------------------------------------------------------------------------
244
+
245
+ @router.delete("/{id}")
246
+ def delete_document(
247
+ id: str = Params(...),
248
+ rag_service: RAGService = Depends(get_rag_service),
249
+ ):
250
+ """Delete a single document by its ChromaDB ID."""
251
+ if not id:
252
+ raise HTTPException(status_code=400, detail="Document ID is required")
253
+
254
+ result = rag_service.database.delete([id])
255
+ return {"message": "Document deleted successfully", "deleted": 1, "result": result, "status": 200}
256
+
257
+
258
+ # ---------------------------------------------------------------------------
259
+ # DELETE / (filter-based delete)
260
+ # ---------------------------------------------------------------------------
261
+
262
+ @router.delete("/")
263
+ def delete_documents_by_filter(
264
+ filters: DeleteFilters = Depends(),
265
+ rag_service: RAGService = Depends(get_rag_service),
266
+ ):
267
+ """
268
+ Delete documents matching filter criteria.
269
+
270
+ Query Parameters:
271
+ - filename: Delete documents with this source_file value
272
+ - source: Delete documents with this source path
273
+ - contains: Delete documents whose text contains this string
274
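+ - dry_run: Preview matching docs without deleting them (default: false). NOTE: when `filename` is set, `ingestion_service.delete_record(filename)` is still called on a dry run.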
275
+ """
276
+ ids, docs, metas = _unpack_chroma_result(rag_service.database.get())
277
+
278
+ delete_ids = []
279
+ for i in range(len(ids)):
280
+ doc_text = docs[i]
281
+ meta = metas[i] if metas else {}
282
+
283
+ if filters.source:
284
+ stored_source = str(Path(meta.get("source", "")).resolve())
285
+ input_source = str(Path(filters.source).resolve())
286
+ if stored_source != input_source:
287
+ continue
288
+
289
+ if filters.filename and meta.get("source_file") != filters.filename:
290
+ continue
291
+
292
+ if filters.contains and filters.contains.lower() not in doc_text.lower():
293
+ continue
294
+
295
+ delete_ids.append(ids[i])
296
+
297
+ if filters.filename:
298
+ ingestion_service.delete_record(filters.filename)
299
+
300
+ if not delete_ids:
301
+ return {"message": "No matching documents found", "deleted": 0}
302
+
303
+ if filters.dry_run:
304
+ return {
305
+ "message": "Dry run – no documents deleted",
306
+ "matched_count": len(delete_ids),
307
+ "matched_ids": delete_ids,
308
+ }
309
+
310
+ result = rag_service.database.delete(delete_ids)
311
+ return {"message": "Documents deleted successfully", "deleted": len(delete_ids), "result": result}
app/api/schemas/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from .tests import TestResponseSchema, TestRequestSchema
app/api/schemas/requests.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pydantic import BaseModel, Field
2
+ from typing import List, Optional, Literal, Annotated, Dict, Any
3
+ from app.core.config import settings
4
+
5
+
6
+ # ✅ Query Parameter Schemas - Clean and Reusable!
7
+
8
+ class PaginationParams(BaseModel):
9
+ """Pagination query parameters for list endpoints."""
10
+ page: int = Field(default=1, ge=1, description="Page number (starts at 1)")
11
+ limit: int = Field(default=10, ge=1, le=100, description="Items per page (max 100)")
12
+ order: Literal["asc", "desc"] = Field(default="desc", description="Sort order")
13
+
14
+
15
+ class DocumentFilters(BaseModel):
16
+ """Document filtering query parameters."""
17
+ filename: Optional[str] = Field(default=None, description="Filter by exact filename")
18
+ source: Optional[str] = Field(default=None, description="Filter by source path")
19
+ contains: Optional[str] = Field(default=None, description="Filter by text content (case-insensitive)")
20
+
21
+
22
+ class DeleteFilters(BaseModel):
23
+ """Delete operation filters with dry-run support."""
24
+ filename: Optional[str] = Field(default=None, description="Delete documents with this filename")
25
+ source: Optional[str] = Field(default=None, description="Delete documents from this source")
26
+ contains: Optional[str] = Field(default=None, description="Delete documents containing this text")
27
+ dry_run: bool = Field(default=False, description="Preview deletions without executing")
28
+
29
+
30
+ # Request Body Schemas
31
+
32
+ class RAGRequest(BaseModel):
33
+ """Request schema for RAG query endpoint."""
34
+ question: Annotated[str, Field(min_length=1, description="Question that user wants to ask")]
35
+ history: Annotated[Optional[List[str]], Field(default=[], description="Previously Asked Questions")]
36
+ k: int = Field(default=settings.similarity_top_k, ge=1, le=100, description="Number of similar documents to retrieve")
37
+ threshold: float = Field(default=settings.similarity_threshold, ge=0, le=1, description="Similarity threshold")
38
+ include_llm_response: bool = Field(default=True, description="Whether to generate LLM answer")
39
+
40
+
41
+ class Query(BaseModel):
42
+ """Query result schema."""
43
+ def __init__(self, question: str, answer: str):
44
+ self.question = question
45
+ self.answer = answer
46
+
47
+
48
+ class deleteDocs(BaseModel):
49
+ """Request schema for bulk delete by IDs."""
50
+ docs: Annotated[List[str], Field(min_length=1, description="List of IDs that you want to delete!")]
51
+
52
+
53
+ class DocumentType(BaseModel):
54
+ """Document update schema."""
55
+ id: str
56
+ metadata: Dict[str, Any]
57
+ document: str
58
+
59
+ class SimilaritySearch(BaseModel):
60
+ """Request schema for a similarity search (query, k, threshold, history)."""
61
+ query: str
62
+ k: int = Field(default=settings.similarity_top_k, ge=1, le=100, description="Number of similar documents to retrieve")
63
+ threshold: float = Field(default=settings.similarity_threshold, ge=0, le=1, description="Similarity threshold")
64
+ history: Annotated[Optional[List[str]], Field(default=[], description="Previously Asked Questions")]
65
+
66
+
67
+ class TextIngestRequest(BaseModel):
68
+ """Request schema for raw text ingestion."""
69
+ text: str = Field(..., min_length=1, description="Raw text content to ingest")
70
+ metadata: Dict[str, Any] = Field(default_factory=dict, description="Optional metadata (title, source, etc.)")
71
+ filename: Optional[str] = Field(default=None, description="Virtual filename/source for the document")
app/api/schemas/settings.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pydantic import BaseModel, Field
2
+ from typing import List, Optional, Literal
3
+ from app.core.config import settings
4
+
5
+
6
class SettingsUpdate(BaseModel):
    """Payload for changing the tunable application settings.

    Every field is optional. A field left out of the request takes the value
    that ``settings`` held when this module was imported.
    NOTE(review): if the route applies all fields rather than only the ones
    the client sent, omitted fields would be reset to those import-time
    values — confirm against the handler.
    """

    # --- RAG settings ---
    chunk_size: Optional[int] = Field(
        default=settings.chunk_size,
        ge=100,
        le=5000,
        description="Text chunk size",
    )
    chunk_overlap: Optional[int] = Field(
        default=settings.chunk_overlap,
        ge=0,
        le=1000,
        description="Chunk overlap size",
    )
    similarity_top_k: Optional[int] = Field(
        default=settings.similarity_top_k,
        ge=1,
        le=20,
        description="Number of similar docs to retrieve",
    )
    similarity_threshold: Optional[float] = Field(
        default=settings.similarity_threshold,
        ge=0,
        le=1,
        description="Similarity threshold",
    )

    # --- Model settings ---
    llm_provider: Optional[Literal["gemini", "local"]] = Field(
        default=settings.llm_provider,
        description="LLM provider to use",
    )
    enable_fallback: Optional[bool] = Field(
        default=settings.enable_fallback,
        description="Enable fallback to alternate model",
    )
    gemini_model_name: Optional[str] = Field(
        default=settings.gemini_model_name,
        description="Gemini model name",
    )
    local_model_name: Optional[str] = Field(
        default=settings.local_model_name,
        description="Local model filename",
    )

    # --- API settings ---
    cors_origins: Optional[List[str]] = Field(
        default=settings.cors_origins,
        description="Allowed CORS origins",
    )
23
+
24
+
25
class SettingsResponse(BaseModel):
    """Schema for the settings returned to API clients.

    ``from_attributes`` lets the model be populated from an object's
    attributes as well as from a dict.
    NOTE(review): the path fields are typed ``str``; if this model is built
    from an object whose paths are ``pathlib.Path`` values, convert them to
    ``str`` first — confirm against the route.
    """

    # pydantic v2 configuration; replaces the deprecated inner ``class Config``.
    # ``protected_namespaces=()`` stops pydantic from warning that the
    # ``model_path`` field collides with its reserved ``model_`` prefix.
    model_config = {"from_attributes": True, "protected_namespaces": ()}

    # Paths (read-only)
    root_path: str
    model_path: str
    data_path: str

    # API Settings
    api_title: str
    api_version: str
    cors_origins: List[str]

    # RAG Settings
    chunk_size: int
    chunk_overlap: int
    similarity_top_k: int
    similarity_threshold: float
    collection_name: str
    persist_directory: str

    # Model Settings
    llm_provider: str
    enable_fallback: bool
    embedding_model_name: str
    gemini_model_name: str
    local_model_name: str
app/api/schemas/tests.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pydantic import BaseModel, Field
2
+ from typing import List
3
+
4
class Test(BaseModel):
    """One retrieval test case: a question and the chunk expected to answer it."""

    question: str = Field(min_length=1, max_length=100, description="Question you want to test")
    document: str = Field(min_length=1, description="Document name")
    # ``ge`` is the numeric lower-bound constraint; the previous ``min=0`` is
    # not a Field constraint, so negative indexes were accepted.
    chunk_index: int = Field(default=0, ge=0, description="Chunk index")
8
+
9
class TestRequestSchema(BaseModel):
    """Request body for a retrieval evaluation run over several test cases."""

    tests: List[Test] = Field(min_length=1, description="give tests to evalute")
    # Numeric bounds use ``ge``/``le``; the previous ``min=``/``max=`` keywords
    # are not Field constraints and were never enforced.
    k: int = Field(default=5, ge=0, le=20, description="maximum number of results")
    threshold: float = Field(default=0.4, ge=0.0, le=1.0, description="Threshold for reference")
13
+
14
class TestResponse(BaseModel):
    """Result for one test case: the case itself and a boolean outcome."""

    # Both fields are required.
    tests: Test = Field(...)
    answer: bool = Field(...)
17
+
18
class TestResponseSchema(BaseModel):
    """Response body carrying the results of an evaluation run."""

    # Required and non-empty.
    tests: List[TestResponse] = Field(
        ...,
        min_length=1,
        description="test results",
    )
20
+
21
+
22
class TestClassifier(BaseModel):
    """One classifier test case: a question plus the labels to be predicted.

    All five fields are required, non-empty strings.
    """

    question: str = Field(..., min_length=1, description="Question you want to test")
    type: str = Field(..., min_length=1, description="Type to be predicted")
    category: str = Field(..., min_length=1, description="Category to be predicted")
    topic: str = Field(..., min_length=1, description="Topic to be predicted")
    intent: str = Field(..., min_length=1, description="Intent to be predicted")
28
+
29
class TestClassifierReqSchema(BaseModel):
    """Request body for a classifier evaluation run."""

    # Required and non-empty.
    tests: List[TestClassifier] = Field(
        ...,
        min_length=1,
        description="give tests to evalute",
    )
app/core/__init__.py ADDED
File without changes
app/core/config.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pydantic_settings import BaseSettings
2
+ from pathlib import Path
3
+ from typing import Literal
4
+
5
class Settings(BaseSettings):
    """Application configuration, read from the environment and ``.env``.

    Path defaults are derived from the repository root (two directories above
    this file). Derived defaults such as ``persist_directory`` are computed
    once, when the class body is executed, from the other *defaults* — so
    overriding e.g. ``collection_name`` through the environment does not
    change ``persist_directory`` unless that is overridden as well.
    """

    # pydantic-settings v2 configuration; replaces the deprecated inner
    # ``class Config``. ``protected_namespaces`` is narrowed so the
    # ``model_path`` field does not trigger the ``model_`` namespace warning.
    model_config = {
        "env_file": ".env",
        "env_file_encoding": "utf-8",
        "protected_namespaces": ("settings_",),
    }

    # Paths
    root_path: Path = Path(__file__).resolve().parents[2]
    core_models_path: Path = root_path / "ml_models"
    model_path: Path = core_models_path / "llm"
    embeddings_path: Path = core_models_path / "embeddings"
    data_path: Path = root_path / "data"
    documents_path: Path = data_path / "documents"
    vector_stores_path: Path = data_path / "vector_stores"
    classifier_path: Path = core_models_path / "classifier"

    # API Settings
    api_title: str = "VGEC RAG Chatbot API"
    api_version: str = "1.0.0"
    cors_origins: list[str] = ["*"]

    # RAG Settings
    chunk_size: int = 500
    chunk_overlap: int = 100
    similarity_top_k: int = 8
    similarity_threshold: float = 0.4  # Filter docs by similarity score
    collection_name: str = "classifier_test_1"
    persist_directory: str = str(vector_stores_path / collection_name)

    # Model Selection
    llm_provider: Literal["gemini", "local"] = "gemini"  # Which model to use
    enable_fallback: bool = False  # Fallback to local if Gemini fails

    # Model Settings
    embedding_model_name: str = "models/gemini-embedding-001"
    gemini_model_name: str = "gemini-2.5-flash-lite"
    local_model_name: str = "Qwen2.5-0.5B-Instruct-Q4_K_M.gguf"
    # Other local model files that have been tried:
    # Llama-3.2-3B-Instruct-Q4_K_M.gguf
    # query_model_name: str = "Llama-3.2-1B-Instruct-Q4_K_M.gguf"
    # Qwen2.5-0.5B-Instruct-Q4_K_M.gguf
    # Qwen3-0.6B-Q4_K_M.gguf
    # Vi-Qwen2-1.5B-RAG.Q4_K_M.gguf

    # Generation Settings
    max_output_tokens: int = 2048  # Max tokens for Gemini responses
    local_max_tokens: int = 512  # Max tokens for local model responses

    # Google API key: no default, so it must come from the environment / .env
    google_api_key: str
54
+
55
+ settings = Settings()
app/core/paths.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+
3
# Project root: two directories above this file.
ROOT_PATH = Path(__file__).resolve().parents[2]

# Well-known directories directly under the project root.
MODEL_PATH = ROOT_PATH.joinpath("ml_models")
LIBS_PATH = ROOT_PATH.joinpath("libs")
data_path = ROOT_PATH.joinpath("data")

# Echo the resolved locations at import time.
for _resolved in (ROOT_PATH, MODEL_PATH, LIBS_PATH):
    print(_resolved)
app/main.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, APIRouter
2
+ from fastapi.middleware.cors import CORSMiddleware
3
+ from app.core.config import settings
4
+ from app.api.routes import rag, vector_store, settings as settingsRouter
5
+
6
# Surface the title and version declared in the settings through the
# generated OpenAPI schema and docs instead of FastAPI's defaults.
app = FastAPI(title=settings.api_title, version=settings.api_version)

# Include individual routers
API_PREFIX = "/api/v1"

app.include_router(rag.router, prefix=f"{API_PREFIX}/rag", tags=["RAG"])
app.include_router(vector_store.router, prefix=f"{API_PREFIX}/vector", tags=["Vector Store"])
app.include_router(settingsRouter.router, prefix=f"{API_PREFIX}/settings", tags=["Settings"])

# NOTE(review): the default ``cors_origins`` is ["*"]; combined with
# ``allow_credentials=True`` this lets any origin make credentialed requests.
# Restrict the origin list before exposing the API publicly.
app.add_middleware(
    CORSMiddleware,
    allow_origins=settings.cors_origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
app/models/__init__.py ADDED
File without changes
app/prompts/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from .system_prompts import SYSTEM_PROMPT, QUESTION_WRITER_SYSTEM_PROMPT, wrap_exaone
app/prompts/system_prompts.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # SYSTEM_PROMPT = """
3
+ # You are VGEC-Assistant, a polite and helpful information retrieval chatbot for Vishwakarma Government Engineering College (VGEC).
4
+
5
+ # You MUST answer the user's question using ONLY the information inside the given CONTEXT. The CONTEXT is the only source of truth.
6
+ # you have to help the users and guide them to answer based on the given context, dont guess but provide answer or guide them in any way you fit.
7
+
8
+ # Output Format:
9
+ # - Always respond in plain text as complete sentences.
10
+ # - Do not add extra explanation or new facts.
11
+ # - Keep responses concise and courteous.
12
+ # - Do NOT use outside knowledge.
13
+ # - Do NOT guess.
14
+ # - Always respond in markdown format.
15
+
16
+ # ---
17
+ # HISTORY:
18
+ # {history}
19
+ # ---
20
+ # CONTEXT:
21
+ # {context}
22
+ # ---
23
+ # QUESTION:
24
+ # {question}
25
+ # ---
26
+ # ANSWER:
27
+ # """
28
+
29
+ SYSTEM_PROMPT = """
30
+ You are VGEC-Assistant, a helpful chatbot for Vishwakarma Government Engineering College (VGEC).
31
+ Answer the user's question using ONLY the information in the given CONTEXT.
32
+
33
+ If the answer can be logically inferred from the context, provide the answer clearly.
34
+ If the answer is not present in the context, say: "Sorry, I couldn't find that in the provided information."
35
+
36
+ Guidelines:
37
+ - Keep the response short and clear.
38
+ - Do not repeat the context.
39
+ - Do not guess or make assumptions.
40
+ - Answer in Markdown Format.
41
+
42
+ ---
43
+ HISTORY:
44
+ {history}
45
+ ---
46
+ CONTEXT:
47
+ {context}
48
+ ---
49
+ QUESTION:
50
+ {question}
51
+ ---
52
+ ANSWER:
53
+ """
54
+
55
def wrap_exaone(prompt):
    """Wrap ``prompt`` in a system / user / assistant turn template.

    The prompt is stripped of surrounding whitespace and placed in the user
    turn; the returned text ends with an open assistant turn.
    """
    user_turn = prompt.strip()
    segments = (
        "[|system|]\n",
        'You are a helpful AI assistant. Answer only from the given context. If unsure, say "I don\'t know".\n',
        "[|endofturn|]\n",
        "\n",
        "[|user|]\n",
        f"{user_turn}\n",
        "[|endofturn|]\n",
        "\n",
        "[|assistant|]\n",
    )
    return "".join(segments)
66
+
67
+ QUESTION_WRITER_SYSTEM_PROMPT = """You are a query rewriting assistant for Vishwakarma Government Engineering College (VGEC).
68
+
69
+ STRICT RULES:
70
+ 1. Expand abbreviations using ONLY this mapping:
71
+ - IT = Information Technology Department
72
+ - ICT = Information and Communication Technology Department
73
+ - CE = Computer Engineering Department
74
+ - EC = Electronics and Communication Engineering Department
75
+ - IC = Instrumentation and Control Engineering Department
76
+ - PE = Power Electronics Department
77
+ - ME = Mechanical Engineering Department
78
+ - Civil = Civil Engineering Department
79
+ - CSE = Computer Science & Engineering (Data Science) Department
80
+ - DS = Computer Science & Engineering (Data Science) Department
81
+ - ACPC = Admission Committee for Professional Courses (administrative, NOT a department)
82
+ - STS = Student Section Portal (administrative, NOT a department)
83
+
84
+ 2. CRITICAL: If query has NO department abbreviation, do NOT add any department.
85
+
86
+ 3. Output ONLY the rewritten query. No quotes, no prefixes, no explanations.
87
+
88
+ GOOD EXAMPLES:
89
+ User: "ds fees?"
90
+ Rewritten: What are the fees for the Computer Science & Engineering (Data Science) Department?
91
+
92
+ User: "cse block?"
93
+ Rewritten: Which block houses the Computer Science & Engineering (Data Science) Department?
94
+
95
+ User: "fees"
96
+ Rewritten: What are the fees?
97
+
98
+ User: "admission"
99
+ Rewritten: What is the admission process?
100
+
101
+ User: "acpc registration"
102
+ Rewritten: What is the ACPC registration process?
103
+
104
+ BAD EXAMPLES (NEVER DO THIS):
105
+ User: "fees"
106
+ Bad: What are the fees for the Computer Science & Engineering (Data Science) Department?
107
+
108
+ User: "placement"
109
+ Bad: What are the placement statistics for the Mechanical Engineering Department?
110
+
111
+ Query: {query}
112
+ Rewritten:"""
app/services/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ # This file is intentionally empty to prevent circular imports.
2
+ # Import services directly from their modules.
app/services/classifier_service.py ADDED
@@ -0,0 +1,337 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import numpy as np
3
+ import pickle
4
+ from sentence_transformers import SentenceTransformer
5
+ from sklearn.linear_model import LogisticRegression
6
+ from sklearn.model_selection import train_test_split, cross_val_score
7
+ from app.core.config import settings
8
+ from typing import List, Optional
9
+
10
def load_pipeline(path):
    """Unpickle and return the object stored at ``path``.

    NOTE(security): ``pickle.load`` can execute arbitrary code embedded in
    the file, so ``path`` must only ever point at a trusted artefact.
    """
    with open(path, "rb") as handle:
        return pickle.load(handle)
15
+
16
class Classifier:
    """Multi-head query classifier predicting type, category, topic and intent.

    Features are sentence embeddings concatenated with TF-IDF vectors. Each
    head in ``self.models`` must expose ``predict_proba`` and ``classes_``;
    the matching label encoder turns class values back into label strings.
    """

    def __init__(
        self,
        tfidf,
        abbreviations,
        master_index,
        le_type,
        le_category,
        le_topic,
        le_intent,
        models=None,
        df=None,
    ):
        """Store the fitted artefacts and load the embedding model.

        Args:
            tfidf: TF-IDF vectorizer (fitted, or fitted on first use).
            abbreviations: Mapping of abbreviation -> expansion text.
            master_index: Mapping of type label -> {"categories": [...], "topics": [...]}.
            le_type, le_category, le_topic, le_intent: Label encoders per head.
            models: Already-trained heads keyed by field name.
            df: Training frame with question/type/category/topic/intent
                columns; used only when ``models`` is not given.

        Raises:
            ValueError: If neither ``models`` nor ``df`` is provided.
        """
        # Fail before the (slow) embedding model is loaded.
        if models is None and df is None:
            raise ValueError("Either provide trained models or provide df to train.")

        self.tfidf = tfidf
        self.abbreviations = abbreviations
        self.master_index = master_index

        self.le_type = le_type
        self.le_category = le_category
        self.le_topic = le_topic
        self.le_intent = le_intent

        # Prefer a local copy of the embedding model; otherwise load by hub name.
        model_path = settings.embeddings_path / "mdbr-leaf-mt"
        if model_path.exists():
            self.embedding_model = SentenceTransformer(str(model_path))
        else:
            self.embedding_model = SentenceTransformer("MongoDB/mdbr-leaf-mt")

        # Prediction thresholds: below these, the field is set to None entirely
        self.threshold = {
            "type": 0.4,
            "category": 0.4,
            "topic": 0.5,
            "intent": 0.6
        }

        # Filter thresholds: at or above these, the field is used as a hard
        # filter value. Kept separate so "when to predict" and "when to
        # filter" can be tuned independently.
        self.filter_threshold = {
            "type": 0.65,
            "category": 0.65,
            "topic": 0.70,
        }

        if models is not None:
            self.models = models
        else:
            self.models = self.train_models(df)

    def _build_filter(self, result):
        """Turn one prediction dict into a metadata filter, or ``None``.

        Shape of the returned filter: ``type AND intent AND (category OR topic)``.
        Returns ``None`` when the type confidence is below the filter
        threshold, so retrieval can fall back to an unfiltered search.
        """
        if result.get("type_conf", 0) < self.filter_threshold["type"]:
            return None

        # --- Hard AND anchors ---
        hard_conditions = [{"type": result["type"]}]

        # A missing intent is treated as "detail"; "count" also matches "detail".
        intent = result.get("intent") or "detail"
        if intent == "count":
            hard_conditions.append({"$or": [{"intent": "count"}, {"intent": "detail"}]})
        else:
            hard_conditions.append({"intent": intent})

        # --- Soft OR hints ---
        # Each of category/topic always contributes exactly one condition:
        # the predicted value when confident, otherwise the "general" bucket.
        # A document needs to match only one of the two.
        soft_conditions = []
        for field in ("category", "topic"):
            value = result.get(field)
            confident = result.get(f"{field}_conf", 0) >= self.filter_threshold[field]
            soft_conditions.append({field: value if value and confident else "general"})

        return {"$and": hard_conditions + [{"$or": soft_conditions}]}

    def predict_with_filter(self, queries):
        """Classify the first query and return its metadata filter (or ``None``).

        Accepts either a list of queries or a single query string.
        """
        # A bare string would otherwise be iterated character by character.
        if isinstance(queries, str):
            queries = [queries]
        filters = self.predict(queries)[0]
        return self._build_filter(filters)

    def expand_abbreviations(self, text):
        """Lower-case ``text`` and replace whole-word abbreviations with their expansions."""
        text = text.lower().strip()
        for abbr, full in self.abbreviations.items():
            pattern = r'\b' + re.escape(abbr.lower()) + r'\b'
            # A callable replacement inserts ``full`` literally, so backslashes
            # in an expansion are not interpreted as regex escapes.
            text = re.sub(pattern, lambda _match, _full=full: _full, text)
        return text

    def get_features(self, queries):
        """Return the feature matrix (embeddings followed by TF-IDF columns), one row per query."""
        queries_clean = [self.expand_abbreviations(q) for q in queries]

        embeddings = self.embedding_model.encode(
            queries_clean, show_progress_bar=False
        )

        # An unfitted vectorizer (training path) is fitted on these queries.
        if not hasattr(self.tfidf, "vocabulary_"):
            tfidf_features = self.tfidf.fit_transform(queries_clean).toarray()
        else:
            tfidf_features = self.tfidf.transform(queries_clean).toarray()

        return np.hstack([embeddings, tfidf_features])

    def train_single(self, X, y, field, C=0.01):
        """Fit one logistic-regression head and print train/test/CV accuracy.

        The returned model is fitted on the 80% training split only.
        """
        X_train, X_test, y_train, y_test = train_test_split(
            X, y,
            test_size=0.2,
            random_state=42,
            stratify=y
        )

        clf = LogisticRegression(
            C=C,
            penalty="l2",
            solver="lbfgs",
            max_iter=2000,
            class_weight="balanced",
            random_state=42
        )

        clf.fit(X_train, y_train)

        train_acc = clf.score(X_train, y_train)
        test_acc = clf.score(X_test, y_test)

        cv_scores = cross_val_score(clf, X, y, cv=5)

        print(f"\n{field.upper()}:")
        print(f"Train: {train_acc:.3f} | Test: {test_acc:.3f} | CV: {cv_scores.mean():.3f}")

        return clf

    def train_models(self, df):
        """Train all four heads from ``df`` and return the models dict."""
        X = self.get_features(df["question"].tolist())

        # This runs from __init__ before ``self.models`` is assigned, so make
        # sure the dict exists before writing into it.
        if not hasattr(self, "models"):
            self.models = {}

        regularisation = {"type": 0.01, "category": 0.005, "topic": 0.005, "intent": 0.005}
        for field, strength in regularisation.items():
            self.models[field] = self.train_single(X, df[field].values, field, C=strength)

        return self.models

    def _allowed_labels(self, type_label, key):
        """Return the labels ``master_index`` permits for ``type_label`` (empty set if unknown)."""
        try:
            return set(self.master_index[type_label][key])
        except KeyError:
            return set()

    def _predict_label(self, field, encoder, row, allowed=None):
        """Return ``(label, confidence)`` for one feature row of ``field``.

        With ``allowed`` given, the most probable label inside that set wins;
        if no class falls in the set (or ``allowed`` is None) the overall most
        probable label is returned.
        """
        model = self.models[field]
        proba = model.predict_proba([row])[0]
        labels = encoder.inverse_transform(model.classes_)

        if allowed is not None:
            candidates = [
                (label, prob)
                for label, prob in zip(labels, proba)
                if label in allowed
            ]
            if candidates:
                label, prob = max(candidates, key=lambda pair: pair[1])
                return label, float(prob)

        idx = int(np.argmax(proba))
        return labels[idx], float(proba[idx])

    def predict(self, queries: List[str], enforce_constraints=True):
        """Classify each query and return one result dict per query.

        Each dict holds ``question`` plus ``<field>`` and ``<field>_conf`` for
        type, category, topic and intent. With ``enforce_constraints`` the
        category and topic are restricted to those ``master_index`` lists for
        the predicted type. Fields whose confidence is below
        ``self.threshold`` are reported as ``None`` with confidence 0.
        """
        X = self.get_features(queries)
        results = []

        for row, query in zip(X, queries):
            res = {"question": query}

            res["type"], res["type_conf"] = self._predict_label("type", self.le_type, row)

            # Constraints use the raw predicted type, before thresholding.
            if enforce_constraints:
                allowed_categories = self._allowed_labels(res["type"], "categories")
                allowed_topics = self._allowed_labels(res["type"], "topics")
            else:
                allowed_categories = allowed_topics = None

            res["category"], res["category_conf"] = self._predict_label(
                "category", self.le_category, row, allowed_categories
            )
            res["topic"], res["topic_conf"] = self._predict_label(
                "topic", self.le_topic, row, allowed_topics
            )
            res["intent"], res["intent_conf"] = self._predict_label("intent", self.le_intent, row)

            # Blank out low-confidence fields.
            for field, minimum in self.threshold.items():
                if res[f"{field}_conf"] < minimum:
                    res[field] = None
                    res[f"{field}_conf"] = 0

            print("=" * 50)
            print(query)
            print(f"Type: {res['type']}, {res['type_conf']}")
            print(f"Category: {res['category']}, {res['category_conf']}")
            print(f"Topic: {res['topic']}, {res['topic_conf']}")
            print(f"Intent: {res['intent']}, {res['intent_conf']}")
            print("=" * 50)

            results.append(res)

        return results
312
+
313
+
314
# Load the pickled training artefacts once, at import time, and expose both
# the individual pieces and the ready-to-use classifier ``clf``.
classifier_path = settings.classifier_path / "chatbot_classifier.pkl"
pipeline = load_pipeline(classifier_path)

models, tfidf = pipeline["models"], pipeline["tfidf"]

le_type, le_category, le_topic, le_intent = (
    pipeline[name] for name in ("le_type", "le_category", "le_topic", "le_intent")
)

MASTER_INDEX, ABBREVIATIONS = pipeline["MASTER_INDEX"], pipeline["ABBREVIATIONS"]

# Pre-trained heads are supplied, so no training happens here.
clf = Classifier(
    tfidf=tfidf,
    abbreviations=ABBREVIATIONS,
    master_index=MASTER_INDEX,
    le_type=le_type,
    le_category=le_category,
    le_topic=le_topic,
    le_intent=le_intent,
    models=models,
)
app/services/document_loader.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_core.documents import Document
2
+ from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
3
+ from langchain_community.document_loaders import UnstructuredMarkdownLoader
4
+ from pathlib import Path
5
+ from datetime import datetime
6
+ import uuid
7
+ from typing import Optional, List
8
+
9
+
10
class document_loader:
    """Thin wrapper around LangChain document loaders for a single path.

    ``filepath`` is a file for the single-document methods and a directory
    for :meth:`load_multiple`, where ``glob`` selects the files to read.
    """

    def __init__(self, filepath: Path, glob: str = "*.pdf"):
        self.filepath, self.glob = filepath, glob
        self.loader = PyPDFLoader

    # loading services
    def load(self):
        """Load the PDF at ``filepath`` and return its documents."""
        return PyPDFLoader(self.filepath).load()

    def load_md(self):
        """Load the Markdown file at ``filepath`` and return its documents."""
        markdown_loader = UnstructuredMarkdownLoader(self.filepath)
        return markdown_loader.load()

    def lazy_load(self):
        """Return the PDF loader's lazy document iterator for ``filepath``."""
        return PyPDFLoader(self.filepath).lazy_load()

    def load_multiple(self):
        """Load every file under ``filepath`` matching ``glob`` as a PDF."""
        return DirectoryLoader(
            self.filepath,
            glob=self.glob,
            loader_cls=PyPDFLoader,
        ).load()
app/services/file_service.py ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from app.core.config import settings
2
+ from typing import List, Optional, Dict, Any
3
+ from pathlib import Path
4
+ from langchain_community.document_loaders import PyMuPDFLoader
5
+ import json
6
+ from datetime import datetime
7
+ from app.utils.document_helpers import build_metadata
8
+ from langchain_core.documents import Document
9
+ import uuid
10
+
11
+ from app.utils.preprocessing import preprocess_filename
12
+
13
class FileService:
    """Manage ingested files and their metadata records.

    Metadata for every file is held in ``self.records`` (keyed by file name)
    and mirrored to a single JSON file, named after the configured
    collection, inside the data directory.
    """

    def __init__(self):
        self.settings = settings
        # The name of the file where we store metadata
        self.metadata_filename = f"{self.settings.collection_name}.json"
        # The full path to that metadata file in the data folder
        self.metadata_path = self.settings.data_path / self.metadata_filename

        self.file_storage_path = self.settings.data_path / "documents"

        # Load existing metadata if it exists, otherwise start fresh
        self.records = self.load_metadata() if self.metadata_path.exists() else {}

    def load_metadata(self) -> Dict[str, Any]:
        """Read the metadata JSON file; return ``{}`` if it is unreadable or invalid."""
        try:
            with open(self.metadata_path, 'r', encoding='utf-8') as f:
                return json.load(f)
        except (OSError, ValueError):
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            return {}

    def save_metadata(self):
        """Write the in-memory records back to the JSON file."""
        # Ensure the data directory exists
        self.metadata_path.parent.mkdir(parents=True, exist_ok=True)
        with open(self.metadata_path, 'w', encoding='utf-8') as f:
            json.dump(self.records, f, indent=4)

    def read_file(self, file_path: Path) -> Optional["Document"]:
        """Read a pdf/txt/md/json file into a ``Document`` and log its metadata.

        Returns:
            The document, or ``None`` when ``file_path`` does not exist.

        Raises:
            ValueError: If the file extension is not one of the supported types.
        """
        if not file_path.exists():
            return None

        file_name = preprocess_filename(file_path)
        if file_name.endswith(".pdf"):
            documents = PyMuPDFLoader(file_path).load()
            content = "\n".join(doc.page_content for doc in documents)
            metadata = {
                "page_count": len(documents),
                "ext": "pdf"
            }
        elif file_name.endswith((".txt", ".md")):
            # Text and Markdown share the same parsing helper; only "ext" differs.
            parsed = build_metadata(file_path)
            content = parsed['content']
            metadata = {
                **parsed['metadata'],
                "ext": "txt" if file_name.endswith(".txt") else "md"
            }
        elif file_name.endswith(".json"):
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            content = json.dumps(data["content"])
            metadata = {
                "id": data["id"],
                "title": data.get("name", data.get("title", "untitled")),
                "source": data["source"],
                "source_file": file_name or "untitled",
                "created_date": datetime.now().isoformat(),
                "type": data.get("type", "general"),
                "category": data.get("category", "general"),
                "topic": data.get("topic", "general"),
                "ext": "json"
            }
        else:
            # Previously this fell through to an UnboundLocalError.
            raise ValueError(f"Unsupported file type: {file_name}")

        doc = Document(page_content=content, metadata=metadata)
        # Fill in identifying fields only where the source did not provide them.
        doc.metadata.setdefault("id", str(uuid.uuid4()))
        doc.metadata.setdefault("title", file_name)
        doc.metadata.setdefault("source_file", file_name)
        timestamp = datetime.now().isoformat()
        doc.metadata["updated_at"] = timestamp
        doc.metadata.setdefault("created_at", timestamp)

        # Update logs to reflect that we interacted with this file
        self.update_logs(file_path, metadata)
        return doc

    def write_file(self, file_path: Path, content: str, metadata: Optional[dict] = None):
        """Store a file under the documents directory and record its metadata.

        For ``.pdf`` the bytes are copied from ``file_path`` and ``content``
        is ignored; ``.json`` content is serialised with ``json.dump``; every
        other type is written as UTF-8 text.
        """
        filename = preprocess_filename(file_path)
        file_save_path = self.file_storage_path / filename

        # Ensure the directory for the file exists
        file_save_path.parent.mkdir(parents=True, exist_ok=True)
        if filename.endswith(".pdf"):
            with open(file_path, 'rb') as source:
                raw = source.read()
            with open(file_save_path, 'wb') as target:
                target.write(raw)
        elif filename.endswith(".json"):
            with open(file_save_path, 'w', encoding='utf-8') as f:
                json.dump(content, f, indent=4)
        else:
            with open(file_save_path, 'w', encoding='utf-8') as f:
                f.write(content)

        # Update the logs with the provided metadata
        self.update_logs(file_save_path, metadata)

    def update_logs(self, file_path: Path, metadata: Optional[dict] = None):
        """Record ``metadata`` for ``file_path``; with ``None``, keep what is already stored."""
        file_name = file_path.name

        # If no metadata is provided, preserve the existing record
        # (or an empty dict for a new file).
        if metadata is None:
            metadata = self.records.get(file_name, {})

        self.manage_metadata(file_name, metadata)

    def manage_metadata(self, file_name: str, metadata: dict):
        """Set the record for ``file_name`` and persist all records to disk."""
        self.records[file_name] = metadata
        self.save_metadata()

    def patch_metadata(self, file_path: Path, metadata: dict):
        """Merge ``metadata`` into the existing record; new keys win on conflict."""
        file_name = file_path.name
        original_metadata = self.records.get(file_name, {})
        self.manage_metadata(
            file_name=file_name,
            metadata={
                **original_metadata,
                **metadata
            }
        )

    def get_records(self) -> Dict[str, Any]:
        """Return all stored metadata records."""
        return self.records

    def get_record(self, file_name: str) -> Optional[Dict[str, Any]]:
        """Return the metadata for one file, or ``None`` if there is no record."""
        return self.records.get(file_name)

    def delete_record(self, file_name: str) -> bool:
        """Remove a record and persist; return whether a record existed."""
        if file_name in self.records:
            del self.records[file_name]
            self.save_metadata()
            return True
        return False

    def update_record(self, file_name: str, metadata: dict) -> bool:
        """Replace the metadata of an existing record; return ``False`` if it does not exist."""
        if file_name in self.records:
            self.records[file_name] = metadata
            self.save_metadata()
            return True
        return False
196
+
197
+ # Initialize a globally accessible service instance
198
+ file_service = FileService()
app/services/filter-demo ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ANCHORS = {
2
+ "type": {
3
+ "department": "department academic branch faculty courses engineering science",
4
+ "facility": "facility building campus lab central infrastructure",
5
+ "service": "service office administration student support section",
6
+ "hostel": "hostel dormitory residence accommodation warden mess",
7
+ "library": "library books journal reading catalog lending",
8
+ "placement": "placement recruitment company offer package career tnp",
9
+ "research": "research patent publication funded grant scholar",
10
+ "club": "club society committee nss ncc ieee cultural extracurricular",
11
+ "admission": "admission enrollment intake eligibility registration criteria",
12
+ },
13
+
14
+ "category": {
15
+ # Keep it short — name first, then 4-5 unique discriminative terms
16
+ "applied_mechanics": "applied mechanics AM statics dynamics stress strain",
17
+ "chemical": "chemical engineering ChE chemistry reaction process plant",
18
+ "civil": "civil engineering CE construction structural geotechnical survey",
19
+ "computer": "computer engineering CE hardware microprocessor VLSI embedded digital",
20
+ "cse_ds": "computer science CSE data science AI machine learning neural network",
21
+ "electronics_comm": "electronics communication ECE signal RF wireless antenna analog",
22
+ "electronics_inst": "electronics instrumentation EI biomedical sensors transducer measurement",
23
+ "electrical": "electrical engineering EE power motor transformer transmission drives",
24
+ "it": "information technology IT software ERP cloud database devops",
25
+ "ict": "information communication technology ICT telecom fiber networking protocol",
26
+ "instrumentation": "instrumentation control IC PLC SCADA automation feedback industrial",
27
+ "mechanical": "mechanical engineering ME thermal manufacturing CAD CAM machining fluid",
28
+ "power_electronics": "power electronics PE converter inverter MOSFET IGBT rectifier chopper",
29
+ "science_humanities": "science humanities SH physics mathematics english communication foundation",
30
+
31
+ "transport": "transport bus route commute shuttle pickup drop campus travel",
32
+ "finance": "finance fees tuition payment charges fine scholarship due",
33
+ "medical": "medical health doctor dispensary clinic nurse first aid",
34
+ "sports": "sports ground gym cricket football badminton court fitness",
35
+ "grievance": "grievance complaint harassment redressal scst women discrimination",
36
+ "forms": "forms bonafide certificate download application NOC document",
37
+
38
+ "ug": "undergraduate UG BE BTech bachelor four year first year gujcet jee",
39
+ "pg": "postgraduate PG ME MTech MBA MCA master two year gate",
40
+
41
+ "tnp": "tnp training placement cell campus drive offer letter coordinator",
42
+ "patent": "patent intellectual property invention filed granted rights",
43
+ "ssip": "ssip startup student innovation gujarat seed funding incubation",
44
+ "funded": "funded grant DST DRDO ISRO sponsored external research project",
45
+ "publication": "publication journal paper conference scopus SCI citation article",
46
+
47
+ "nss": "nss national service scheme volunteer blood donation community camp",
48
+ "ncc": "ncc national cadet corps army navy air force drill parade",
49
+ "ieee": "ieee electrical electronics engineers student chapter symposium",
50
+ "iei": "iei institution engineers india professional chapter membership",
51
+ "adventure": "adventure trekking hiking outdoor camping expedition nature club",
52
+ "women_cell": "women cell WDC empowerment gender equality ladies committee",
53
+
54
+ "principal": "principal director head institution chairman governing management",
55
+ "accreditation": "accreditation NBA NIRF NAAC AICTE GTU ranking grade approval",
56
+ "awards": "awards achievements recognition felicitation distinction honor trophy",
57
+ },
58
+
59
+ "topic": {
60
+ "faculty": "faculty professor lecturer HOD staff designation qualification phd",
61
+ "lab": "lab laboratory practical equipment instruments apparatus experiment",
62
+ "syllabus": "syllabus curriculum subjects units topics chapters semester GTU",
63
+ "timetable": "timetable class schedule period slot timing routine batch",
64
+ "event": "event fest hackathon seminar competition workshop cultural program",
65
+ "project": "project final year mini SIH capstone dissertation submission",
66
+ "virtual_tour": "virtual tour 360 view online walkthrough campus interactive",
67
+ "notice": "notice notification announcement circular bulletin update board",
68
+ "fees": "fees tuition charges structure breakdown payment due scholarship",
69
+ "rules": "rules regulations discipline policy conduct code guidelines norms",
70
+ "facilities": "facilities amenities wifi mess canteen gym recreation available",
71
+ "contact": "contact phone email address reach person call office",
72
+ "process": "process procedure steps apply method eligibility criteria workflow",
73
+ "document": "document certificate bonafide migration TC attestation official",
74
+ "route": "route bus stop pickup drop timing schedule commute point",
75
+ "stats": "statistics total number count figures record percentage ratio data",
76
+ "calendar": "calendar academic dates holidays exam deadlines semester schedule",
77
+ "vision": "vision mission goals objectives values purpose motto statement",
78
+ "induction": "induction orientation welcome freshman new student speaker activity",
79
+ },
80
+
81
+ "intent": {
82
+ "list": "list all show every what are enumerate display available options",
83
+ "count": "how many total count number quantity strength size",
84
+ "detail": "what is explain describe tell me about information overview",
85
+ "process": "how to apply steps procedure guide method approach eligibility",
86
+ "greeting": "hello hi hey good morning good evening namaste greetings",
87
+ },
88
+ }
89
# Lower-case department abbreviation -> expanded name.
# Consumed by FilterClassifier.handle_abbreviations to rewrite a query before
# it is embedded.
# NOTE(review): several keys ("it", "me", "am") are also ordinary English
# words — confirm that expanding them in free-text queries is intended.
ABBREVIATIONS = {
    "ce": "computer engineering",
    "cse": "computer science engineering",
    "ds": "data science",
    "it": "information technology",
    "ict": "information communication technology",
    "ece": "electronics communication engineering",
    "ei": "electronics instrumentation engineering",
    "ic": "instrumentation control",
    "ee": "electrical engineering",
    "pe": "power electronics",
    "me": "mechanical engineering",
    "am": "applied mechanics",
}
103
# For each "type" label: the "category" and "topic" labels that are valid in
# combination with it. FilterClassifier.classify discards a predicted category
# or topic that is not listed under the predicted type.
# A "categories" list of [None] means the type has no categories.
# NOTE(review): "result" (under "admission") has no entry in ANCHORS["topic"],
# so it can never be predicted — confirm whether an anchor is missing.
MASTER_INDEX = {
    "department": {
        "categories": [
            "applied_mechanics", "chemical", "civil", "computer", "cse_ds",
            "electronics_comm", "electronics_inst", "electrical", "it", "ict",
            "instrumentation", "mechanical", "power_electronics", "science_humanities"
        ],
        "topics": [
            "faculty", "lab", "syllabus", "timetable", "event",
            "project", "virtual_tour", "notice", "contact", "stats"
        ]
    },
    "facility": {"categories": [None], "topics": ["lab", "facilities", "contact", "virtual_tour"]},
    "service": {
        "categories": ["transport", "finance", "medical", "sports", "grievance", "forms", "principal", "accreditation", "awards"],
        "topics": ["fees", "rules", "process", "document", "route", "contact", "calendar", "stats", "vision", "induction"]
    },
    "hostel": {"categories": [None], "topics": ["fees", "rules", "facilities", "contact"]},
    "library": {"categories": [None], "topics": ["fees", "rules", "facilities", "contact", "calendar", "document"]},
    "placement": {"categories": ["tnp"], "topics": ["stats", "process", "contact", "event"]},
    "research": {"categories": ["patent", "ssip", "funded", "publication"], "topics": ["stats", "project", "process", "contact"]},
    "club": {"categories": ["nss", "ncc", "ieee", "iei", "adventure", "women_cell"], "topics": ["event", "contact", "process", "stats", "notice"]},
    "admission": {"categories": ["ug", "pg"], "topics": ["process", "fees", "document", "result", "calendar", "contact"]}
}
127
+
128
+ from sentence_transformers import SentenceTransformer
129
+ from sklearn.metrics.pairwise import cosine_similarity
130
+ import numpy as np
131
+ import re
132
+ from app.core.config import settings
133
+ from typing import Optional
134
+
135
class FilterClassifier:
    """Classify a free-text query into type/category/topic/intent filters.

    Every label text in ``ANCHORS`` is embedded once at construction time. A
    query is embedded at classification time and, for each field, the label
    with the highest cosine similarity is accepted when its score exceeds
    ``threshold``.
    """

    def __init__(self, threshold: Optional[float] = None):
        """Load the embedding model and pre-compute the anchor embeddings.

        Args:
            threshold: Minimum cosine similarity (exclusive) a label must
                reach to be accepted. Defaults to 0.4 when ``None``.
        """
        self.anchor_embeddings = {}
        self.model = SentenceTransformer(str(settings.embeddings_path / "bge-small"))
        self._build_anchor_embeddings()
        self.threshold = threshold if threshold is not None else 0.4

    def _build_anchor_embeddings(self):
        """Encode every anchor text in ``ANCHORS``; no-op if already built."""
        if self.anchor_embeddings:
            return
        for domain, anchors in ANCHORS.items():
            self.anchor_embeddings[domain] = {
                label: self.model.encode(text) for label, text in anchors.items()
            }

    def handle_abbreviations(self, query: str) -> str:
        """Expand known abbreviations that appear as whole words in ``query``.

        The previous implementation used ``str.replace``, which rewrote
        abbreviations inside other words ("ce" in "placement") and re-expanded
        text produced by earlier replacements ("ee" in "engineering"). This
        version makes a single pass and only matches whole words,
        case-insensitively, so "CE" and "ce" are both expanded.

        NOTE(review): abbreviations that are also English words ("it", "me",
        "am") are still expanded when they occur as whole words.
        """
        if not ABBREVIATIONS:
            return query
        lookup = {abbr.lower(): full_form for abbr, full_form in ABBREVIATIONS.items()}
        # Longest first so a longer abbreviation is preferred over its prefix.
        alternatives = "|".join(
            re.escape(abbr) for abbr in sorted(lookup, key=len, reverse=True)
        )
        pattern = re.compile(rf"\b(?:{alternatives})\b", re.IGNORECASE)
        return pattern.sub(lambda match: lookup[match.group(0).lower()], query)

    def classify(self, query: str) -> dict:
        """Return the predicted filters for ``query``.

        Returns:
            A dict with keys ``type``, ``category``, ``topic``, ``year`` and
            ``intent``. A field is ``None`` when no label clears the
            threshold, when no four-digit ``20xx`` year is found, or when the
            predicted category/topic is not valid for the predicted type
            according to ``MASTER_INDEX``.
        """
        query = self.handle_abbreviations(query)
        query_emb = self.model.encode(query)

        result = {"type": None, "category": None, "topic": None, "year": None, "intent": None}

        for field, value_embeddings in self.anchor_embeddings.items():
            scores = {
                val: cosine_similarity([query_emb], [emb])[0][0]
                for val, emb in value_embeddings.items()
            }
            best_val = max(scores, key=scores.get)
            best_score = scores[best_val]
            print(scores, best_val)
            print(best_val, best_score)

            # Only accept if confidence is above threshold
            if best_score > self.threshold:
                result[field] = best_val

        year = re.search(r"\b(20\d{2})\b", query)
        result["year"] = int(year.group()) if year else None

        if result["type"] is None:
            # Without a type there is nothing to validate against.
            result["category"] = None
            result["topic"] = None
            return result

        allowed = MASTER_INDEX[result["type"]]
        if result["category"] not in allowed["categories"]:
            result["category"] = None
        if result["topic"] not in allowed["topics"]:
            result["topic"] = None

        return result
196
+
197
+ classifier = FilterClassifier()
app/services/filter_classifier copy.py ADDED
@@ -0,0 +1,334 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ ANCHORS = {
3
+
4
+ # ─────────────────────────────────────────────────
5
+ # TYPE — discriminative, non-overlapping
6
+ # ─────────────────────────────────────────────────
7
+ "type": {
8
+ "department": "department academic branch division faculty staff courses offered semester",
9
+ "facility": "facility central infrastructure campus physical space building block floor room",
10
+ "service": "service administrative office cell section support helpdesk student welfare",
11
+ "hostel": "hostel dormitory residence hall accommodation mess warden boarding lodging",
12
+ "library": "library books journals reading room catalog issue return lending periodicals",
13
+ "placement": "placement recruitment hired campus drive offer letter company package lpa tnp",
14
+ "research": "research innovation funded grant patent publication lab project scholar phd",
15
+ "club": "club society student chapter committee extracurricular nss ncc ieee cultural sports",
16
+ "admission": "admission enrollment application intake registration eligibility criteria merit joining",
17
+ },
18
+
19
+ # ─────────────────────────────────────────────────
20
+ # CATEGORY — must contain the obvious name first
21
+ # ─────────────────────────────────────────────────
22
+ "category": {
23
+
24
+ # ── departments ──
25
+ "applied_mechanics": "applied mechanics AM statics dynamics solid mechanics fluid mechanics stress strain deformation",
26
+ "chemical": "chemical engineering ChE chemistry process plant reaction distillation thermodynamics petrochemical",
27
+ "civil": "civil engineering CE construction structural geotechnical surveying transportation concrete roads bridges",
28
+ "computer": "computer engineering CE computer hardware microprocessor VLSI embedded systems digital circuits processor chip architecture",
29
+ "cse_ds": "computer science CSE data science DS artificial intelligence machine learning neural network deep learning NLP analytics algorithm",
30
+ "electronics_comm": "electronics communication ECE EC signal processing analog RF wireless antenna microwave telecommunication",
31
+ "electronics_inst": "electronics instrumentation EI biomedical sensors LVDT transducer measurement calibration control systems",
32
+ "electrical": "electrical engineering EE power systems generation transmission distribution motor transformer drives induction synchronous",
33
+ "it": "information technology IT software development ERP cloud computing database devops web application enterprise",
34
+ "ict": "information communication technology ICT telecom networking fiber optic protocol bandwidth routing switching internet",
35
+ "instrumentation": "instrumentation control IC PLC SCADA automation process control feedback loop industrial plant",
36
+ "mechanical": "mechanical engineering ME mech thermal fluid manufacturing machining CAD CAM turbine heat transfer production",
37
+ "power_electronics": "power electronics PE converter inverter MOSFET IGBT rectifier chopper switching drives variable frequency",
38
+ "science_humanities": "science humanities SH physics chemistry mathematics english communication basic science applied science foundation",
39
+
40
+ # ── service ──
41
+ "transport": "transport bus route commute shuttle vehicle pickup drop timing campus travel conveyance",
42
+ "finance": "finance fees tuition payment semester charges fine scholarship refund due bank challan",
43
+ "medical": "medical health doctor dispensary clinic first aid nurse campus sick injury treatment",
44
+ "sports": "sports ground gym fitness cricket football badminton volleyball court track field athletics",
45
+ "grievance": "grievance complaint redressal harassment scst obc women discrimination appeal committee inquiry",
46
+ "forms": "forms download bonafide certificate application document tc migration no objection NOC",
47
+
48
+ # ── admission ──
49
+ "ug": "undergraduate UG BE BTech bachelor four year degree engineering first year admission gujcet jee lateral",
50
+ "pg": "postgraduate PG ME MTech MBA MCA master two year degree admission gate mat entrance",
51
+
52
+ # ── placement ──
53
+ "tnp": "tnp training placement cell campus recruitment company drive package offer letter placement officer coordinator",
54
+
55
+ # ── research ──
56
+ "patent": "patent intellectual property IP invention filed granted innovation protection rights",
57
+ "ssip": "ssip startup student innovation project gujarat government seed funding incubation entrepreneurship",
58
+ "funded": "funded sponsored externally grant DST ISRO DRDO government industry collaborative research project",
59
+ "publication": "publication journal paper conference proceedings scopus SCI research article citation author",
60
+
61
+ # ── club ──
62
+ "nss": "nss national service scheme volunteer social community service blood donation camp awareness",
63
+ "ncc": "ncc national cadet corps army navy air force cadet drill parade certificate b c",
64
+ "ieee": "ieee institute electrical electronics engineers student chapter technical symposium paper",
65
+ "iei": "iei institution engineers india professional body student chapter membership",
66
+ "adventure": "adventure advanature nature trekking outdoor hiking camping expedition rock climbing club",
67
+ "women_cell": "women development cell WDC empowerment gender equality ladies committee harassment redressal",
68
+
69
+ # ── administration ──
70
+ "principal": "principal director head of institution management chairman governing body top administration",
71
+ "accreditation": "accreditation NBA NIRF NAAC AICTE GTU affiliation ranking approval grade score",
72
+ "awards": "awards achievements recognition felicitation distinction honor national state rank trophy",
73
+ },
74
+
75
+ # ─────────────────────────────────────────────────
76
+ # TOPIC — aspects, clearly separated from each other
77
+ # ─────────────────────────────────────────────────
78
+ "topic": {
79
+ "faculty": "faculty professor lecturer instructor assistant professor associate professor HOD teaching staff designation qualification phd",
80
+ "lab": "laboratory lab practical experiment equipment instruments workshop hands-on setup apparatus bench",
81
+ "syllabus": "syllabus curriculum course content subjects units topics chapters semester wise GTU prescribed",
82
+ "timetable": "timetable class schedule routine period slot lecture timing weekly daily batch division",
83
+ "event": "event events fest hackathon seminar workshop competition cultural technical program organized upcoming",
84
+ "project": "project final year mini SIH capstone student work dissertation major submission",
85
+ "virtual_tour": "virtual tour 360 degree view online walkthrough campus room infrastructure interactive map",
86
+ "notice": "notice notification announcement circular update bulletin board recent latest information",
87
+ "fees": "fees tuition charges amount structure semester breakdown fine late scholarship payment due",
88
+ "rules": "rules regulations discipline policy code conduct guidelines norms behaviour dress restriction",
89
+ "facilities": "facilities amenities available infrastructure wifi internet mess canteen gym recreation services provided",
90
+ "contact": "contact phone number email address reach person call department office location",
91
+ "process": "process procedure steps how to apply method eligibility criteria requirement workflow sequence",
92
+ "document": "document certificate bonafide migration leaving tc attestation verification required official",
93
+ "route": "route bus stop timing pickup drop point schedule commute map destination",
94
+ "stats": "statistics data figures record total number count percentage ratio achievement placement pass",
95
+ "calendar": "calendar academic dates holidays exam schedule important deadlines events semester start end",
96
+ "vision": "vision mission goals objectives values purpose statement motto philosophy aim",
97
+ "induction": "induction orientation welcome program new student freshman speaker activity schedule",
98
+ },
99
+
100
+ # ─────────────────────────────────────────────────
101
+ # INTENT — must be semantically far apart
102
+ # ─────────────────────────────────────────────────
103
+ "intent": {
104
+ "list": "list all show every what are all available options display give me all enumerate",
105
+ "count": "how many total count number quantity how much strength size",
106
+ "detail": "what is explain describe tell me about information overview summary background",
107
+ "process": "how to apply steps procedure method way guide eligibility criteria approach",
108
+ "greeting": "hello hi hey good morning good afternoon good evening how are you namaste greetings",
109
+ },
110
+ }
111
# Lower-case abbreviation -> expansion, looked up per whitespace-separated
# token by FilterClassifier.handle_abbreviations.
#
# NOTE(review): this literal used to define "ce" and "me" twice. Python keeps
# the last value for a repeated key, so "ce": "computer engineering" and
# "me": "mechanical engineering" were dead entries. They are removed here and
# the values that were actually in effect are kept. Both abbreviations are
# genuinely ambiguous (CE = computer/civil, ME = mechanical/master of
# engineering) — confirm which expansion is wanted.
ABBREVIATIONS = {
    # departments
    "cse": "computer science engineering",
    "ds": "data science",
    "it": "information technology",
    "ict": "information communication technology",
    "ece": "electronics communication engineering",
    "ei": "electronics instrumentation engineering",
    "ic": "instrumentation control",
    "ee": "electrical engineering",
    "pe": "power electronics",
    "am": "applied mechanics",
    "che": "chemical engineering",
    "ch": "chemical",
    "ce": "civil engineering",
    # programmes
    "ug": "undergraduate",
    "pg": "postgraduate",
    "be": "bachelor of engineering",
    "btech": "bachelor of technology",
    "me": "master of engineering",
    "mtech": "master of technology",
    "mba": "master of business administration",
    "mca": "master of computer applications",
    # cells, clubs and bodies
    "tnp": "training and placement",
    "nss": "national service scheme",
    "ncc": "national cadet corps",
    "ieee": "institute of electrical and electronics engineers",
    "iei": "institution of engineers india",
    "wdc": "women development cell",
    "sip": "student innovation project",
    "gtu": "gujarat technological university",
    "nba": "national board of accreditation",
    "naac": "national assessment and accreditation council",
    "nirf": "national institutional ranking framework",
    "aicte": "all india council for technical education",
    "drdo": "defence research and development organisation",
    "isro": "indian space research organisation",
    "dst": "department of science and technology",
    "sih": "smart india hackathon",
    # administrative terms
    "lpa": "lakhs per annum",
    "noc": "no objection certificate",
    "tc": "transfer certificate",
    "hod": "head of department",
    "phd": "doctor of philosophy",
    # technical terms
    "scada": "supervisory control and data acquisition",
    "plc": "programmable logic controller",
    "lvdt": "linear variable differential transformer",
    "mosfet": "metal oxide semiconductor field effect transistor",
    "igbt": "insulated gate bipolar transistor",
    "vlsi": "very large scale integration",
    "cad": "computer aided design",
    "cam": "computer aided manufacturing",
    "erp": "enterprise resource planning",
    "rf": "radio frequency",
    "nlp": "natural language processing",
    "ai": "artificial intelligence",
    "ml": "machine learning",
    "scopus": "scopus",
    "sci": "science citation index",
    "ip": "intellectual property",
}
173
# For each "type" label: the "category" and "topic" labels that may be
# predicted together with it. FilterClassifier.classify passes these lists as
# the allowed label subset when scoring category and topic.
# A "categories" list of [None] means the type has no categories, so category
# classification is skipped for it.
# NOTE(review): "result" (under "admission") has no entry in ANCHORS["topic"],
# so it has no embedding to score against — confirm whether an anchor is
# missing.
MASTER_INDEX = {
    "department": {
        "categories": [
            "applied_mechanics", "chemical", "civil", "computer", "cse_ds",
            "electronics_comm", "electronics_inst", "electrical", "it", "ict",
            "instrumentation", "mechanical", "power_electronics", "science_humanities"
        ],
        "topics": [
            "faculty", "lab", "syllabus", "timetable", "event",
            "project", "virtual_tour", "notice", "contact"
        ]
    },
    "facility": {"categories": [None], "topics": ["lab", "facilities", "contact", "virtual_tour"]},
    "service": {
        "categories": ["transport", "finance", "medical", "sports", "grievance", "forms", "principal", "accreditation", "awards"],
        "topics": ["fees", "rules", "process", "document", "route", "contact", "calendar", "stats", "vision", "induction"]
    },
    "hostel": {"categories": [None], "topics": ["fees", "rules", "facilities", "contact"]},
    "library": {"categories": [None], "topics": ["fees", "rules", "facilities", "contact", "calendar", "document"]},
    "placement": {"categories": ["tnp"], "topics": ["stats", "process", "contact", "event"]},
    "research": {"categories": ["patent", "ssip", "funded", "publication"], "topics": ["stats", "project", "process", "contact"]},
    "club": {"categories": ["nss", "ncc", "ieee", "iei", "adventure", "women_cell"], "topics": ["event", "contact", "process", "stats", "notice"]},
    "admission": {"categories": ["ug", "pg"], "topics": ["process", "fees", "document", "result", "calendar", "contact"]}
}
197
+
198
+ from sentence_transformers import SentenceTransformer
199
+ from sklearn.metrics.pairwise import cosine_similarity
200
+ import numpy as np
201
+ import re
202
+ from app.core.config import settings
203
+ from typing import Optional
204
+ from rank_bm25 import BM25Okapi
205
+
206
# Per-field acceptance threshold: the best label's combined score must be
# strictly greater than this value, otherwise the field is reported as None.
FIELD_THRESHOLDS = {
    "type": 0.25,      # Was 0.5 - too high for embedding-heavy field
    "category": 0.5,   # Keep - BM25-heavy works well
    "topic": 0.4,      # Was 0.5 - slight reduction
    "intent": 0.5,     # Keep - usually clear signals
}


# Per-field mixing weights as (embedding_weight, bm25_weight). The combined
# score is embedding_weight * cosine similarity + bm25_weight * normalized
# BM25 score.
FIELD_WEIGHTS = {
    "type": (0.6, 0.4),       # embedding-heavy — semantic
    "category": (0.35, 0.65), # BM25-heavy — exact names matter most
    "topic": (0.55, 0.45),
    "intent": (0.7, 0.3),     # embedding-heavy — semantic intent
}
220
+
221
class FilterClassifier:
    """Hybrid (embedding + BM25) classifier that turns a query into filters.

    For every field in ``ANCHORS`` each label is represented twice: as a
    sentence embedding and as a tokenized BM25 document. A query is scored
    against both, the two scores are mixed with ``FIELD_WEIGHTS``, and the
    best label is accepted when it clears the field's threshold.
    """

    def __init__(self, threshold=None):
        """Load the embedding model and build both indexes.

        Args:
            threshold: Optional single threshold applied to every field. When
                ``None`` (the default) the per-field ``FIELD_THRESHOLDS`` are
                used. Previously this argument was stored but never consulted.
        """
        self.model = SentenceTransformer(str(settings.embeddings_path / "bge-small"))
        self.anchor_embeddings = {}
        self.bm25_classifiers = {}
        self.anchor_keys = {}
        self._build_anchor_embeddings()
        self._build_bm25()
        self.threshold = threshold

    def _build_anchor_embeddings(self):
        """Embed ``"<label> <anchor text>"`` for every label of every field."""
        for field, anchors in ANCHORS.items():
            self.anchor_embeddings[field] = {
                label: self.model.encode(f"{label} {text}")
                for label, text in anchors.items()
            }

    def _build_bm25(self):
        """Build one BM25 index per field over the lower-cased anchor tokens."""
        for field, anchors in ANCHORS.items():
            self.anchor_keys[field] = list(anchors.keys())
            docs = [f"{label} {text}".lower().split() for label, text in anchors.items()]
            self.bm25_classifiers[field] = BM25Okapi(docs)

    def handle_abbreviations(self, query: str) -> str:
        """Lower-case ``query`` and expand each whitespace-separated token
        that is a key of ``ABBREVIATIONS``."""
        return " ".join(ABBREVIATIONS.get(token, token) for token in query.lower().split())

    def preprocess(self, query: str) -> str:
        """Lower-case ``query``, replace punctuation with spaces and collapse
        runs of whitespace."""
        query = re.sub(r"[^\w\s]", " ", query.lower())
        return re.sub(r"\s+", " ", query).strip()

    def classify(self, query: str) -> dict:
        """Return the predicted filters for ``query``.

        Returns:
            A dict with keys ``type``, ``category``, ``topic``, ``year`` and
            ``intent``; a field is ``None`` when nothing clears its threshold.
            ``category`` and ``topic`` are only attempted once a ``type`` has
            been predicted, and are restricted to the labels ``MASTER_INDEX``
            lists for that type.
        """
        # Strip punctuation BEFORE expanding abbreviations: expansion matches
        # whole tokens, so "cse?" or "ce," would otherwise never be expanded.
        query = self.handle_abbreviations(self.preprocess(query))
        query_emb = self.model.encode(query)
        tokenized = query.split()

        result = {"type": None, "category": None, "topic": None, "year": None, "intent": None}

        # 1. Classify primary fields (type and intent)
        result["type"] = self._get_best_match("type", query_emb, tokenized)
        result["intent"] = self._get_best_match("intent", query_emb, tokenized)

        # 2. Extract year (independent of the other fields)
        year_match = re.search(r"\b(20\d{2})\b", query)
        result["year"] = int(year_match.group()) if year_match else None

        # 3. Cascading classification for category and topic
        if result["type"]:
            valid_config = MASTER_INDEX.get(result["type"], {})

            valid_cats = valid_config.get("categories", [])
            # [None] marks a type without categories.
            if valid_cats and valid_cats != [None]:
                result["category"] = self._get_best_match(
                    "category", query_emb, tokenized, allowed_labels=valid_cats
                )

            valid_topics = valid_config.get("topics", [])
            if valid_topics:
                result["topic"] = self._get_best_match(
                    "topic", query_emb, tokenized, allowed_labels=valid_topics
                )

        return result

    def _get_best_match(self, field: str, query_emb: np.ndarray, tokenized: list, allowed_labels: Optional[list] = None) -> Optional[str]:
        """Find the best label for ``field``, optionally within a label subset.

        Args:
            field: Key of ``ANCHORS`` to score against.
            query_emb: Embedding of the query.
            tokenized: Query tokens for BM25 scoring.
            allowed_labels: When given and non-empty, only these labels are
                candidates; labels without an anchor score 0.

        Returns:
            The winning label, or ``None`` when there are no candidates or the
            best combined score does not exceed the threshold.
        """
        keys = self.anchor_keys[field]
        value_embeddings = self.anchor_embeddings[field]

        # If restricted, only consider allowed labels
        target_keys = allowed_labels if allowed_labels else keys

        # 1. Embedding scores
        emb_scores = {
            val: cosine_similarity([query_emb], [value_embeddings[val]])[0][0]
            for val in target_keys if val in value_embeddings
        }

        # 2. BM25 scores, normalized by the GLOBAL maximum so that a subset is
        #    not rescaled relative to labels outside it.
        raw_bm25 = self.bm25_classifiers[field].get_scores(tokenized)
        global_bm25_max = max(raw_bm25) if len(raw_bm25) > 0 and max(raw_bm25) > 0 else 1

        # Label -> position lookup, built once instead of keys.index() per label.
        position = {label: idx for idx, label in enumerate(keys)}
        normalized_bm25 = {
            val: raw_bm25[position[val]] / global_bm25_max
            for val in target_keys if val in position
        }

        # 3. Combine with weights
        emb_w, bm25_w = FIELD_WEIGHTS[field]
        combined = {
            val: (emb_w * emb_scores.get(val, 0)) + (bm25_w * normalized_bm25.get(val, 0))
            for val in target_keys
        }

        if not combined:
            return None

        best_val = max(combined, key=combined.get)
        best_score = combined[best_val]

        print(f"{field}: {best_val} | combined={best_score:.3f} emb={emb_scores.get(best_val,0):.3f} bm25={normalized_bm25.get(best_val,0):.3f}")

        # An explicit constructor threshold overrides the per-field defaults.
        limit = FIELD_THRESHOLDS[field] if self.threshold is None else self.threshold
        return best_val if best_score > limit else None
333
+
334
+ classifier = FilterClassifier()
app/services/filter_classifier.py ADDED
@@ -0,0 +1,529 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ ANCHORS = {
3
+
4
+ # ─────────────────────────────────────────────────
5
+ # TYPE — written as exemplar descriptions so the
6
+ # SentenceTransformer embeddings are maximally
7
+ # separated. Each entry must be clearly distinct.
8
+ # ─────────────────────────────────────────────────
9
+ "type": {
10
+ "department": (
11
+ "Which engineering department offers this course? "
12
+ "Tell me about the academic branch, its faculty, subjects, labs, syllabus and semester structure."
13
+ ),
14
+ "facility": (
15
+ "Where is this campus facility located? "
16
+ "Show me the central infrastructure, physical spaces, buildings, blocks, floors and rooms on campus."
17
+ ),
18
+ "service": (
19
+ "How do I use this administrative service? "
20
+ "I need help from an office, cell, section or support desk for student welfare."
21
+ ),
22
+ "hostel": (
23
+ "What are the hostel rules and accommodation details? "
24
+ "I want to know about the dormitory, residence hall, mess, warden and boarding facilities."
25
+ ),
26
+ "library": (
27
+ "How do I borrow books from the library? "
28
+ "Tell me about the reading room, book catalog, journal issue, return and lending system."
29
+ ),
30
+ "placement": (
31
+ "Which companies came for campus placement this year? "
32
+ "I want to know about recruitment drives, offer letters, packages, TNP cell and placement statistics."
33
+ ),
34
+ "research": (
35
+ "How do I apply for a funded research project? "
36
+ "Tell me about grants, patents, publications, PhD scholars and innovation at the institute."
37
+ ),
38
+ "club": (
39
+ "How do I join a student club or society? "
40
+ "Tell me about extracurricular chapters like NSS, NCC, IEEE and cultural or sports committees."
41
+ ),
42
+ "admission": (
43
+ "What is the admission process to join this college? "
44
+ "I want to know about enrollment, eligibility criteria, merit list, application and registration."
45
+ ),
46
+ },
47
+
48
+ # ─────────────────────────────────────────────────
49
+ # CATEGORY — exemplar sentences, grouped by parent
50
+ # type. Unique, distinctive keywords embedded in
51
+ # natural sentences to prevent cross-category leaks.
52
+ # ─────────────────────────────────────────────────
53
+ "category": {
54
+
55
+ # ── departments ──
56
+ "applied_mechanics": (
57
+ "The Applied Mechanics department covers statics, dynamics, solid mechanics, "
58
+ "fluid mechanics, stress-strain analysis and structural deformation."
59
+ ),
60
+ "chemical": (
61
+ "The Chemical Engineering department covers chemistry, thermodynamics, reaction engineering, "
62
+ "distillation, process plant design and petrochemical processes."
63
+ ),
64
+ "civil": (
65
+ "The Civil Engineering department covers structural engineering, construction, "
66
+ "geotechnical surveying, transportation, concrete design, roads and bridges."
67
+ ),
68
+ "computer": (
69
+ "The Computer Engineering department covers computer hardware, microprocessors, "
70
+ "VLSI design, embedded systems, digital circuits and chip architecture."
71
+ ),
72
+ "cse_ds": (
73
+ "The Computer Science and Data Science department covers algorithms, artificial intelligence, "
74
+ "machine learning, neural networks, deep learning, NLP and data analytics."
75
+ ),
76
+ "electronics_comm": (
77
+ "The Electronics and Communication Engineering department covers signal processing, "
78
+ "analog circuits, RF systems, wireless communication, antennas and microwave technology."
79
+ ),
80
+ "electronics_inst": (
81
+ "The Electronics and Instrumentation Engineering department covers sensors, transducers, "
82
+ "LVDT, biomedical instrumentation, measurement, calibration and control systems."
83
+ ),
84
+ "electrical": (
85
+ "The Electrical Engineering department covers power systems, generation, transmission, "
86
+ "distribution, motors, transformers, induction machines and synchronous drives."
87
+ ),
88
+ "it": (
89
+ "The Information Technology department covers software development, ERP systems, "
90
+ "cloud computing, databases, DevOps, web applications and enterprise solutions."
91
+ ),
92
+ "ict": (
93
+ "The Information and Communication Technology department covers telecom, networking, "
94
+ "fiber optics, routing, switching, bandwidth and internet protocols."
95
+ ),
96
+ "instrumentation": (
97
+ "The Instrumentation and Control department covers PLC, SCADA, automation, "
98
+ "process control, feedback loops and industrial control plant systems."
99
+ ),
100
+ "mechanical": (
101
+ "The Mechanical Engineering department covers manufacturing, machining, thermal engineering, "
102
+ "fluid mechanics, CAD CAM design, turbines, heat transfer and production."
103
+ ),
104
+ "power_electronics": (
105
+ "The Power Electronics department covers converters, inverters, MOSFETs, IGBTs, "
106
+ "rectifiers, choppers, variable frequency drives and switching circuits."
107
+ ),
108
+ "science_humanities": (
109
+ "The Science and Humanities department covers applied physics, chemistry, mathematics, "
110
+ "English communication and basic foundation sciences for engineering."
111
+ ),
112
+
113
+ # ── service categories ──
114
+ "transport": (
115
+ "The college transport service operates bus routes for commuting. "
116
+ "I want to know the bus stop, timing, pickup and drop schedule and shuttle conveyance."
117
+ ),
118
+ "finance": (
119
+ "The finance office handles tuition fees, semester payment, fine, scholarship, refund and bank challan."
120
+ ),
121
+ "medical": (
122
+ "The campus medical facility has a doctor, dispensary and clinic for first aid, "
123
+ "nursing and student health treatment."
124
+ ),
125
+ "sports": (
126
+ "The sports facility has grounds, a gym and courts for cricket, football, "
127
+ "badminton, volleyball, athletics and fitness activities."
128
+ ),
129
+ "grievance": (
130
+ "The grievance redressal cell handles student complaints about harassment, "
131
+ "discrimination based on SC ST OBC, gender issues and appeal inquiries."
132
+ ),
133
+ "forms": (
134
+ "I need to download a bonafide certificate, NOC, migration form, TC or no-objection document from the college."
135
+ ),
136
+
137
+ # ── admission categories ──
138
+ "ug": (
139
+ "Undergraduate BE BTech bachelor degree admission through GUJCET JEE. "
140
+ "Four year engineering program, first year intake, lateral entry eligibility."
141
+ ),
142
+ "pg": (
143
+ "Postgraduate ME MTech MBA MCA master degree admission through GATE MAT entrance exam. "
144
+ "Two year program eligibility and registration process."
145
+ ),
146
+
147
+ # ── placement categories ──
148
+ "tnp": (
149
+ "The Training and Placement cell organizes campus recruitment drives. "
150
+ "Companies visit for interviews, offer letters and placement packages are coordinated by the placement officer."
151
+ ),
152
+
153
+ # ── research categories ──
154
+ "patent": (
155
+ "A patent was filed for a new invention idea. "
156
+ "Intellectual property protection, granted innovation rights for student or faculty work."
157
+ ),
158
+ "ssip": (
159
+ "The SSIP scheme funds student startup and innovation projects. "
160
+ "Gujarat government provides seed money, incubation and entrepreneurship support."
161
+ ),
162
+ "funded": (
163
+ "This is a sponsored research project funded by DST, ISRO or DRDO. "
164
+ "External grant, industry collaboration, government funded research work."
165
+ ),
166
+ "publication": (
167
+ "A research paper was published in a Scopus or SCI journal. "
168
+ "Conference proceedings, citation, article authorship and research publication record."
169
+ ),
170
+
171
+ # ── club categories ──
172
+ "nss": (
173
+ "NSS National Service Scheme organizes volunteer activities, blood donation camps, "
174
+ "social awareness programs and community service for students."
175
+ ),
176
+ "ncc": (
177
+ "NCC National Cadet Corps trains cadets in army, navy and air force drills, "
178
+ "parade, and issues B and C certificates."
179
+ ),
180
+ "ieee": (
181
+ "The IEEE student chapter organizes technical symposiums, paper presentations "
182
+ "and seminars for electrical and electronics engineering students."
183
+ ),
184
+ "iei": (
185
+ "The IEI Institution of Engineers India student chapter is a professional body "
186
+ "offering membership and extracurricular technical activities."
187
+ ),
188
+ "adventure": (
189
+ "The adventure club organizes trekking, hiking, camping, outdoor expeditions "
190
+ "and rock climbing activities in nature."
191
+ ),
192
+ "women_cell": (
193
+ "The Women Development Cell promotes gender equality, ladies empowerment, "
194
+ "and handles harassment redressal for female students and staff."
195
+ ),
196
+
197
+ # ── administration categories ──
198
+ "principal": (
199
+ "The principal is the head of the institution. "
200
+ "The director, chairman and governing body manage top-level college administration."
201
+ ),
202
+ "accreditation": (
203
+ "The college has NBA, NAAC, NIRF rankings and AICTE GTU affiliation. "
204
+ "Accreditation grade, approval score and institutional ranking details."
205
+ ),
206
+ "awards": (
207
+ "The college has received awards and recognition at national and state level. "
208
+ "Students and faculty have achieved distinctions, trophies and rank honors."
209
+ ),
210
+ },
211
+
212
+ # ─────────────────────────────────────────────────
213
+ # TOPIC — aspects of a subject. Written as distinct
214
+ # question fragments to separate overlapping terms.
215
+ # ─────────────────────────────────────────────────
216
+ "topic": {
217
+ "faculty": (
218
+ "Who are the faculty members? I want to know about professors, lecturers, "
219
+ "HOD designation, teaching staff qualification and PhD details."
220
+ ),
221
+ "lab": (
222
+ "Where is the laboratory? I want to know about practical experiments, "
223
+ "equipment, instruments, workshop setup and apparatus in the lab."
224
+ ),
225
+ "syllabus": (
226
+ "What is the course syllabus? Show me the curriculum, subjects, units, "
227
+ "chapters and semester-wise GTU prescribed course content."
228
+ ),
229
+ "timetable": (
230
+ "What is the class timetable? I need the lecture schedule, period slots, "
231
+ "weekly routine and batch division timing."
232
+ ),
233
+ "event": (
234
+ "What events are coming up? Tell me about the fest, hackathon, seminar, "
235
+ "workshop, cultural program or technical competition organized."
236
+ ),
237
+ "project": (
238
+ "Tell me about student projects. I want to know about final year projects, "
239
+ "mini projects, SIH capstone work and dissertation submissions."
240
+ ),
241
+ "virtual_tour": (
242
+ "Can I take a virtual tour of the campus? "
243
+ "Show me the 360-degree online walkthrough, interactive map of rooms and infrastructure."
244
+ ),
245
+ "notice": (
246
+ "Are there any new notices? Show me the latest announcements, circulars, "
247
+ "bulletin board updates and recent notifications."
248
+ ),
249
+ "fees": (
250
+ "What are the fees? I want the tuition fee structure, semester charges, "
251
+ "fine, late fee, scholarship and payment due breakdown."
252
+ ),
253
+ "rules": (
254
+ "What are the rules? Tell me about college regulations, discipline policy, "
255
+ "code of conduct, dress code and behaviour guidelines."
256
+ ),
257
+ "facilities": (
258
+ "What facilities are available? Tell me about amenities like WiFi, mess, canteen, "
259
+ "gym, recreation areas and other campus services provided."
260
+ ),
261
+ "contact": (
262
+ "How do I contact them? I need the phone number, email address, "
263
+ "office location and person to reach at the department."
264
+ ),
265
+ "process": (
266
+ "What is the process to apply? Tell me the step-by-step procedure, "
267
+ "method, eligibility requirement and workflow sequence."
268
+ ),
269
+ "document": (
270
+ "What documents do I need? I need a bonafide certificate, migration form, "
271
+ "leaving certificate, TC or official attestation and verification."
272
+ ),
273
+ "route": (
274
+ "What is the bus route? Tell me the bus stop, pickup and drop point, "
275
+ "commute map, schedule and destination timing."
276
+ ),
277
+ "stats": (
278
+ "What are the statistics? Show me data, figures, total numbers, pass percentage, "
279
+ "ratio, achievements and placement records."
280
+ ),
281
+ "calendar": (
282
+ "What does the academic calendar look like? Show me exam dates, holidays, "
283
+ "semester start and end, and important event deadlines."
284
+ ),
285
+ "vision": (
286
+ "What is the vision and mission of the college? "
287
+ "Tell me the goals, objectives, values, motto and philosophy statement."
288
+ ),
289
+ "induction": (
290
+ "When is the induction program? Tell me about the orientation, welcome program, "
291
+ "freshman schedule, speaker list and new student activities."
292
+ ),
293
+ },
294
+
295
+ # ─────────────────────────────────────────────────
296
+ # INTENT — semantically distant action patterns.
297
+ # Use strong, distinct phrasing to avoid overlap.
298
+ # ─────────────────────────────────────────────────
299
+ "intent": {
300
+ "list": (
301
+ "List all available options. Show me every item. "
302
+ "Give me a complete enumeration. Display all choices."
303
+ ),
304
+ "count": (
305
+ "How many are there? What is the total count? "
306
+ "Tell me the number, quantity and strength."
307
+ ),
308
+ "detail": (
309
+ "What is this? Explain it to me. Describe and tell me about it. "
310
+ "I want an overview, summary and background information."
311
+ ),
312
+ "process": (
313
+ "How do I do this? What are the steps? "
314
+ "Guide me through the procedure, method and eligibility requirements."
315
+ ),
316
+ "greeting": (
317
+ "Hello! Hi, good morning, good evening. "
318
+ "How are you? Namaste. Hey, greetings to you."
319
+ ),
320
+ },
321
+ }
322
+ ABBREVIATIONS = {
323
+ # Departments
324
+ "ce": "computer civil engineering",
325
+ "cse": "computer science engineering",
326
+ "ds": "data science",
327
+ "it": "information technology",
328
+ "ict": "information communication technology",
329
+ "ece": "electronics communication engineering",
330
+ "ei": "electronics instrumentation engineering",
331
+ "ic": "instrumentation control",
332
+ "ee": "electrical engineering",
333
+ "pe": "power electronics",
334
+ "me": "mechanical master engineering",
335
+ "am": "applied mechanics",
336
+ "che": "chemical engineering",
337
+ "ch": "chemical",
338
+
339
+ # Degrees & Admission
340
+ "ug": "undergraduate",
341
+ "pg": "postgraduate",
342
+ "be": "bachelor engineering",
343
+ "btech": "bachelor technology",
344
+ "mtech": "master technology",
345
+ "mba": "master business administration",
346
+ "mca": "master computer applications",
347
+
348
+ # Organizations & Cells
349
+ "tnp": "training placement",
350
+ "nss": "national service scheme",
351
+ "ncc": "national cadet corps",
352
+ "ieee": "institute electrical electronics engineers",
353
+ "iei": "institution engineers india",
354
+ "wdc": "women development cell",
355
+ "sip": "student innovation project",
356
+ "gtu": "gujarat technological university",
357
+ "nba": "national board accreditation",
358
+ "naac": "national assessment accreditation council",
359
+ "nirf": "national institutional ranking framework",
360
+ "aicte": "all india council technical education",
361
+ "drdo": "defence research development organisation",
362
+ "isro": "indian space research organisation",
363
+ "dst": "department science technology",
364
+
365
+ # General
366
+ "sih": "smart india hackathon",
367
+ "lpa": "lakhs per annum",
368
+ "noc": "no objection certificate",
369
+ "tc": "transfer certificate",
370
+ "hod": "head department",
371
+ "phd": "doctor philosophy",
372
+
373
+ # Technical
374
+ "scada": "supervisory control data acquisition",
375
+ "plc": "programmable logic controller",
376
+ "lvdt": "linear variable differential transformer",
377
+ "mosfet": "metal oxide semiconductor field effect transistor",
378
+ "igbt": "insulated gate bipolar transistor",
379
+ "vlsi": "very large scale integration",
380
+ "cad": "computer aided design",
381
+ "cam": "computer aided manufacturing",
382
+ "erp": "enterprise resource planning",
383
+ "rf": "radio frequency",
384
+ "nlp": "natural language processing",
385
+ "ai": "artificial intelligence",
386
+ "ml": "machine learning",
387
+ "ip": "intellectual property",
388
+ }
389
+ MASTER_INDEX = {
390
+ "department": {
391
+ "categories": [
392
+ "applied_mechanics", "chemical", "civil", "computer", "cse_ds",
393
+ "electronics_comm", "electronics_inst", "electrical", "it", "ict",
394
+ "instrumentation", "mechanical", "power_electronics", "science_humanities"
395
+ ],
396
+ "topics": [
397
+ "faculty", "lab", "syllabus", "timetable", "event",
398
+ "project", "virtual_tour", "notice", "contact"
399
+ ]
400
+ },
401
+ "facility": {"categories": [None], "topics": ["lab", "facilities", "contact", "virtual_tour"]},
402
+ "service": {
403
+ "categories": ["transport", "finance", "medical", "sports", "grievance", "forms", "principal", "accreditation", "awards"],
404
+ "topics": ["fees", "rules", "process", "document", "route", "contact", "calendar", "stats", "vision", "induction"]
405
+ },
406
+ "hostel": {"categories": [None], "topics": ["fees", "rules", "facilities", "contact"]},
407
+ "library": {"categories": [None], "topics": ["fees", "rules", "facilities", "contact", "calendar", "document"]},
408
+ "placement": {"categories": ["tnp"], "topics": ["stats", "process", "contact", "event"]},
409
+ "research": {"categories": ["patent", "ssip", "funded", "publication"], "topics": ["stats", "project", "process", "contact"]},
410
+ "club": {"categories": ["nss", "ncc", "ieee", "iei", "adventure", "women_cell"], "topics": ["event", "contact", "process", "stats", "notice"]},
411
+ "admission": {"categories": ["ug", "pg"], "topics": ["process", "fees", "document", "result", "calendar", "contact"]}
412
+ }
413
+
414
+ from sentence_transformers import SentenceTransformer
415
+ from sklearn.metrics.pairwise import cosine_similarity
416
+ import numpy as np
417
+ import re
418
+ from app.core.config import settings
419
+ from typing import Optional
420
+ from rank_bm25 import BM25Okapi
421
+
422
+ FIELD_THRESHOLDS = {
423
+ "type": 0.25, # Was 0.5 - too high for embedding-heavy field
424
+ "category": 0.5, # Keep - BM25-heavy works well
425
+ "topic": 0.4, # Was 0.5 - slight reduction
426
+ "intent": 0.5, # Keep - usually clear signals
427
+ }
428
+
429
+
430
+ FIELD_WEIGHTS = {
431
+ "type": (0.6, 0.4), # embedding-heavy — semantic
432
+ "category": (0.35, 0.65), # BM25-heavy — exact names matter most
433
+ "topic": (0.55, 0.45),
434
+ "intent": (0.7, 0.3), # embedding-heavy — semantic intent
435
+ }
436
+
437
class FilterClassifier:
    """Classify a free-text query into metadata filter fields.

    For every field in ``ANCHORS`` (``type``, ``category``, ``topic``,
    ``intent``) the query is scored against each anchor sentence twice:

    * cosine similarity between SentenceTransformer embeddings, and
    * a BM25 score over the whitespace-tokenised anchor text, normalised
      by the best BM25 score of that field.

    The two scores are blended with the per-field weights in
    ``FIELD_WEIGHTS`` and the best label is accepted only when the blended
    score exceeds the per-field value in ``FIELD_THRESHOLDS``.
    """

    # Abbreviation keys that are also everyday English words. Expanding them
    # blindly rewrites "tell me about it" into department names, so they are
    # only expanded when the user typed them in capitals ("IT", "ME").
    _COMMON_WORD_ABBREVIATIONS = frozenset({"it", "me", "be", "am"})

    # Splits a token into (leading punctuation, core, trailing punctuation)
    # so that "CSE?" or "(IT)" can still be looked up.
    _TOKEN_PARTS = re.compile(r"^(\W*)(.*?)(\W*)$")

    def __init__(self, threshold=None):
        """Load the embedding model and pre-compute all anchor indexes.

        Parameters
        ----------
        threshold:
            Stored on the instance as ``self.threshold``.
            NOTE(review): ``classify`` currently reads its thresholds from
            ``FIELD_THRESHOLDS`` only and never consults this value.
        """
        self.model = SentenceTransformer(str(settings.embeddings_path / "bge-small"))
        self.anchor_embeddings = {}
        self.bm25_classifiers = {}
        self.anchor_keys = {}
        self._build_anchor_embeddings()
        self._build_bm25()
        self.threshold = threshold

    def _build_anchor_embeddings(self):
        """Embed ``"<label> <anchor text>"`` for every label of every field."""
        for field, anchors in ANCHORS.items():
            self.anchor_embeddings[field] = {
                label: self.model.encode(f"{label} {text}")
                for label, text in anchors.items()
            }

    def _build_bm25(self):
        """Build one BM25 index per field over the lower-cased anchor texts.

        ``self.anchor_keys[field][i]`` is the label of document ``i`` in
        ``self.bm25_classifiers[field]``.
        """
        for field, anchors in ANCHORS.items():
            keys = list(anchors.keys())
            docs = [f"{label} {text}".lower().split() for label, text in anchors.items()]
            self.anchor_keys[field] = keys
            self.bm25_classifiers[field] = BM25Okapi(docs)

    def handle_abbreviations(self, query: str) -> str:
        """Return *query* lower-cased with known abbreviations expanded.

        Tokens are split on whitespace. Punctuation attached to a token is
        ignored for the lookup and re-attached around the expansion, so
        ``"CSE?"`` becomes ``"computer science engineering?"``. Keys listed
        in ``_COMMON_WORD_ABBREVIATIONS`` are expanded only when written in
        upper case; otherwise the ordinary English word is kept.
        """
        expanded = []
        for token in query.split():
            lead, core, trail = self._TOKEN_PARTS.match(token).groups()
            key = core.lower()
            is_plain_word = key in self._COMMON_WORD_ABBREVIATIONS and not core.isupper()
            if key in ABBREVIATIONS and not is_plain_word:
                expanded.append(f"{lead}{ABBREVIATIONS[key]}{trail}")
            else:
                expanded.append(token.lower())
        return " ".join(expanded)

    def preprocess(self, query: str) -> str:
        """Lower-case *query*, replace punctuation with spaces and collapse whitespace."""
        # Remove punctuation except spaces
        query = re.sub(r'[^\w\s]', ' ', query.lower())
        # Handle multiple spaces
        query = re.sub(r'\s+', ' ', query).strip()
        return query

    def classify(self, query: str) -> dict:
        """Classify *query* and return the detected filter values.

        Returns
        -------
        dict
            Keys ``type``, ``category``, ``topic``, ``year`` and ``intent``.
            A value is ``None`` when no label cleared its threshold.
            ``year`` is the first ``20xx`` number found in the query.
            ``category`` and ``topic`` are reset to ``None`` when no type
            was detected or when they are not listed for the detected type
            in ``MASTER_INDEX``.
        """
        query = self.handle_abbreviations(query)
        query = self.preprocess(query)
        query_emb = self.model.encode(query)
        tokenized = query.split()
        result = {"type": None, "category": None, "topic": None, "year": None, "intent": None}

        for field, value_embeddings in self.anchor_embeddings.items():
            keys = self.anchor_keys[field]

            # Embedding scores
            emb_scores = {
                val: cosine_similarity([query_emb], [emb])[0][0]
                for val, emb in value_embeddings.items()
            }

            # BM25 scores, normalised by the best score of this field.
            # When nothing matched (best score <= 0) divide by 1 instead.
            raw_bm25 = self.bm25_classifiers[field].get_scores(tokenized)
            top_bm25 = max(raw_bm25)
            bm25_max = top_bm25 if top_bm25 > 0 else 1
            bm25_scores = {key: raw_bm25[i] / bm25_max for i, key in enumerate(keys)}

            # Combine
            emb_w, bm25_w = FIELD_WEIGHTS[field]
            combined = {
                val: emb_w * emb_scores[val] + bm25_w * bm25_scores.get(val, 0)
                for val in keys
            }

            best_val = max(combined, key=combined.get)
            best_score = combined[best_val]
            threshold = FIELD_THRESHOLDS[field]

            print(f"{field}: {best_val} | combined={best_score:.3f} emb={emb_scores[best_val]:.3f} bm25={bm25_scores.get(best_val,0):.3f}")

            if best_score > threshold:
                result[field] = best_val

        year = re.search(r"\b(20\d{2})\b", query)
        result["year"] = int(year.group()) if year else None

        if result["type"] is None:
            # Category and topic are only meaningful relative to a type.
            result["category"] = None
            result["topic"] = None
        else:
            allowed = MASTER_INDEX[result["type"]]
            if result["category"] not in allowed["categories"]:
                result["category"] = None
            if result["topic"] not in allowed["topics"]:
                result["topic"] = None

        return result
528
+
529
+ classifier = FilterClassifier()
app/services/hybrid_retrieval.py ADDED
@@ -0,0 +1,354 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Hybrid Retrieval Service
3
+ ========================
4
+ Combines BM25 (keyword) and vector (semantic) retrieval using
5
+ Reciprocal Rank Fusion (RRF) for stable, well-tested score merging.
6
+
7
+ Strategy:
8
+ 1. Run vector similarity search → fetches top-k candidates from ChromaDB.
9
+ 2. Those same candidate documents become the BM25 corpus (no second DB call).
10
+ 3. Fuse both ranked lists using RRF.
11
+ 4. Apply an optional title-match boost post-fusion.
12
+ 5. Return the top-k results.
13
+
14
+ Why RRF instead of EnsembleRetriever?
15
+ - EnsembleRetriever depends on langchain_classic which is unstable.
16
+ - RRF is score-agnostic: it only uses rank order, so you never need to
17
+ normalise BM25 scores against cosine distances.
18
+ - It's the standard fusion method in production hybrid search systems
19
+ (used by Elasticsearch, Cohere, etc.).
20
+ """
21
+
22
+ from __future__ import annotations
23
+
24
+ from dataclasses import dataclass, field
25
+ from typing import List, Optional, Dict
26
+
27
+ from langchain_community.retrievers import BM25Retriever
28
+ from langchain_core.documents import Document
29
+
30
+ from app.utils.preprocessing import preprocess
31
+ from app.services.classifier_service import clf
32
+
33
+
34
+ # ---------------------------------------------------------------------------
35
+ # Configuration
36
+ # ---------------------------------------------------------------------------
37
+
38
@dataclass
class HybridRetrievalConfig:
    """Centralised configuration for the hybrid retrieval pipeline.

    All fields have defaults, so ``HybridRetrievalConfig()`` is a valid
    configuration. Field order is part of the positional constructor.
    """

    # Number of candidates fetched by the vector search. BM25 does not query
    # the store; it re-ranks this same candidate pool.
    candidate_k: int = 15

    # Final number of documents returned after fusion + reranking
    top_k: int = 5

    # Weights for fused score: bm25_weight + vector_weight should equal 1.0
    # (not validated; the values are used as plain multipliers in RRF).
    bm25_weight: float = 0.6
    vector_weight: float = 0.4

    # RRF constant – larger k smooths rank differences (standard default: 60)
    rrf_k: int = 60

    # BM25 hyperparameters
    bm25_k1: float = 1.5   # term frequency saturation
    bm25_b: float = 0.5    # length normalisation

    # Title-match boost: added to fused score for each query word found in title.
    # NOTE(review): with the defaults above the RRF part of a score is at most
    # 0.6/61 + 0.4/61 ≈ 0.016, so a single title match (0.1) outweighs it.
    title_boost_per_word: float = 0.1

    # Minimum fused score to include a result (set to 0.0 to disable).
    # Compared against the score after it has been multiplied by 10.
    score_threshold: float = 0.0
64
+
65
+
66
+ # ---------------------------------------------------------------------------
67
+ # Result type
68
+ # ---------------------------------------------------------------------------
69
+
70
@dataclass
class RetrievalResult:
    """A single retrieved document with provenance scores.

    Instances are created by the fusion step with ``fused_score=0.0`` and
    then updated in place as each ranking, boost and scaling step runs.
    """
    # The retrieved document.
    document: Document
    # Running score: weighted RRF terms, then filter/title boosts, then x10.
    fused_score: float
    bm25_rank: Optional[int] = None    # rank in BM25 list (1-indexed), None if absent
    vector_rank: Optional[int] = None  # rank in vector list (1-indexed), None if absent
    # Total title boost that was added to fused_score (0.0 when none applied).
    title_boost: float = 0.0
78
+
79
+
80
+ # ---------------------------------------------------------------------------
81
+ # Core service
82
+ # ---------------------------------------------------------------------------
83
+
84
class HybridRetrievalService:
    """
    Hybrid retrieval that fuses BM25 and vector search results via RRF.

    Usage
    -----
    ::

        service = HybridRetrievalService(vector_db=rag.db)
        results = service.retrieve(query="Faculties of Computer Department")
        for r in results:
            print(r.fused_score, r.document.page_content[:80])

    NOTE(review): ``retrieve`` stores the classifier output of the current
    query on ``self.raw_filters``, so one instance should not serve
    overlapping calls.
    """

    # Multiplier applied to a result's score when the classifier predicted a
    # field value with confidence > _HIGH_CONFIDENCE and the document's
    # metadata carries the same value. "intent" never takes this path; it
    # only receives the medium-confidence multiplier below.
    _FILTER_BOOST = {
        "type": 1.10,
        "category": 1.20,
        "topic": 1.20,
        "intent": 1.05,
    }
    _HIGH_CONFIDENCE = 0.90
    _MEDIUM_CONFIDENCE = 0.7
    _MEDIUM_CONFIDENCE_BOOST = 1.05

    def __init__(
        self,
        vector_db,
        config: Optional[HybridRetrievalConfig] = None,
    ):
        """
        Parameters
        ----------
        vector_db:
            A LangChain-compatible vector store (e.g., Chroma instance from
            ``RAGService.db``) that supports ``similarity_search_with_score``.
        config:
            Optional configuration object. Defaults to ``HybridRetrievalConfig()``.
        """
        self.vector_db = vector_db
        self.cfg = config or HybridRetrievalConfig()
        self.classifier = clf
        self.raw_filters = {}

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def retrieve(
        self,
        query: str,
    ) -> List[RetrievalResult]:
        """
        Run hybrid retrieval and return ranked results.

        Vector search runs once to fetch the candidate pool from ChromaDB.
        Those documents are immediately reused as the BM25 corpus, so there
        is no redundant database call.

        While ranking, each candidate's ``page_content`` is temporarily
        replaced by ``"<title>: <preprocessed content>"`` (or just the
        preprocessed content when the document has no title). The original
        text is restored before returning and the ``original_content``
        metadata entry is left as an empty string.

        Parameters
        ----------
        query:
            The raw user query (abbreviation expansion is applied internally).

        Returns
        -------
        List[RetrievalResult]
            Top-k results sorted by descending fused score. Scores are
            multiplied by 10 before ``cfg.score_threshold`` is applied.
        """
        processed_query = self.classifier.expand_abbreviations(query)
        print("Processed Query: ", processed_query)

        # Step 1: Single vector search — produces both the ranking AND the candidate pool
        vector_ranking = self._vector_rank(processed_query)

        if not vector_ranking:
            return []

        # Step 2: Turn the vector hits into the BM25 corpus. The same Document
        # objects are mutated, so both rankings share identical content keys.
        candidate_docs = []
        for doc, _score, _rank in vector_ranking:
            doc.metadata["original_content"] = doc.page_content
            body = preprocess(doc.page_content)
            # Documents without a title used to raise KeyError here.
            title = doc.metadata.get("title")
            doc.page_content = f"{title}: {body}" if title else body
            candidate_docs.append(doc)

        # Step 3: BM25 search over the same candidate pool (no extra DB call)
        bm25_ranking = self._bm25_rank(processed_query, candidate_docs)

        # Step 4: Fuse both rankings via RRF
        fused = self._reciprocal_rank_fusion(bm25_ranking, vector_ranking)

        # Step 5: Boost scores based on filter confidence
        self._apply_filter_boost(fused)

        # Step 6: Boost documents whose title shares words with the query
        boosted = self._apply_title_boost(fused, processed_query)

        # Step 7: Rescale, restore the original text, filter, sort, return top-k
        for r in boosted:
            r.fused_score = r.fused_score * 10
            r.document.page_content = r.document.metadata["original_content"]
            r.document.metadata["original_content"] = ""
        results = [r for r in boosted if r.fused_score >= self.cfg.score_threshold]
        results = sorted(results, key=lambda r: r.fused_score, reverse=True)
        return results[: self.cfg.top_k]

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    def _apply_filter_boost(self, results: List[RetrievalResult]) -> None:
        """Multiply scores of documents whose metadata matches the classifier.

        Reads the prediction stored in ``self.raw_filters`` by
        ``_vector_rank``: ``raw_filters[field]`` is the predicted value and
        ``raw_filters[f"{field}_conf"]`` its confidence (0 when absent).
        Scores are updated in place.
        """
        for result in results:
            metadata = result.document.metadata

            for field in ("type", "category", "topic", "intent"):
                predicted = self.raw_filters.get(field)
                if predicted is None or field not in metadata:
                    continue
                if metadata[field] != predicted:
                    continue

                conf = self.raw_filters.get(f"{field}_conf", 0)
                if conf > self._HIGH_CONFIDENCE and field != "intent":
                    result.fused_score *= self._FILTER_BOOST[field]
                elif conf > self._MEDIUM_CONFIDENCE:
                    result.fused_score *= self._MEDIUM_CONFIDENCE_BOOST

    def _bm25_rank(
        self,
        processed_query: str,
        candidate_docs: List[Document],
    ) -> List[tuple[Document, float, int]]:
        """
        Run BM25 over the candidate pool.

        Returns a list of (document, raw_bm25_score, rank) tuples,
        ordered by descending score (rank is 1-indexed).
        """
        retriever = BM25Retriever.from_documents(
            candidate_docs,
            bm25_params={"k1": self.cfg.bm25_k1, "b": self.cfg.bm25_b},
        )

        # Tokenise the query the same way the retriever tokenised the corpus.
        tokens = retriever.preprocess_func(processed_query)
        raw_scores = retriever.vectorizer.get_scores(tokens)

        # Pair each document with its BM25 score and sort descending
        scored = sorted(
            zip(retriever.docs, raw_scores),
            key=lambda x: x[1],
            reverse=True,
        )

        return [(doc, score, rank + 1) for rank, (doc, score) in enumerate(scored)]

    def _vector_rank(
        self,
        processed_query: str,
    ) -> List[tuple[Document, float, int]]:
        """
        Run vector similarity search against ChromaDB.

        The classifier is asked for metadata filters first. When it returns
        filters, the search is restricted to them; if that yields nothing
        (or there are no filters) an unfiltered search is used instead.
        The classifier's raw prediction is stored on ``self.raw_filters``
        for the later filter-boost step.

        Returns a list of (document, similarity_score, rank) tuples in the
        order returned by the store (rank is 1-indexed), with
        ``similarity = 1 - distance``.
        """
        self.raw_filters = {}
        filters = self.classifier.predict_with_filter([processed_query])
        self.raw_filters = self.classifier.predict([processed_query])[0]

        raw_results = []
        if filters:
            raw_results = self.vector_db.similarity_search_with_score(
                processed_query, k=self.cfg.candidate_k, filter=filters
            )
        if not raw_results:
            # No filters, or the filtered search came back empty.
            raw_results = self.vector_db.similarity_search_with_score(
                processed_query, k=self.cfg.candidate_k
            )

        return [
            (doc, 1.0 - distance, rank + 1)
            for rank, (doc, distance) in enumerate(raw_results)
        ]

    def _reciprocal_rank_fusion(
        self,
        bm25_ranking: List[tuple[Document, float, int]],
        vector_ranking: List[tuple[Document, float, int]],
    ) -> List[RetrievalResult]:
        """
        Fuse two ranked lists using Reciprocal Rank Fusion (RRF).

        RRF score for a document d:
            score(d) = w_bm25 * 1/(k + rank_bm25(d))
                     + w_vec  * 1/(k + rank_vec(d))

        Documents not present in a list are simply omitted from that term.
        The full ``page_content`` string is the deduplication key, so two
        documents with identical content are merged into one result.
        """
        rrf_k = self.cfg.rrf_k

        # content -> RetrievalResult, in first-seen order
        fused: Dict[str, RetrievalResult] = {}

        sources = (
            (bm25_ranking, self.cfg.bm25_weight, "bm25_rank"),
            (vector_ranking, self.cfg.vector_weight, "vector_rank"),
        )
        for ranking, weight, rank_attr in sources:
            for doc, _raw_score, rank in ranking:
                key = doc.page_content
                if key not in fused:
                    fused[key] = RetrievalResult(document=doc, fused_score=0.0)
                fused[key].fused_score += weight * (1.0 / (rrf_k + rank))
                setattr(fused[key], rank_attr, rank)

        return list(fused.values())

    def _apply_title_boost(
        self,
        results: List[RetrievalResult],
        processed_query: str,
    ) -> List[RetrievalResult]:
        """
        Boost fused score for documents whose title contains query words.

        Each distinct query word that occurs in the preprocessed title adds
        ``cfg.title_boost_per_word`` to the score. Results are updated in
        place and the same list is returned.

        NOTE(review): matching is a substring test, so short words such as
        "it" also match inside longer title words. With the default
        configuration one match (0.1) is larger than the whole RRF score
        (at most about 0.016), so title hits dominate the ranking.
        """
        query_words = set(processed_query.lower().split())

        for result in results:
            # A missing or None title means there is nothing to match.
            raw_title = result.document.metadata.get("title") or ""
            title = preprocess(raw_title.lower())
            if not title:
                continue

            boost = sum(
                self.cfg.title_boost_per_word
                for word in query_words
                if word and word in title
            )
            result.title_boost = boost
            result.fused_score += boost

        return results
app/services/ingestion_service.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from app.services.file_service import file_service
3
+ from app.services.text_splitter import TextSplitter
4
+ from langchain_core.documents import Document
5
+ from app.utils.preprocessing import normalize
6
+ import json
7
+ from typing import List
8
+ from fastapi import HTTPException
9
+
10
class IngestionService:
    """
    Turn an uploaded file into a list of chunk ``Document`` objects.

    JSON files are expanded into one chunk per topic/intent; every other
    file type is split with ``TextSplitter`` and normalised.
    """

    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
        """
        Args:
            chunk_size: Maximum chunk length handed to the text splitter.
            chunk_overlap: Overlap between consecutive text chunks.
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.text_splitter = TextSplitter(chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap)

    def load_file(self, file_path: Path):
        """
        Read ``file_path``, persist it through ``file_service`` and chunk it.

        Returns:
            List of chunk documents (JSON or text strategy chosen by
            ``metadata["ext"]``).
        """
        document = file_service.read_file(file_path)
        metadata = document.metadata

        # Persist the file (and its metadata) before chunking.
        file_service.write_file(file_path, document.page_content, metadata)

        if metadata["ext"] == "json":
            return self.handle_json_docs(document, metadata)
        return self.handle_text_docs(document, file_path, metadata)

    def ingest(self, file_path: Path):
        """Load and chunk ``file_path``; thin alias of ``load_file``."""
        return self.load_file(file_path)

    def get_records(self):
        """Return the stored file records from ``file_service``."""
        return file_service.get_records()

    def delete_record(self, filename: str):
        """Delete the stored record for ``filename`` via ``file_service``."""
        return file_service.delete_record(filename)

    def path_record(self, file_path: Path, metadata: dict):
        """Patch the stored metadata of ``file_path`` with ``metadata``."""
        file_service.patch_metadata(file_path, metadata)

    def handle_json_docs(self, document: Document, metadata: dict) -> List[Document]:
        """
        Expand a JSON document into per-topic chunks.

        Expected content shape::

            { topic: { "list": [...], "detail": "text" }, ... }

        For every topic a numbered "list" chunk and a "count" chunk are
        produced from the non-blank list items, plus a "detail" chunk when
        detail text is present. ``chunk_index`` increases across all chunks.
        """
        docs = []
        json_data = json.loads(document.page_content)
        count = 0

        for key, value in json_data.items():
            if not isinstance(value, dict):
                # Malformed topic entry: nothing to expand.
                continue

            for intent, intent_content in value.items():
                if intent == "list":
                    # Drop null/blank entries first so that the numbering has
                    # no gaps and the reported total matches the listed items.
                    items = [
                        str(item).strip()
                        for item in (intent_content or [])
                        if item is not None
                    ]
                    items = [item for item in items if item]
                    if not items:
                        continue

                    chunk = "".join(
                        f"{position}. {item}\n"
                        for position, item in enumerate(items, start=1)
                    )
                    docs.append(Document(page_content=chunk, metadata={**metadata, "topic": key, "intent": intent, "chunk_index": count}))
                    count += 1
                    docs.append(Document(page_content=f"Total {key}: {len(items)}", metadata={**metadata, "topic": key, "intent": "count", "chunk_index": count}))
                    count += 1

                elif intent in ("detail", "details"):
                    if not isinstance(intent_content, str):
                        # Null or non-text detail: skip instead of crashing.
                        continue
                    chunk = intent_content.strip()
                    if chunk == "":
                        continue
                    docs.append(Document(page_content=chunk, metadata={**metadata, "topic": key, "intent": "detail", "chunk_index": count}))
                    count += 1
        return docs

    def handle_text_docs(self, document: Document, file_path: Path, metadata: dict) -> List[Document]:
        """
        Split a text document and return normalised, non-empty chunks.

        Each chunk carries the splitter metadata merged with ``metadata``,
        plus ``source`` (file name) and ``chunk_index`` (position in the
        splitter output, so indices may skip chunks that normalise to empty).
        """
        docs = []
        pieces = self.text_splitter.split_documents([document])

        for idx, piece in enumerate(pieces):
            # Build a fresh Document rather than mutating the splitter output.
            normalized = Document(
                page_content=normalize(piece.page_content),
                metadata={
                    **piece.metadata,
                    **metadata,
                    "source": file_path.name,
                    "chunk_index": idx
                }
            )

            # Keep only chunks that still contain text after normalisation.
            if len(normalized.page_content.strip()) > 0:
                docs.append(normalized)

        return docs


ingestion_service = IngestionService()
app/services/rag_service.py ADDED
@@ -0,0 +1,483 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ import time
3
+
4
+ # LangChain Imports
5
+ from fastapi import HTTPException
6
+ from langchain_core.prompts import PromptTemplate
7
+ from langchain_chroma import Chroma
8
+ from langchain.messages import HumanMessage, AIMessage, SystemMessage
9
+ from typing import Optional, List
10
+ from app.api.schemas.tests import TestRequestSchema, TestClassifierReqSchema
11
+ from sklearn.metrics import accuracy_score, f1_score, classification_report, precision_score, recall_score
12
+
13
+ # Locals
14
+ from app.services.text_splitter import TextSplitter
15
+ from app.services.vector_store import VectorStore
16
+ from app.utils.preprocessing import normalize, preprocess_documents, preprocess_query
17
+ from app.utils.document_helpers import get_references, create_documents, build_metadata, get_references_v2
18
+ from app.prompts import SYSTEM_PROMPT, wrap_exaone
19
+ from app.core.config import settings
20
+ from app.services.hybrid_retrieval import HybridRetrievalService, HybridRetrievalConfig
21
+ from app.services.ingestion_service import IngestionService
22
+ from app.services.classifier_service import clf
23
+
24
def format_history(history: list[str]) -> str:
    """
    Render a flat message list as a "User:/Assistant:" transcript.

    Messages are assumed to alternate, starting with the user: even
    positions are labelled ``User`` and odd positions ``Assistant``.

    Args:
        history: Messages in chronological order; ``None`` or an empty
            list yields an empty string.

    Returns:
        One line per message, joined with newlines.
    """
    if not history:
        return ""

    return "\n".join(
        f"{'User' if position % 2 == 0 else 'Assistant'}: {message}"
        for position, message in enumerate(history)
    )
30
+
31
+
32
class RAGService:
    """
    Retrieval-augmented generation service built on a Chroma collection.

    Bundles document ingestion, plain vector search (``query``), hybrid
    BM25 + vector search (``hybrid_query`` / ``search_docs``) and the
    retrieval / classifier evaluation helpers.
    """

    def __init__(self,
                 model,
                 collection_name: str = None,
                 persist_directory: str = None,
                 embedding_model = None,
                 k: int = None):
        """
        Args:
            model: LLM used to generate answers (must expose ``invoke``).
            collection_name: Chroma collection name (default: from settings).
            persist_directory: Chroma persistence directory (default: from settings).
            embedding_model: Embedding function handed to Chroma.
            k: Default number of documents to retrieve (default: from settings).
        """
        # Models and retrieval defaults
        self.model = model
        self.embedding_model = embedding_model
        self.collection_name = collection_name or settings.collection_name
        self.k = k or settings.similarity_top_k
        self.persist_directory = persist_directory or settings.persist_directory
        self.evaluation = {}

        # Vector database
        self.db = Chroma(
            collection_name=self.collection_name,
            embedding_function=self.embedding_model,
            persist_directory=self.persist_directory
        )

        self.database = VectorStore(self.db)
        self.text_splitter = TextSplitter()

        self.template = PromptTemplate.from_template(SYSTEM_PROMPT)
        self.retriever = self.db.as_retriever(search_type="similarity", search_kwargs={"k": self.k})

    @staticmethod
    def _response_text(response):
        """Return the text of an LLM response, whether it is a message object or a plain string."""
        return response.content if hasattr(response, "content") else response

    def get_filenames(self):
        """Return the stored ingestion records."""
        ingestion_service = IngestionService()
        return ingestion_service.get_records()

    def ingest_documents(self, filepath: str, chunk_size: int = None, chunk_overlap: int = None):
        """
        Chunk the file at ``filepath``, add the chunks to the vector store
        and record ingestion statistics on the file's metadata.

        Args:
            filepath: Path of the file to ingest.
            chunk_size: Chunk size override (default: from settings).
            chunk_overlap: Chunk overlap override; an explicit ``0`` is honoured
                (default: from settings).

        Returns:
            The list of chunk documents that were added.

        Raises:
            FileNotFoundError: ``filepath`` does not exist.
            HTTPException: (400) the file produced no usable text.
        """
        start = time.time()
        path = Path(filepath)
        if not path.exists():
            raise FileNotFoundError(f"File not found: {filepath}")

        chunk_size = chunk_size or settings.chunk_size
        # ``or`` would silently replace a deliberate zero overlap with the default.
        chunk_overlap = chunk_overlap if chunk_overlap is not None else settings.chunk_overlap
        ingestion_service = IngestionService(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        chunks = ingestion_service.ingest(path)
        print("Chunks: ", chunks)

        if not chunks:
            raise HTTPException(
                status_code=400,
                detail=f"No text content found in '{path.name}'. The file may be image-based or empty."
            )

        if len(chunks) == 1 and chunks[0].page_content.strip() == "":
            raise HTTPException(
                status_code=400,
                detail=f"Document '{path.name}' contains empty or unreadable content."
            )

        size_bytes = path.stat().st_size
        size_mb = (size_bytes / (1024 * 1024))

        self.database.add_documents(chunks)

        # Rough storage estimate per chunk: (768 * 4) bytes plus chunk_size.
        # NOTE(review): 768 * 4 presumably stands for a 768-dim float32
        # embedding — confirm against the embedding model in use.
        dim_bytes_with_chunks = (768 * 4) + chunk_size
        dim_bytes = dim_bytes_with_chunks * len(chunks)
        Estimated_DB_MB = dim_bytes / (1024 * 1024)

        ingestion_service.path_record(file_path=path, metadata={
            "doc_chunks": len(chunks),
            "chunk_size": chunk_size,
            "chunk_overlap": chunk_overlap,
            "execution_time": time.time() - start,
            "file_size": size_mb,
            "db_size": Estimated_DB_MB
        })

        print(f"✅ Added {len(chunks)} chunks from {path.name} to vector store")
        return chunks

    def query(
        self,
        question: str,
        history: List[str] = None,
        k: int = None,
        threshold: float = None,
        include_llm_response: bool = True
    ):
        """
        Vector-similarity search with optional LLM answer generation.

        Args:
            question: User's question
            history: Conversation history (optional)
            k: Number of documents to retrieve (defaults to settings)
            threshold: Similarity threshold filter (defaults to settings)
            include_llm_response: Whether to generate an LLM answer (default: True)

        Returns:
            dict with 'answer', 'references', 'context', 'threshold_used' and
            'k_used'. When ``include_llm_response`` is False both 'answer'
            and 'context' are empty strings and only 'references' is populated.
        """
        if not question or len(question.strip()) == 0:
            return {
                "answer": "Please provide a valid question",
                "references": [],
                "context": ""
            }

        # Fall back to instance / settings defaults
        k = k or self.k or settings.similarity_top_k
        threshold = threshold if threshold is not None else settings.similarity_threshold
        history = history or []

        # Normalize and search
        question = preprocess_query(question)
        docs = self.database.similarity_search_with_score(query=question, k=k)

        # References and context (threshold filtering happens in get_references)
        ctx = get_references(docs, threshold=threshold)
        filtered_docs = ctx.get('documents', [])
        context = ctx.get('context', '')

        if not filtered_docs:
            return {
                "answer": "No relevant documents found matching the similarity threshold",
                "references": [],
                "context": "",
                "threshold_used": threshold
            }

        if include_llm_response:
            prompt = self.template.invoke({
                "history": format_history(history),
                "question": question,
                "context": context
            })
            # Some models return a plain string instead of a message object.
            answer = self._response_text(self.model.invoke(prompt))
        else:
            # Retrieval-only mode: references are returned, answer/context are blanked.
            answer = ""
            context = ""

        return {
            "answer": answer,
            "references": filtered_docs,
            "context": context,
            "threshold_used": threshold,
            "k_used": k
        }

    def hybrid_query(
        self,
        question: str,
        history: List[str] = None,
        k: int = None,
        threshold: float = None,
        include_llm_response: bool = True
    ):
        """
        Hybrid (BM25 + vector) search with optional LLM answer generation.

        Args:
            question: User's question
            history: Conversation history (optional)
            k: Number of documents to return after fusion (defaults to settings)
            threshold: Score threshold filter (defaults to settings)
            include_llm_response: Whether to generate an LLM answer (default: True)

        Returns:
            dict with 'answer', 'references', 'context', 'threshold_used' and
            'k_used' (the number of results actually requested after applying
            defaults). When ``include_llm_response`` is False both 'answer'
            and 'context' are empty strings.
        """
        if not question or len(question.strip()) == 0:
            return {
                "answer": "Please provide a valid question",
                "references": [],
                "context": ""
            }

        threshold = threshold if threshold is not None else settings.similarity_threshold
        history = history or []

        query = question
        candidate_k = 15  # how many docs vector search fetches (also the BM25 pool size)
        final_k = k or settings.similarity_top_k  # how many results to return after fusion

        config = HybridRetrievalConfig(
            candidate_k=candidate_k,
            top_k=final_k,
            bm25_weight=0.45,
            vector_weight=0.55,
            rrf_k=20,
            bm25_k1=1.2,
            bm25_b=0.9,
            title_boost_per_word=0.004,
            score_threshold=threshold,
        )

        service = HybridRetrievalService(vector_db=self.db, config=config)
        docs = service.retrieve(query=query)

        ctx = get_references_v2(docs, threshold=threshold)
        filtered_docs = ctx.get('documents', [])
        context = ctx.get('context', 'No context available')

        print("*"*50)
        print("context: ", context)
        print("*"*50)

        if not filtered_docs:
            return {
                "answer": "No relevant documents found matching the similarity threshold",
                "references": [],
                "context": "",
                "threshold_used": threshold
            }

        if include_llm_response:
            prompt = self.template.invoke({
                "history": format_history(history),
                "question": question,
                "context": context
            })

            # if settings.local_model_name == "EXAONE-3.5-2.4B-Instruct-Q4_K_M.gguf":
            #     prompt = wrap_exaone(prompt)

            answer = self._response_text(self.model.invoke(prompt))
        else:
            # Retrieval-only mode: references are returned, answer/context are blanked.
            answer = ""
            context = ""

        return {
            "answer": answer,
            "references": filtered_docs,
            "context": context,
            "threshold_used": threshold,
            # Report the value that was really used, not the raw (possibly None) argument.
            "k_used": final_k
        }

    def search_docs(
        self,
        question: str,
        k: int = 10,
        filename: str = None
    ):
        """
        Keyword-weighted hybrid search returning raw chunks (no LLM call).

        Args:
            question: Search text
            k: Number of results to return after fusion (defaults to settings when falsy)
            filename: When given, keep only chunks whose ``source_file``
                metadata equals this value (applied after the top-k cut)

        Returns:
            List of dicts with 'id', 'content', 'metadata' and 'score'.
            For a blank ``question`` the legacy error dict
            ('answer', 'references', 'context') is returned instead.
        """
        if not question or len(question.strip()) == 0:
            return {
                "answer": "Please provide a valid question",
                "references": [],
                "context": ""
            }

        threshold = settings.similarity_threshold

        query = question
        candidate_k = 15  # how many docs vector search fetches (also the BM25 pool size)
        final_k = k or settings.similarity_top_k  # how many results to return after fusion

        config = HybridRetrievalConfig(
            candidate_k=candidate_k,
            top_k=final_k,
            bm25_weight=0.7,
            vector_weight=0.3,
            rrf_k=20,
            bm25_k1=1.5,
            bm25_b=0.75,
            title_boost_per_word=0.004,
            score_threshold=threshold,
        )

        service = HybridRetrievalService(vector_db=self.db, config=config)
        docs = service.retrieve(query=query)
        results = [
            {
                "id": doc.document.id,
                "content": doc.document.page_content,
                "metadata": doc.document.metadata,
                "score": doc.fused_score
            }
            for doc in docs
        ]

        if filename:
            # ``.get`` so chunks without a ``source_file`` entry are filtered
            # out instead of raising KeyError.
            results = [
                doc for doc in results
                if (doc["metadata"] or {}).get("source_file") == filename
            ]

        return results

    def test_queries(self, tests: TestRequestSchema, query_delay: float = 1.0):
        """
        Evaluate hybrid retrieval against labelled (question, document, chunk) cases.

        query_delay: seconds to wait between queries.
        Gemini free tier allows 100 embedding RPM → safe delay = 1.0s.
        For 150 queries: ~2.5 min total.

        Returns:
            dict with the per-test 'results' and the averaged metrics. All
            averages are 0.0 when ``tests.tests`` is empty.
        """
        results = []
        k = tests.k
        threshold = tests.threshold
        total = len(tests.tests)

        for idx, test in enumerate(tests.tests):
            document = test.document
            chunk_index = test.chunk_index

            response = self.hybrid_query(
                question=test.question,
                history=[],
                k=k,
                threshold=threshold,
                include_llm_response=False,
            )

            # Respect Gemini embedding rate limit (100 RPM free tier)
            if query_delay > 0 and idx < total - 1:
                print(f"[test_queries] {idx + 1}/{total} done — sleeping {query_delay}s")
                time.sleep(query_delay)

            references = response.get("references", [])
            len_all_docs = len(references)
            correct_source_chunks = 0
            rank = None  # 1-based position of the first exact (source, chunk) hit

            for position, ref in enumerate(references, start=1):
                if ref.get("source") != document:
                    continue
                correct_source_chunks += 1
                if rank is None and ref.get("chunk_index") == chunk_index:
                    rank = position

            wrong_source_chunks = len_all_docs - correct_source_chunks
            if len_all_docs > 0:
                doc_precision = correct_source_chunks / len_all_docs
                doc_noise = wrong_source_chunks / len_all_docs
            else:
                doc_precision = 0
                doc_noise = 0
            doc_recall = 1 if correct_source_chunks > 0 else 0

            results.append({
                "tests": test,
                "answer": rank is not None,
                "correct_source_chunks": correct_source_chunks,
                "wrong_source_chunks": wrong_source_chunks,
                "doc_precision": doc_precision,
                "doc_recall": doc_recall,
                "doc_error": 1 - doc_recall,
                "mrr": 1 / rank if rank is not None else 0,
                "top_1_hit": 1 if rank == 1 else 0,
                "doc_noise": doc_noise,
            })

        def average(field):
            # Guard against an empty test set (would otherwise divide by zero).
            if not results:
                return 0.0
            return sum(r[field] for r in results) / len(results)

        hit_rate = (
            sum(1 for r in results if r["answer"]) / len(results)
            if results else 0.0
        )

        return {
            "results": results,
            "avg_doc_precision": average("doc_precision"),
            "avg_doc_recall": average("doc_recall"),
            "avg_mrr": average("mrr"),
            "hit_rate": hit_rate,
            "top_1_hit_rate": average("top_1_hit"),
            "avg_doc_noise": average("doc_noise"),
            "error_rate": 1 - hit_rate if results else 0.0,
            "avg_doc_error": average("doc_error")
        }

    def test_classifier(self, tests: TestClassifierReqSchema):
        """
        Evaluate the query classifier on labelled questions.

        For each of 'type', 'category', 'topic' and 'intent', missing labels
        and predictions are treated as "general" before computing accuracy,
        macro precision/recall/F1, weighted F1 and the full report.

        Returns:
            dict with 'evaluation' (metrics per field) and 'results'
            (raw classifier predictions).
        """
        queries = [test.question for test in tests.tests]
        result = clf.predict(queries)

        fields = ["type", "category", "topic", "intent"]
        evaluation = {}

        for field in fields:
            y_true = [getattr(t, field) if getattr(t, field) else "general" for t in tests.tests]
            y_pred = [r[field] if r[field] else "general" for r in result]

            evaluation[field] = {
                "accuracy": accuracy_score(y_true, y_pred),
                "precision": precision_score(y_true, y_pred, average="macro", zero_division=0),
                "recall": recall_score(y_true, y_pred, average="macro", zero_division=0),
                "f1_macro": f1_score(y_true, y_pred, average="macro", zero_division=0),
                "f1_weighted": f1_score(y_true, y_pred, average="weighted", zero_division=0),
                "classification_report": classification_report(y_true, y_pred, zero_division=0, output_dict=True)
            }

        return {
            "evaluation": evaluation,
            "results": result
        }

    def delete_database(self):
        """Delete the underlying Chroma collection."""
        self.database.db.delete_collection()

    def model_close(self):
        """Close the model's client, if it exposes one with ``close`` or ``aclose``."""
        client = getattr(self.model, "client", None)
        if not client:
            return

        if hasattr(client, "close"):
            client.close()
        elif hasattr(client, "aclose"):
            import asyncio
            asyncio.run(client.aclose())

    def __enter__(self):
        """Allow ``with RAGService(...) as service:``; pairs with ``__exit__``."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Close the model client when leaving the ``with`` block."""
        self.model_close()
app/services/text_splitter.py ADDED
@@ -0,0 +1,266 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
2
+ from langchain_core.documents import Document
3
+ from typing import List, Optional, Literal
4
+ from app.core.config import settings
5
+
6
class TextSplitter:
    """
    Thin wrapper around ``RecursiveCharacterTextSplitter``.

    Keeps the splitter configuration (chunk size, overlap, separators) on
    the instance so it can be inspected and changed later through
    ``update_settings``; defaults come from the application settings.
    """

    def __init__(
        self,
        chunk_size: int = None,
        chunk_overlap: int = None,
        length_function: callable = len,
        is_separator_regex: bool = False,
        separators: Optional[List[str]] = None,
        keep_separator: bool = True
    ):
        """
        Initialize the TextSplitter with configurable parameters.

        Args:
            chunk_size: Maximum size of chunks to return (default: from settings)
            chunk_overlap: Overlap in characters between chunks (default: from settings)
            length_function: Function to measure chunk length (default: len)
            is_separator_regex: Whether separators are regex patterns (default: False)
            separators: Separators to split on, in priority order
                (default: paragraph, line, space, character)
            keep_separator: Whether to keep separators in chunks (default: True)
        """
        self.chunk_size = chunk_size if chunk_size is not None else settings.chunk_size
        self.chunk_overlap = chunk_overlap if chunk_overlap is not None else settings.chunk_overlap
        self.length_function = length_function
        self.is_separator_regex = is_separator_regex
        self.keep_separator = keep_separator

        if separators is None:
            separators = [
                "\n\n",  # Double newline (paragraphs)
                "\n",    # Single newline
                " ",     # Space
                ""       # Character-level split as last resort
            ]
        self.separators = separators

        self._initialize_splitter()

    def _initialize_splitter(self):
        """(Re)build the underlying splitter from the instance's current settings."""
        self.splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
            length_function=self.length_function,
            is_separator_regex=self.is_separator_regex,
            separators=self.separators,
            keep_separator=self.keep_separator
        )

    def split_documents(self, documents: List[Document]) -> List[Document]:
        """
        Split a list of documents into smaller chunks.

        Args:
            documents: List of Document objects to split

        Returns:
            List of Document objects representing the chunks
        """
        return self.splitter.split_documents(documents)

    def split_text(self, text: str) -> List[str]:
        """
        Split a single text string into smaller chunks.

        Args:
            text: Text string to split

        Returns:
            List of text chunks
        """
        return self.splitter.split_text(text)

    def create_document(
        self,
        text: str,
        metadata: dict
    ):
        """Wrap ``text`` and ``metadata`` in a single, unsplit Document."""
        return Document(page_content=text, metadata=metadata)

    def create_documents(
        self,
        texts: List[str],
        metadatas: Optional[List[dict]] = None
    ) -> List[Document]:
        """
        Create Document objects from texts and split them into chunks.

        Args:
            texts: List of text strings to convert to documents
            metadatas: Optional list of metadata dictionaries for each text

        Returns:
            List of Document objects representing the chunks
        """
        return self.splitter.create_documents(texts, metadatas)

    def update_settings(
        self,
        chunk_size: Optional[int] = None,
        chunk_overlap: Optional[int] = None,
        separators: Optional[List[str]] = None
    ):
        """
        Update splitter settings and rebuild the underlying splitter.

        Arguments left as ``None`` keep their current value.

        Args:
            chunk_size: New chunk size (optional)
            chunk_overlap: New chunk overlap (optional)
            separators: New separators list (optional)
        """
        if chunk_size is not None:
            self.chunk_size = chunk_size
        if chunk_overlap is not None:
            self.chunk_overlap = chunk_overlap
        if separators is not None:
            self.separators = separators

        self._initialize_splitter()

    @classmethod
    def from_language(
        cls,
        language: Literal[
            "cpp", "go", "java", "kotlin", "js", "ts", "php", "proto",
            "python", "rst", "ruby", "rust", "scala", "swift", "markdown",
            "latex", "html", "sol", "csharp", "cobol", "c", "lua", "perl"
        ],
        chunk_size: int = None,
        chunk_overlap: int = None
    ) -> 'TextSplitter':
        """
        Create a TextSplitter optimized for a specific programming language or format.

        The language-specific separators are stored on the instance (as regex
        separators), so a later ``update_settings`` call keeps them instead of
        falling back to the generic defaults.

        Args:
            language: Programming language or format type
            chunk_size: Maximum size of chunks to return (default: from settings)
            chunk_overlap: Overlap in characters between chunks (default: from settings)

        Returns:
            TextSplitter instance configured for the specified language
        """
        # Same construction RecursiveCharacterTextSplitter.from_language performs:
        # language separators, interpreted as regular expressions.
        language_separators = RecursiveCharacterTextSplitter.get_separators_for_language(language)

        return cls(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separators=language_separators,
            is_separator_regex=True
        )

    @classmethod
    def for_markdown(cls, chunk_size: int = None, chunk_overlap: int = None) -> 'TextSplitter':
        """
        Create a TextSplitter optimized for Markdown documents.

        Args:
            chunk_size: Maximum size of chunks to return (default: from settings)
            chunk_overlap: Overlap in characters between chunks (default: from settings)

        Returns:
            TextSplitter instance configured for Markdown
        """
        return cls.from_language("markdown", chunk_size, chunk_overlap)

    @classmethod
    def for_code(
        cls,
        language: str = "python",
        chunk_size: int = None,
        chunk_overlap: int = None
    ) -> 'TextSplitter':
        """
        Create a TextSplitter optimized for code documents.

        Args:
            language: Programming language (default: "python")
            chunk_size: Maximum size of chunks to return (default: from settings)
            chunk_overlap: Overlap in characters between chunks (default: from settings)

        Returns:
            TextSplitter instance configured for code
        """
        return cls.from_language(language, chunk_size, chunk_overlap)

    @classmethod
    def for_markdown_with_sections(
        cls,
        chunk_size: int = None,
        chunk_overlap: int = None
    ) -> 'TextSplitter':
        """
        Create a TextSplitter for Markdown that uses '---' as section delimiter.

        Sections are kept together where possible; text is never split on
        headers or line breaks, which reduces the number of small chunks.
        Oversized sections fall back to sentence and then word boundaries.

        Args:
            chunk_size: Maximum size of chunks to return (default: from settings)
            chunk_overlap: Overlap in characters between chunks (default: from settings)

        Returns:
            TextSplitter instance with custom separators for sectioned markdown
        """
        # Priority: sections -> sentences -> words.
        # NOTE(review): there is no "" (character-level) fallback here, unlike
        # the default separator list; confirm that omission is intended.
        section_separators = [
            "---",  # Section delimiter
            ". ",   # Sentences
            " ",    # Words
        ]

        return cls(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separators=section_separators,
            keep_separator=True  # Keep separators to maintain structure
        )

    def get_chunk_info(self, documents: List[Document]) -> dict:
        """
        Split ``documents`` and report statistics about the resulting chunks.

        Args:
            documents: List of documents to analyze

        Returns:
            Dictionary with document/chunk counts, average/min/max chunk
            size (0 when there are no chunks) and the configured settings
        """
        chunks = self.split_documents(documents)
        chunk_sizes = [len(chunk.page_content) for chunk in chunks]

        return {
            "total_documents": len(documents),
            "total_chunks": len(chunks),
            "average_chunk_size": sum(chunk_sizes) / len(chunk_sizes) if chunk_sizes else 0,
            "min_chunk_size": min(chunk_sizes) if chunk_sizes else 0,
            "max_chunk_size": max(chunk_sizes) if chunk_sizes else 0,
            "configured_chunk_size": self.chunk_size,
            "configured_overlap": self.chunk_overlap
        }


text_splitter = TextSplitter()
app/services/vector_store.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_core.documents import Document
2
+ from typing import List, Optional
3
+ import json
4
+
5
class VectorStore:
    """
    Small convenience wrapper around a LangChain vector store (``db``).

    All optional arguments are forwarded to ``db`` by keyword so they can
    never be bound to the wrong parameter of the underlying store.
    """

    def __init__(self, db):
        """
        Args:
            db: Vector store exposing ``get``, ``similarity_search``,
                ``similarity_search_with_score``, ``add_documents`` and ``delete``.
        """
        self.db = db

    def get(self):
        """Return everything stored in the collection."""
        return self.db.get()

    def get_by_id(self, ids: list[str]):
        """Return the stored entries with the given ``ids``."""
        return self.db.get(ids=ids)

    def get_dict(self):
        """
        Return the whole collection as a JSON string.

        Returns:
            JSON array of ``{"id", "document", "metadata"}`` objects.
        """
        data = self.db.get()

        rows = [
            {
                "id": id_,
                "document": doc,
                "metadata": meta,
            }
            for id_, doc, meta in zip(
                data["ids"],
                data["documents"],
                data["metadatas"],
            )
        ]
        return json.dumps(rows)

    def similarity_search(self, query: str, filter: Optional[dict[str, str]] = None, k: Optional[int] = 5):
        """
        Return the ``k`` documents most similar to ``query``.

        Args:
            query: Search text.
            filter: Optional metadata filter.
            k: Number of documents to return.
        """
        # The store's signature is (query, k, filter); passing by keyword
        # keeps ``filter`` from being taken as ``k``.
        if filter:
            return self.db.similarity_search(query, k=k, filter=filter)
        return self.db.similarity_search(query, k=k)

    def similarity_search_with_score(self, query: str, filter: Optional[dict[str, str]] = None, k: Optional[int] = 5):
        """
        Like ``similarity_search`` but returns ``(document, score)`` pairs.

        Args:
            query: Search text.
            filter: Optional metadata filter.
            k: Number of documents to return.
        """
        if filter:
            return self.db.similarity_search_with_score(query, k=k, filter=filter)
        return self.db.similarity_search_with_score(query, k=k)

    def add_documents(self, docs: List[Document], ids: Optional[List] = None):
        """
        Add the non-blank documents in ``docs`` to the store.

        Args:
            docs: Documents to add; entries whose content is blank are skipped.
            ids: Optional ids, one per entry of ``docs``. Ids belonging to
                skipped documents are dropped so the rest stay aligned.

        Returns:
            Whatever the store's ``add_documents`` returns, or ``[]`` when
            nothing is left to add.

        Raises:
            ValueError: ``ids`` is given but its length differs from ``docs``.
        """
        if ids is None:
            kept_docs = [doc for doc in docs if doc.page_content.strip()]
            if not kept_docs:
                return []
            return self.db.add_documents(kept_docs)

        if len(ids) != len(docs):
            raise ValueError("ids and docs must have the same length")

        # Filter documents and ids together so each id stays with its document.
        kept_pairs = [
            (doc, doc_id)
            for doc, doc_id in zip(docs, ids)
            if doc.page_content.strip()
        ]
        if not kept_pairs:
            return []

        kept_docs = [doc for doc, _ in kept_pairs]
        kept_ids = [doc_id for _, doc_id in kept_pairs]
        # ``ids`` is keyword-only on the store's add_documents.
        return self.db.add_documents(kept_docs, ids=kept_ids)

    def update_document(self, document_id: str, document: Document):
        """Replace a stored document: delete it, then re-add it so it is re-embedded."""
        self.db.delete(ids=[document_id])
        return self.db.add_documents([document], ids=[document_id])

    def delete(self, ids: List):
        """Delete the entries with the given ``ids``; always returns True."""
        self.db.delete(ids=ids)
        return True
+ return True
app/utils/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ # from .preprocessing import preprocess, normalize, preprocess_documents, preprocess_query
2
+ # from .constants import stopwords
3
+ # from .document_helpers import get_references, create_document, create_documents, build_metadata, clean_metadata,load_json, read_json_file, get_references_v2
4
+ # from .llm_models import load_model
5
+ # from .model_factory import get_embedding_model, get_llm_model, get_local_model, get_gemini_model
app/utils/constants.py ADDED
@@ -0,0 +1,264 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Abbreviation -> expanded phrase. Keys are case-sensitive as written here
# (mostly upper-case, plus "dept" and "MSc"), so a lookup must use the
# abbreviation in exactly this casing.
short_words_mappings = {
    "IT": "Information Technology",
    "BT": "Biotechnology",
    "ECE": "Electronics and Communication Engineering",
    "CE": "Computer Engineering",
    "dept": "Department",
    "ICT": "Information and Communication Technology",
    "DS": "Data Science",
    "CS": "Computer Science",
    "CSE": "Computer Science and Engineering",
    "MCA": "Master of Computer Application",
    "MSc": "Master of Science",
}
14
+
15
# Stop-word list stored as ONE newline-separated string, not a collection.
# A plain `word in stopwords` test is therefore a substring match; split the
# string (e.g. `stopwords.split()`) before doing whole-word membership tests.
stopwords = """
a
an
the
but
if
then
else
because
so
of
to
from
in
on
at
by
for
with
about
into
over
under
between
after
before
during
through
above
below
up
down
out
off
again
further
once
only
some
any
each
few
more
most
other
such
very
"""
63
+
64
+
65
+ # stopwords = """
66
+ # a
67
+ # about
68
+ # above
69
+ # after
70
+ # again
71
+ # against
72
+ # ain
73
+ # all
74
+ # am
75
+ # an
76
+ # and
77
+ # any
78
+ # are
79
+ # aren
80
+ # aren't
81
+ # as
82
+ # at
83
+ # be
84
+ # because
85
+ # been
86
+ # before
87
+ # being
88
+ # below
89
+ # between
90
+ # both
91
+ # but
92
+ # by
93
+ # can
94
+ # couldn
95
+ # couldn't
96
+ # d
97
+ # did
98
+ # didn
99
+ # didn't
100
+ # do
101
+ # does
102
+ # doesn
103
+ # doesn't
104
+ # doing
105
+ # don
106
+ # don't
107
+ # down
108
+ # during
109
+ # each
110
+ # few
111
+ # for
112
+ # from
113
+ # further
114
+ # had
115
+ # hadn
116
+ # hadn't
117
+ # has
118
+ # hasn
119
+ # hasn't
120
+ # have
121
+ # haven
122
+ # haven't
123
+ # having
124
+ # he
125
+ # he'd
126
+ # he'll
127
+ # he's
128
+ # her
129
+ # here
130
+ # hers
131
+ # herself
132
+ # him
133
+ # himself
134
+ # his
135
+ # how
136
+ # i
137
+ # i'd
138
+ # i'll
139
+ # i'm
140
+ # i've
141
+ # if
142
+ # in
143
+ # into
144
+ # is
145
+ # isn
146
+ # isn't
147
+ # it
148
+ # it'd
149
+ # it'll
150
+ # it's
151
+ # its
152
+ # itself
153
+ # just
154
+ # ll
155
+ # m
156
+ # ma
157
+ # me
158
+ # mightn
159
+ # mightn't
160
+ # more
161
+ # most
162
+ # mustn
163
+ # mustn't
164
+ # my
165
+ # myself
166
+ # needn
167
+ # needn't
168
+ # no
169
+ # nor
170
+ # not
171
+ # now
172
+ # o
173
+ # of
174
+ # off
175
+ # on
176
+ # once
177
+ # only
178
+ # or
179
+ # other
180
+ # our
181
+ # ours
182
+ # ourselves
183
+ # out
184
+ # over
185
+ # own
186
+ # re
187
+ # s
188
+ # same
189
+ # shan
190
+ # shan't
191
+ # she
192
+ # she'd
193
+ # she'll
194
+ # she's
195
+ # should
196
+ # should've
197
+ # shouldn
198
+ # shouldn't
199
+ # so
200
+ # some
201
+ # such
202
+ # t
203
+ # than
204
+ # that
205
+ # that'll
206
+ # the
207
+ # their
208
+ # theirs
209
+ # them
210
+ # themselves
211
+ # then
212
+ # there
213
+ # these
214
+ # they
215
+ # they'd
216
+ # they'll
217
+ # they're
218
+ # they've
219
+ # this
220
+ # those
221
+ # through
222
+ # to
223
+ # too
224
+ # under
225
+ # until
226
+ # up
227
+ # ve
228
+ # very
229
+ # was
230
+ # wasn
231
+ # wasn't
232
+ # we
233
+ # we'd
234
+ # we'll
235
+ # we're
236
+ # we've
237
+ # were
238
+ # weren
239
+ # weren't
240
+ # what
241
+ # when
242
+ # where
243
+ # which
244
+ # while
245
+ # who
246
+ # whom
247
+ # why
248
+ # will
249
+ # with
250
+ # won
251
+ # won't
252
+ # wouldn
253
+ # wouldn't
254
+ # y
255
+ # you
256
+ # you'd
257
+ # you'll
258
+ # you're
259
+ # you've
260
+ # your
261
+ # yours
262
+ # yourself
263
+ # yourselves
264
+ # """
app/utils/document_helpers.py ADDED
@@ -0,0 +1,231 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_core.documents import Document
2
+ from pathlib import Path
3
+ from typing import Optional, List
4
+ from datetime import datetime, date
5
+ import uuid
6
+ import yaml
7
+ from app.services.text_splitter import TextSplitter
8
+ import json
9
+
10
# Value types that clean_metadata() keeps unchanged. datetime/date values are
# converted to ISO strings before this check; anything else not listed here
# is converted with str().
ALLOWED = (str, int, float, bool, list, type(None))
12
+
13
def get_references_v2(docs, threshold: float):
    """Build reference entries and an LLM context string from fused results.

    Args:
        docs: Iterable of result objects exposing ``.document`` (with
            ``page_content`` and ``metadata``) and ``.fused_score``.
        threshold: Results whose ``fused_score`` is below this are skipped.

    Returns:
        dict: ``{"documents": [...], "context": str}`` where each document is
        a dict with ``title``, ``chunk_index``, ``source``, ``page_content``
        and ``similarity``, and ``context`` concatenates one line per kept
        document.
    """
    references = []
    context_parts = []

    for item in docs:
        source_doc = item.document
        score = item.fused_score
        if score < threshold:
            continue

        meta = source_doc.metadata

        # Title falls back through "title" -> "name" -> "topic".
        title = "untitled"
        for key in ("title", "name", "topic"):
            if key in meta:
                title = meta[key]
                break

        # Source prefers the originating file name over a generic source.
        if "source_file" in meta:
            source = meta["source_file"]
        else:
            source = meta.get("source", "untitled")

        entry = {
            "title": title,
            "chunk_index": meta.get("chunk_index"),
            "source": source,
            "page_content": source_doc.page_content,
            "similarity": score
        }
        context_parts.append(
            f"""{entry['title']} page_content: {entry['page_content']}, from source: {entry['source']}.\n\n"""
        )
        references.append(entry)

    return {
        "documents": references,
        "context": "".join(context_parts)
    }
37
+
38
def get_references(docs, threshold: float):
    """Build reference entries and an LLM context string from scored results.

    Args:
        docs: Iterable of ``(document, score)`` pairs; ``document`` exposes
            ``page_content`` and ``metadata``.
        threshold: Pairs whose similarity (``1 - score``) is below this are
            skipped.

    Returns:
        dict: ``{"documents": [...], "context": str}``.
    """
    results = []
    context = ""
    for doc in docs:
        _doc = doc[0]
        # NOTE(review): treats the raw score as a distance and converts it to
        # a similarity; presumably the store returns distances in [0, 1] -
        # confirm against the retriever in use.
        _similarity = 1 - doc[1]
        if _similarity < threshold:
            continue
        metadata = _doc.metadata
        document = {
            # Title falls back through "title" -> "name" -> "topic".
            "title": metadata.get("title", metadata.get("name", metadata.get("topic", "untitled"))),
            "chunk_index": metadata.get("chunk_index"),
            "source": metadata.get("source_file", metadata.get("source", "untitled")),
            "page_content": _doc.page_content,
            "similarity": _similarity
        }
        ctx = f"""
        page_content: {document['page_content']}, from source: {document['source']}.
        """
        context += ctx
        results.append(document)
    return {
        "documents": results,
        "context": context
    }
63
+
64
def create_documents(
    chunks: List[str],
    filePath: Optional[Path] = None,
    built_in_metadata: Optional[dict] = None,
    title: Optional[str] = None
) -> List[Document]:
    """
    Create Document objects from text chunks with standard metadata (UUIDs, timestamps, indices).
    Works for both files (filePath provided) and raw text (filePath=None).

    Args:
        chunks: Text chunks, one Document is produced per chunk.
        filePath: Optional originating file. When it exists on disk its
            timestamps, name and stem provide the dates, source and title.
        built_in_metadata: Optional extra metadata merged over the generated
            fields (so it can override ``source``, ``title``, ``doc_id``...).
            ``None`` is treated as empty; the caller's dict is not modified.
        title: Optional explicit title; overrides the derived one.

    Returns:
        One Document per chunk, in order, with ``chunk_index`` always set to
        the chunk's position.
    """
    # Copy so neither the caller's dict nor a shared default is ever touched.
    extra_metadata = dict(built_in_metadata) if built_in_metadata else {}

    if filePath and filePath.exists():
        file_stat = filePath.stat()  # single stat call for both timestamps
        created_date = datetime.fromtimestamp(file_stat.st_ctime).isoformat()
        modified_date = datetime.fromtimestamp(file_stat.st_mtime).isoformat()
        source = filePath.name
        given_title = title or filePath.stem
    else:
        now = datetime.now().isoformat()
        created_date = now
        modified_date = now
        # Use existing source from metadata if available, else the file name
        # (if any), else empty.
        source = extra_metadata.get("source", "")
        if not source and filePath:
            source = filePath.name
        given_title = title or extra_metadata.get("title", "Untitled")

    docs = []
    for i, chunk in enumerate(chunks):
        metadata = {
            "doc_id": str(uuid.uuid4()),  # unique chunk id
            "source": source,
            "title": given_title,
            "created_date": created_date,
            "modified_date": modified_date,
            "chunk_index": i,
        }
        # Supplied metadata takes precedence over the generated fields...
        metadata.update(extra_metadata)
        # ...except the chunk position, which must always reflect this loop.
        metadata["chunk_index"] = i

        docs.append(Document(page_content=chunk, metadata=metadata))
    return docs
112
+
113
+
114
def create_document(
    text: str,
    metadata: dict
):
    """Wrap ``text`` and ``metadata`` in a single ``Document``.

    No splitting or metadata enrichment is performed; both arguments are
    passed to the ``Document`` constructor as given.
    """
    document = Document(page_content=text, metadata=metadata)
    return document
119
+
120
+
121
def clean_metadata(metadata: dict):
    """Return a copy of ``metadata`` whose values are safe to store.

    ``datetime``/``date`` values become ISO-8601 strings, values whose type
    is listed in ``ALLOWED`` are kept unchanged, and everything else is
    converted with ``str()``. Keys are preserved as-is.
    """
    def _storable(value):
        # Dates are checked first so they are ISO-formatted.
        if isinstance(value, (datetime, date)):
            return value.isoformat()
        return value if isinstance(value, ALLOWED) else str(value)

    return {key: _storable(value) for key, value in metadata.items()}
131
+
132
def read_text_file(filePath: Path):
    """Read the file at ``filePath`` as UTF-8 text and return its contents."""
    return Path(filePath).read_text(encoding="utf-8")
136
+
137
def read_json_file(filePath: Path):
    """Parse the JSON file at ``filePath`` and return the decoded object.

    The file is read as UTF-8 explicitly so non-ASCII content decodes the
    same way on every platform, instead of using the locale's default
    encoding.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    with open(filePath, "r", encoding="utf-8") as file:
        return json.load(file)
141
+
142
def build_metadata(filePath: Optional[Path] = None, content: Optional[str] = None):
    """Split optional YAML front matter from a document's text.

    Front matter is only recognised when the text *starts* with a ``---``
    line-block (``---`` / YAML mapping / ``---``). Text that merely contains
    ``---`` separators further down (e.g. markdown section delimiters) is
    returned unchanged as content, so nothing before the first separator is
    lost and non-mapping YAML never reaches ``clean_metadata``.

    Args:
        filePath: Optional file to read; when given, its text replaces
            ``content`` and its name is always used as ``source``.
        content: Raw text, used when ``filePath`` is not given.

    Returns:
        dict: ``{"metadata": dict, "content": str}`` with the content
        stripped of surrounding whitespace.

    Raises:
        ValueError: If neither ``filePath`` nor ``content`` is provided.
    """
    if filePath:
        content = read_text_file(filePath)
    if content is None:
        raise ValueError("build_metadata requires either filePath or content")

    frontmatter = None
    body = content

    leading = content.lstrip()
    if leading.startswith("---"):
        parts = leading.split("---", 2)
        if len(parts) == 3:
            try:
                parsed = yaml.safe_load(parts[1])
            except yaml.YAMLError:
                # Not valid YAML: treat the whole text as plain content.
                parsed = False
            if parsed is None:
                parsed = {}  # empty front matter block
            if isinstance(parsed, dict):
                frontmatter = clean_metadata(parsed)
                body = parts[2]

    if frontmatter is not None:
        # add file name as source always
        if filePath:
            frontmatter["source"] = filePath.name
        elif "source" not in frontmatter:
            frontmatter["source"] = ""

        return {
            "metadata": frontmatter,
            "content": body.strip()
        }

    # Don't enforce empty source if not provided, allows external metadata to stick
    meta = {}
    if filePath:
        meta["source"] = filePath.name

    return {
        "metadata": meta,
        "content": content.strip()
    }
173
+
174
def create_documents_from_text(text: str, metadata: Optional[dict] = None):
    """
    Create documents from raw text with automatic splitting and metadata enrichment.

    Args:
        text: Raw text, optionally starting with YAML front matter.
        metadata: Optional caller-supplied metadata. Every key except
            ``source`` overrides the front matter; ``source`` is only used
            when the front matter does not provide a non-empty one.

    Returns:
        The list of Documents produced by ``create_documents``.
    """
    metadata = metadata or {}
    data = build_metadata(content=text.strip())

    # 1. Smart Metadata Merge
    final_metadata = dict(data["metadata"])

    # Front-matter source wins; otherwise fall back to the supplied one.
    # ``not ...get("source")`` covers both a missing key (no front matter)
    # and an empty string (front matter without a source).
    if not final_metadata.get("source") and metadata.get("source"):
        final_metadata["source"] = metadata["source"]

    # Merge regular keys
    final_metadata.update({k: v for k, v in metadata.items() if k != "source"})

    text = data["content"]

    # 2. Split text into chunks (strings)
    # Use section-aware splitter if text contains markdown section delimiters
    if "\n---\n" in text or text.startswith("---\n"):
        splitter = TextSplitter.for_markdown_with_sections()
    else:
        splitter = TextSplitter()
    chunks = splitter.split_text(text)

    # 3. Create documents using standard helper (adds IDs, indices, dates)
    return create_documents(
        chunks=chunks,
        filePath=None,
        built_in_metadata=final_metadata
    )
207
+
208
def load_json(filePath: Path):
    """Load a structured JSON file and turn its sections into Documents.

    The JSON object must contain ``id``, ``source`` and a ``content``
    mapping of ``topic -> text``. Each topic's text is split into chunks;
    every non-blank chunk becomes a Document whose text is prefixed with
    ``"<topic>: "`` and whose metadata carries the file-level fields plus
    ``topic`` and ``chunk_index``.
    """
    data = read_json_file(filePath=filePath)
    filePath = Path(filePath)
    file_name = filePath.name

    base_metadata = {
        "id": data["id"],
        "title": data.get("name", data.get("title", "Untitled")),
        "source": data["source"],
        "source_file": file_name or "Untitled",
        "created_date": datetime.now().isoformat()
    }

    splitter = TextSplitter()
    documents = []
    for topic, raw_text in data["content"].items():
        pieces = splitter.split_text(raw_text.strip())
        for position, piece in enumerate(pieces):
            trimmed = piece.strip()
            if trimmed == "":
                # Blank chunks are skipped but still consume an index.
                continue
            documents.append(
                Document(
                    page_content=f"{topic}: {trimmed}",
                    metadata={**base_metadata, "topic": topic, "chunk_index": position},
                )
            )
    return documents
app/utils/embeddings.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_huggingface import HuggingFaceEmbeddings
2
+ from langchain_google_genai import GoogleGenerativeAIEmbeddings
3
+ from dotenv import load_dotenv
4
+
5
+ load_dotenv()
6
+
7
def get_embedding_model():
    """Return a Google Generative AI embeddings client.

    The client is created for the ``models/gemini-embedding-001`` model with
    otherwise default settings.
    """
    return GoogleGenerativeAIEmbeddings(model="models/gemini-embedding-001")
app/utils/llm_models.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from langchain_community.llms import LlamaCpp
3
+ from langchain_community.chat_models import ChatLlamaCpp
4
+ from app.core.config import settings
5
# Full path to the local model file, resolved once when the module is imported.
model_file = Path(settings.model_path) / settings.local_model_name


def load_model():
    """Create a ``ChatLlamaCpp`` chat model for the local model file.

    Sampling and runtime options are fixed here; only the model location
    comes from ``settings``.
    """
    llama_options = {
        "model_path": str(model_file),  # Direct path
        "n_ctx": 8192,
        "n_batch": 512,
        "n_threads": 4,
        "temperature": 0.05,
        "top_p": 0.8,
        "top_k": 20,
        "repeat_penalty": 1.1,
        "f16_kv": True,
        "verbose": False,
    }
    return ChatLlamaCpp(**llama_options)
20
+
app/utils/model_factory.py ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Model factory for creating LLM and embedding models.
3
+ Handles model switching and fallback logic.
4
+ """
5
+ from typing import Optional
6
+ from pathlib import Path
7
+ from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
8
+ from langchain_community.chat_models import ChatLlamaCpp
9
+ from app.core.config import settings
10
+ import logging
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
def get_embedding_model():
    """
    Build the embedding model configured in ``settings``.

    Uses ``settings.embedding_model_name`` and ``settings.google_api_key``.
    Any failure is logged and re-raised unchanged.

    Returns:
        GoogleGenerativeAIEmbeddings: Embedding model instance
    """
    try:
        embedding_client = GoogleGenerativeAIEmbeddings(
            model=settings.embedding_model_name,
            google_api_key=settings.google_api_key,
        )
        logger.info(f"Loaded embedding model: {settings.embedding_model_name}")
        return embedding_client
    except Exception as e:
        logger.error(f"Failed to load embedding model: {e}")
        raise
32
+
33
+
34
def get_gemini_model():
    """
    Build the Google Gemini chat model configured in ``settings``.

    Uses ``settings.gemini_model_name`` and ``settings.google_api_key``.
    Any failure is logged and re-raised unchanged.

    Returns:
        ChatGoogleGenerativeAI: Gemini model instance
    """
    try:
        chat_model = ChatGoogleGenerativeAI(
            model=settings.gemini_model_name,
            google_api_key=settings.google_api_key,
        )
        logger.info(f"Loaded Gemini model: {settings.gemini_model_name}")
        return chat_model
    except Exception as e:
        logger.error(f"Failed to load Gemini model: {e}")
        raise
51
+
52
+
53
def get_local_model():
    """
    Get the local chat model (llama.cpp via ``ChatLlamaCpp``).

    The model file is ``settings.model_path / settings.local_model_name``.
    ``settings.model_path`` is wrapped in ``Path`` so a plain string setting
    works as well as a ``Path`` one.

    Returns:
        ChatLlamaCpp: Local model instance

    Raises:
        FileNotFoundError: If the model file does not exist.
        Exception: Any error raised while constructing the model; it is
            logged and re-raised unchanged.
    """
    try:
        model_file = Path(settings.model_path) / settings.local_model_name

        if not model_file.exists():
            raise FileNotFoundError(
                f"Model file not found: {model_file}\n"
                f"Please download it to {settings.model_path}/"
            )

        model = ChatLlamaCpp(
            model_path=str(model_file),
            n_ctx=8096,  # context window size, in tokens
            n_batch=512,  # batch size for prompt processing
            n_threads=4,  # number of CPU threads
            max_tokens=settings.local_max_tokens,  # cap on generated tokens
            temperature=0.1,  # low temperature for focused output
            top_p=0.9,
            top_k=30,
            repeat_penalty=1.05,
            f16_kv=True,  # half-precision KV cache
            f16=True,
            verbose=True,
            chat_format="chatml",
            numa=False,
            use_mlock=False,  # do not lock the model in memory
            use_mmap=True,  # memory-map the model file
        )
        logger.info(f"Loaded local model: {settings.local_model_name}")
        return model
    except Exception as e:
        logger.error(f"Failed to load local model: {e}")
        raise
123
+
124
+
125
def get_llm_model(provider: Optional[str] = None):
    """
    Get LLM model based on configuration with fallback support.

    Args:
        provider: Override the default provider ("gemini" or "local")
                 If None, uses settings.llm_provider

    Returns:
        LLM model instance (Gemini or Local)

    Raises:
        ValueError: If the provider is neither "gemini" nor "local".
        Exception: The primary model's error when it fails and
            ``settings.enable_fallback`` is off, or the fallback model's
            error when the fallback fails too.
    """
    provider = provider or settings.llm_provider

    if provider == "gemini":
        logger.debug("Requested LLM provider: gemini")
        try:
            return get_gemini_model()
        except Exception as e:
            logger.warning(f"Gemini model failed: {e}")
            if settings.enable_fallback:
                logger.info("Falling back to local model...")
                return get_local_model()
            raise

    elif provider == "local":
        logger.debug("Requested LLM provider: local")
        try:
            return get_local_model()
        except Exception as e:
            logger.warning(f"Local model failed: {e}")
            if settings.enable_fallback:
                logger.info("Falling back to Gemini model...")
                return get_gemini_model()
            raise

    else:
        raise ValueError(f"Unknown provider: {provider}. Use 'gemini' or 'local'")
app/utils/preprocessing.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List
2
+ from .constants import stopwords, short_words_mappings
3
+ from langchain_core.documents import Document
4
+ from app.utils.model_factory import get_local_model
5
+ from nltk.stem import PorterStemmer
6
+ import spacy
7
+ from pathlib import Path
8
+ import re
9
+
10
+ nlp = spacy.load('en_core_web_sm')
11
+
12
def lowercase(text: str):
    """Return ``text`` with leading and trailing whitespace removed.

    NOTE(review): despite its name this function does not change case; it
    only strips. Confirm whether lower-casing was intended before relying on
    the name.
    """
    trimmed = text.strip()
    return trimmed
14
+
15
def tokenization(text: str):
    """Split ``text`` on single spaces after passing it through ``lowercase``.

    Returns an empty list for ``None`` or empty input. Consecutive spaces
    produce empty-string tokens, which callers are expected to filter.
    """
    if not text:
        return []
    return lowercase(text).split(" ")
20
+
21
def stop_words_removal(text: str, short_words_mapping: bool = False):
    """Return the lower-cased lemmas of the content words in ``text``.

    Only tokens tagged NOUN, PROPN, VERB, NUM or ADJ are kept, and lemmas
    that appear in the stop-word list are dropped.

    Args:
        text: Input text; ``None`` or empty yields ``[]``.
        short_words_mapping: When true, tokens that are known abbreviations
            (keys of ``short_words_mappings``) are replaced by the lemmas of
            their full expansion.

    Returns:
        list[str]: Lemmas in document order.
    """
    if not text:
        return []

    # ``stopwords`` is one newline-separated string; split it so membership
    # is a whole-word test rather than a substring match (which would drop
    # words such as "cause", "gain" or "low").
    if isinstance(stopwords, str):
        stopword_set = set(stopwords.split())
    else:
        stopword_set = set(stopwords)

    content_pos = {"NOUN", "PROPN", "VERB", "NUM", "ADJ"}

    def _is_content(tok):
        return not tok.is_space and tok.pos_ in content_pos

    def _expansion_for(raw):
        # Keys are case-sensitive ("IT", "CSE", ...); try the token as
        # written first, then lower-cased for keys such as "dept".
        if raw in short_words_mappings:
            return short_words_mappings[raw]
        return short_words_mappings.get(raw.lower())

    results = []
    for token in nlp(text):
        if not _is_content(token):
            continue

        expansion = _expansion_for(token.text) if short_words_mapping else None
        if expansion:
            # Keep every content word of the expansion, not just the first.
            candidates = [tok for tok in nlp(expansion) if _is_content(tok)]
        else:
            candidates = [token]

        for candidate in candidates:
            lemma = candidate.lemma_.strip().lower()
            if lemma and lemma not in stopword_set:
                results.append(lemma)

    return results
48
+
49
def space_removal(words: List[str]):
    """Strip every word and drop the ones that end up empty.

    Returns a new list; the input list is not modified.
    """
    stripped_words = (word.strip() for word in words)
    return [word for word in stripped_words if word != ""]
57
+
58
def preprocess(text: str, short_words_mapping: bool = False) -> str:
    """Reduce ``text`` to a space-joined string of its content-word lemmas.

    Delegates to ``stop_words_removal`` and joins the result with single
    spaces.

    Raises:
        ValueError: If ``text`` is ``None`` or empty.
    """
    if not text:
        raise ValueError("Text cannot be empty")
    return " ".join(stop_words_removal(text, short_words_mapping))
63
+
64
def normalize(text: str) -> str:
    """Collapse the whitespace in ``text`` to single spaces.

    The text is tokenised with ``tokenization``, empty tokens are removed
    with ``space_removal`` and the remainder is joined with single spaces.

    Raises:
        ValueError: If ``text`` is ``None`` or empty.
    """
    if not text:
        raise ValueError("Text cannot be empty")
    tokens = space_removal(tokenization(text))
    return " ".join(tokens)
70
+
71
def preprocess_document(doc: Document):
    """Replace ``doc.page_content`` with its preprocessed form, in place.

    Documents whose content is the empty string are left untouched. Nothing
    is returned.
    """
    if doc.page_content != "":
        doc.page_content = preprocess(doc.page_content)
75
+
76
def preprocess_documents(docs: List[Document]):
    """Run ``preprocess_document`` on every document, mutating each in place.

    Nothing is returned.
    """
    for document in docs:
        preprocess_document(document)
79
+
80
def preprocess_query(query: str) -> str:
    """Validate a search query and return it whitespace-normalised.

    Raises:
        ValueError: If ``query`` is ``None``, empty or whitespace-only.
    """
    if query is None or not query.strip():
        raise ValueError("Query cannot be empty")

    # LLM-based query rewriting is currently disabled; kept for reference.
    # model = get_local_model()

    # prompt = f"""Rewrite this query for better semantic search/embeddings:
    # Make it more descriptive, clear, natural. Keep core intent.
    # Query: "{query}"
    # Improved:
    # """
    # response = model.invoke(prompt)
    # cleaned = re.sub(r'^\s*Improved:\s*', '', response.content.strip(), flags=re.IGNORECASE).strip()
    normalized_query = normalize(query)
    return normalized_query
94
+
95
def preprocess_filename(filePath: Path) -> str:
    """Return a sanitised, lower-cased file name for ``filePath``.

    Only the final path component is used. Every character of the stem that
    is not a letter, digit, ``_`` or ``-`` is removed, the result is
    lower-cased, and the lower-cased extension is appended. A stem that
    becomes empty (e.g. ``"!!!.pdf"``) is replaced by ``"file"``.
    """
    extension = filePath.suffix.lower()
    # Remove special characters but keep letters, numbers, _ and -
    cleaned_stem = re.sub(r'[^a-zA-Z0-9_-]', '', filePath.stem).lower()
    return (cleaned_stem or "file") + extension
app/utils/tests.py ADDED
The diff for this file is too large to render. See raw diff