Spaces:

harshvisualz
/

vgecbot

Running

App Files Files Community

harsh-dev committed on Apr 9

Commit

4225666

1 Parent(s): c44ea2b

docker deployment

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.dockerignore +60 -0
.env.example +3 -0
.gitattributes +2 -34
.gitignore +57 -0
CODEBASE_DOCUMENTATION.md +673 -0
College_Overview2.md +29 -0
DOCUMENTATION_PLAN.md +277 -0
Dockerfile +66 -0
LOCAL_MODEL_TRUNCATION_FIX.md +136 -0
MARKDOWN_FIX_SUMMARY.md +118 -0
README.md +65 -10
WHY_LOCAL_NOT_WORKING.md +112 -0
app/__init__.py +0 -0
app/api/__init__.py +5 -0
app/api/dependencies.py +41 -0
app/api/routes/__init__.py +5 -0
app/api/routes/rag.py +186 -0
app/api/routes/settings.py +186 -0
app/api/routes/vector_store.py +311 -0
app/api/schemas/__init__.py +1 -0
app/api/schemas/requests.py +71 -0
app/api/schemas/settings.py +54 -0
app/api/schemas/tests.py +30 -0
app/core/__init__.py +0 -0
app/core/config.py +55 -0
app/core/paths.py +10 -0
app/main.py +21 -0
app/models/__init__.py +0 -0
app/prompts/__init__.py +1 -0
app/prompts/system_prompts.py +112 -0
app/services/__init__.py +2 -0
app/services/classifier_service.py +337 -0
app/services/document_loader.py +34 -0
app/services/file_service.py +198 -0
app/services/filter-demo +197 -0
app/services/filter_classifier copy.py +334 -0
app/services/filter_classifier.py +529 -0
app/services/hybrid_retrieval.py +354 -0
app/services/ingestion_service.py +95 -0
app/services/rag_service.py +483 -0
app/services/text_splitter.py +266 -0
app/services/vector_store.py +67 -0
app/utils/__init__.py +5 -0
app/utils/constants.py +264 -0
app/utils/document_helpers.py +231 -0
app/utils/embeddings.py +11 -0
app/utils/llm_models.py +20 -0
app/utils/model_factory.py +164 -0
app/utils/preprocessing.py +107 -0
app/utils/tests.py +0 -0

.dockerignore ADDED Viewed

	@@ -0,0 +1,60 @@

+# ─── Python ───────────────────────────────────────────────────────────────────
+__pycache__/
+*.py[cod]
+*$py.class
+*.egg-info/
+.eggs/
+# ─── Virtual environments ─────────────────────────────────────────────────────
+.venv/
+venv/
+env/
+# ─── Environment / secrets ────────────────────────────────────────────────────
+.env
+.env.*
+# ─── Git ──────────────────────────────────────────────────────────────────────
+.git/
+.gitignore
+# ─── Large LLM model files (not needed — Gemini-only mode) ───────────────────
+ml_models/llm/
+ml_models/embeddings/bge-small/
+# ─── Dev/test files not needed in production ──────────────────────────────────
+tests/
+docs/
+results/
+temp/
+old/
+scripts/
+dump/
+# ─── Root-level scratch/demo scripts ─────────────────────────────────────────
+bm25.py
+cfs.py
+classifier-demo.py
+fileService.py
+hybrid_search.py
+rewrite_query.py
+testSearch.py
+test_json_spliting.py
+test_markdown_splitter.py
+# ─── Large PDF files ──────────────────────────────────────────────────────────
+*.pdf
+# ─── Documentation ────────────────────────────────────────────────────────────
+*.md
+!README.md
+# ─── IDE / OS ─────────────────────────────────────────────────────────────────
+.vscode/
+.idea/
+*.swp
+.DS_Store
+Thumbs.db
+# ─── Second requirements file (unused) ───────────────────────────────────────
+req.txt

.env.example ADDED Viewed

	@@ -0,0 +1,3 @@

+GOOGLE_API_KEY=
+LLM_PROVIDER=gemini  # or "local"
+ENABLE_FALLBACK=true

.gitattributes CHANGED Viewed

@@ -1,38 +1,6 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
 *.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text
 *.gguf filter=lfs diff=lfs merge=lfs -text
 *.sqlite3 filter=lfs diff=lfs merge=lfs -text
 *.pdf filter=lfs diff=lfs merge=lfs -text

 *.safetensors filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
 *.gguf filter=lfs diff=lfs merge=lfs -text
 *.sqlite3 filter=lfs diff=lfs merge=lfs -text
 *.pdf filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,57 @@

+/models
+.venv
+cache
+__pycache__
+.env
+Qwen2.5-0.5B-Instruct-Q4_K_M.gguf
+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+.venv/
+venv/
+*.egg-info/
+# Environment
+.env
+# Data (runtime files)
+data/
+ml_models/
+# IDE
+.vscode/
+.idea/
+*.swp
+# Temporary files
+temp/
+*.tmp
+# OS
+.DS_Store
+Thumbs.db
+# ML model files (large binary files)
+ml_models/**/*.gguf
+ml_models/**/*.bin
+ml_models/**/*.safetensors
+# Keep directory structure
+!ml_models/.gitkeep
+!ml_models/llm/.gitkeep
+# Exceptions — re-include the bundled vector store, classifier, and embedding model:
+!data/
+!data/vector_stores/
+!data/vector_stores/classifier_test_1/
+!data/vector_stores/classifier_test_1/**
+!ml_models/
+!ml_models/classifier/
+!ml_models/classifier/chatbot_classifier.pkl
+!ml_models/embeddings/mdbr-leaf-mt/
+!ml_models/embeddings/mdbr-leaf-mt/**
+!data/classifier_test_1.json

CODEBASE_DOCUMENTATION.md ADDED Viewed

	@@ -0,0 +1,673 @@

+# VGEC RAG Chatbot — Codebase Documentation
+> **Generated:** 2026-03-25
+> **Version:** 1.0.0
+> **Scope:** Full system — ingestion, retrieval, classification, API, evaluation
+---
+## Table of Contents
+1. [Project Overview](#1-project-overview)
+2. [System Architecture](#2-system-architecture)
+3. [Schema & Data Model](#3-schema--data-model)
+4. [Retrieval Pipeline](#4-retrieval-pipeline)
+5. [Key Classes & Modules](#5-key-classes--modules)
+6. [Evaluation & Metrics](#6-evaluation--metrics)
+7. [Known Limitations](#7-known-limitations)
+8. [File Structure](#8-file-structure)
+---
+## 1. Project Overview
+### Purpose
+**VGEC RAG Chatbot** is a Retrieval-Augmented Generation (RAG) chatbot for **Vishwakarma Government Engineering College (VGEC), Chandkheda, Gujarat**. It allows students, faculty, and visitors to query structured information about the institution — departments, faculty, syllabus, labs, intake capacity, and more — through natural language.
+### Domain
+- **Institution:** VGEC (Government Engineering College, Gujarat)
+- **Data Coverage:** Department-level information for multiple disciplines (Computer Engineering, Civil, Electrical, IT, ECE, etc.)
+- **Topics:** Faculty lists, lab facilities, syllabus details, HOD info, research activities, intake capacity, achievements
+### Tech Stack
+| Layer | Technology |
+|---|---|
+| **API Framework** | FastAPI |
+| **Vector Database** | ChromaDB (persistent, local) |
+| **Embeddings** | Google `gemini-embedding-001` (via `langchain-google-genai`) |
+| **LLM (Cloud)** | Google Gemini `gemini-2.5-flash-lite` |
+| **LLM (Local)** | `EXAONE-3.5-2.4B-Instruct-Q4_K_M.gguf` via `llama-cpp-python` |
+| **NLP / Preprocessing** | spaCy (`en_core_web_sm`), NLTK (PorterStemmer) |
+| **Classifier** | Scikit-learn `LogisticRegression` + `SentenceTransformer` (`MongoDB/mdbr-leaf-mt`) |
+| **BM25** | `langchain-community` `BM25Retriever` |
+| **Chunking** | LangChain `RecursiveCharacterTextSplitter` |
+| **Config** | Pydantic `BaseSettings` (`.env`-backed) |
+### Key Features Implemented
+- ✅ Structured JSON ingestion with intent-aware chunking
+- ✅ Hybrid retrieval: BM25 + vector search fused via Reciprocal Rank Fusion (RRF)
+- ✅ Intent/metadata classification with confidence-gated ChromaDB filters
+- ✅ Abbreviation expansion (`CE` → `Computer Engineering`, etc.)
+- ✅ Multi-turn conversation history support
+- ✅ Dual LLM backend with automatic fallback (Gemini ↔ Local)
+- ✅ Full CRUD REST API for vector store management
+- ✅ Offline evaluation endpoint (MRR, hit rate, noise rate)
+- ✅ Classifier accuracy evaluation endpoint
+---
+## 2. System Architecture
+### Component Diagram
+```
+                         ┌──────────────────────────┐
+                         │        FastAPI App         │
+                         │  /api/v1/rag   /vector    │
+                         └──────────┬───────────────┘
+                                    │ DI (lru_cache)
+                         ┌──────────▼───────────────┐
+                         │        RAGService          │
+                         │  (core orchestrator)       │
+                         └──┬───────────┬────────────┘
+                            │           │
+              ┌─────────────▼──┐   ┌───▼──────────────────┐
+              │ IngestionService│   │  HybridRetrievalService│
+              │  (write path)  │   │   (read path)          │
+              └──────┬──────── ┘   └───┬──────────┬─────── ┘
+                     │                 │           │
+          ┌──────────▼──┐   ┌──────────▼──┐  ┌────▼──────────┐
+          │  FileService │   │ ClassifierSvc│  │  VectorStore  │
+          │ (file +meta) │   │(clf predict) │  │  (ChromaDB)   │
+          └──────────────┘   └─────────────┘  └───────────────┘
+```
+### Data Flow
+#### Ingestion Path
+```
+File Upload (PDF/MD/TXT/JSON)
+   │
+   ▼
+FileService.read_file()          ← type-aware loading (PyMuPDF for PDF)
+   │  returns: Document + metadata
+   ▼
+FileService.write_file()         ← persist copy to data/documents/
+   │
+   ▼
+IngestionService.handle_*_docs() ← route by file extension
+   │
+   ├─ JSON → handle_json_docs()  ← intent-aware chunks (list / detail / count)
+   └─ text → handle_text_docs()  ← RecursiveCharacterTextSplitter + normalize()
+   │
+   ▼
+VectorStore.add_documents()      ← embed + upsert into ChromaDB
+   │
+   ▼
+FileService.patch_metadata()     ← update ingestion record JSON (chunk count, timing, size)
+```
+#### Query Path
+```
+User Question
+   │
+   ▼
+preprocess_query()               ← normalize() only: tokenize + strip blanks (no stopword removal)
+   │
+   ▼
+HybridRetrievalService.retrieve()
+   │
+   ├─ clf.expand_abbreviations() ← CE → Computer Engineering
+   ├─ clf.predict_with_filter()  ← LogReg predict → Chroma $and/$or filter
+   ├─ _vector_rank()             ← ChromaDB similarity_search_with_score (k=15)
+   ├─ _bm25_rank()               ← BM25 over the vector candidate pool
+   ├─ _reciprocal_rank_fusion()  ← weighted RRF merge
+   ├─ metadata score boosting    ← multiply fused scores for confident matches
+   └─ _apply_title_boost()       ← per-query-word title match bonus
+   │
+   ▼
+get_references_v2()              ← filter by threshold, build context string
+   │
+   ▼
+LLM.invoke(prompt)               ← Gemini or local LlamaCpp
+   │
+   ▼
+Return: { answer, references, context, threshold_used, k_used }
+```
+### External Dependencies
+| Dependency | Role | Provider |
+|---|---|---|
+| ChromaDB | Persistent vector store | Local disk |
+| Google Gemini API | Embeddings + LLM generation | Google Cloud |
+| LlamaCpp (GGUF model) | Local LLM fallback | Local CPU |
+| Sentence Transformers | Classifier feature extraction | HuggingFace Hub |
+| spaCy `en_core_web_sm` | POS tagging / lemmatization | Local |
+---
+## 3. Schema & Data Model
+### Source JSON Format
+Source data files (e.g. `computer_eng.json`) follow this schema:
+```json
+{
+  "id": "computer-engineering-department",
+  "name": "Computer Engineering Department",
+  "source": "https://www.vgecg.ac.in/department.php?dept=3",
+  "category": "computer_eng",
+  "type": "department",
+  "created_date": "2026-02-19",
+  "content": {
+    "<topic_key>": {
+      "list": ["item 1", "item 2", "..."],
+      "details": "Paragraph describing the topic."
+    }
+  }
+}
+```
+**Top-level fields:**
+| Field | Type | Description |
+|---|---|---|
+| `id` | string | Unique document identifier |
+| `name` | string | Human-readable institution/department name |
+| `source` | string | Authoritative URL |
+| `category` | string | Department slug (e.g. `computer_eng`) |
+| `type` | string | Document type (e.g. `department`) |
+| `created_date` | string (ISO) | Data creation date |
+| `content` | object | Topic map; each key = a topic |
+### Chunk Metadata Schema (stored in ChromaDB)
+Every vector chunk stored in Chroma carries the following metadata:
+| Field | Type | Source |
+|---|---|---|
+| `id` | string (UUID) | Auto-generated |
+| `title` | string | Document name / topic key |
+| `source` | string | Source URL |
+| `source_file` | string | Filename (e.g. `computer_eng.json`) |
+| `type` | string | Taxonomy level 1 (e.g. `department`) |
+| `category` | string | Taxonomy level 2 (e.g. `computer_eng`) |
+| `topic` | string | Taxonomy level 3 (e.g. `faculty`) |
+| `intent` | string | Chunk intent: `list`, `detail`, or `count` |
+| `chunk_index` | int | Sequential index within file |
+| `created_date` | string (ISO) | Ingestion timestamp |
+| `updated_at` | string (ISO) | Last modification timestamp |
+| `ext` | string | Source file extension (`json`, `pdf`, `md`, `txt`) |
+### Hierarchical Taxonomy
+The classifier's predictions and the ChromaDB filters operate on a three-level hierarchy (`type` → `category` → `topic`), with an `intent` attached to each topic:
+```
+type
+ └── category
+      └── topic
+           └── intent  (list | detail | count)
+```
+**Example mapping (Computer Engineering):**
+```
+type: "department"
+  └── category: "computer_eng"
+         ├── topic: "faculty"    → intent: list | detail
+         ├── topic: "lab"        → intent: list | detail
+         ├── topic: "syllabus"   → intent: list | detail
+         ├── topic: "hod"        → intent: list | detail
+         ├── topic: "intake"     → intent: list | detail
+         ├── topic: "research"   → intent: list | detail
+         └── topic: "achievements"
+```
+### Document Chunking Strategy
+**JSON documents** use a hand-crafted, intent-aware strategy in `IngestionService.handle_json_docs()`:
+| Intent | Chunk Content | Metadata |
+|---|---|---|
+| `list` | Numbered list: `1. item\n2. item\n...` | `intent=list` |
+| `count` | `"Total <topic>: N"` (auto-generated) | `intent=count` |
+| `detail` | Raw paragraph text | `intent=detail` |
+**Text/PDF/Markdown documents** use `RecursiveCharacterTextSplitter`:
+- Default: `chunk_size=500`, `chunk_overlap=100`
+- Separator priority: `\n\n` → `\n` → ` ` → (character)
+- Markdown variant respects `---` section delimiters
+- Content is passed through `normalize()` (tokenize + strip blanks) before storage
+---
+## 4. Retrieval Pipeline
+### Query Processing Flow
+```python
+# Step 1: Normalize input
+question = preprocess_query(question)
+# → currently normalize() only (tokenize + strip blanks); no POS filter, lemmatization, or stopword removal
+# Step 2: Expand abbreviations
+processed_query = clf.expand_abbreviations(query)
+# → "CE dept" → "computer engineering department"
+# Step 3: Classify intent/metadata
+filters = clf.predict_with_filter([processed_query])
+# → {"$and": [{"type": "department"}, {"intent": "list"}, {"$or": [...]}]}
+# Step 4: Vector search with optional filter
+raw_results = chroma.similarity_search_with_score(query, k=15, filter=filters)
+# Fallback: if filtered results empty, retry without filter
+# Step 5: BM25 re-rank over vector candidates
+bm25_results = BM25Retriever.from_documents(candidate_docs)
+# Step 6: RRF fusion
+fused_score(d) = bm25_weight * 1/(rrf_k + rank_bm25)
+              + vector_weight * 1/(rrf_k + rank_vec)
+# Step 7: Metadata confidence boosting
+if doc.metadata[field] == predicted_val and conf > 0.90:
+    result.fused_score *= boost_factor  # 1.10–1.20
+# Step 8: Title word boost
+for word in query_words:
+    if word in doc.title:
+        result.fused_score += title_boost_per_word  # 0.004
+# Step 9: Threshold filter + sort + top-k
+results = [r for r in results if r.fused_score >= threshold]
+```
+### Classifier Thresholds
+The `Classifier` uses two separate threshold tables:
+**Prediction threshold** — below this, the field is set to `None` (not used at all):
+| Field | Threshold |
+|---|---|
+| `type` | 0.40 |
+| `category` | 0.40 |
+| `topic` | 0.50 |
+| `intent` | 0.60 |
+**Filter threshold** — at or above this, the field is used in the ChromaDB filter (`type` as a hard `$and` anchor; `category` and `topic` as soft `$or` hints):
+| Field | Threshold |
+|---|---|
+| `type` | 0.65 |
+| `category` | 0.65 |
+| `topic` | 0.70 |
+### Filter Construction Logic (`_build_filter`)
+```python
+# Gate: if type confidence < 0.65 → return None (full scan)
+# Hard anchors (always included if type passes):
+#   - type == predicted_type
+#   - intent == predicted_intent  (special: "count" expands to count OR detail)
+# Soft hints (combined as $or):
+#   - category == predicted_category  (if conf >= 0.65, else "general")
+#   - topic == predicted_topic        (if conf >= 0.70, else "general")
+```
+### Hybrid Retrieval Config (Defaults)
+| Parameter | `hybrid_query` | `search_docs` |
+|---|---|---|
+| `candidate_k` | 15 | 15 |
+| `top_k` (final) | `settings.similarity_top_k` (8) | k (param) |
+| `bm25_weight` | 0.45 | 0.70 |
+| `vector_weight` | 0.55 | 0.30 |
+| `rrf_k` | 20 | 20 |
+| `bm25_k1` | 1.2 | 1.5 |
+| `bm25_b` | 0.9 | 0.75 |
+| `title_boost_per_word` | 0.004 | 0.004 |
+| `score_threshold` | 0.4 | 0.4 |
+> **Note:** `search_docs` is BM25-heavy (0.70) since it is used for keyword-oriented document browsing, while `hybrid_query` is vector-heavy for semantic QA.
+---
+## 5. Key Classes & Modules
+### Services (`app/services/`)
+#### `RAGService`
+Main orchestrator. Singleton via `lru_cache` in `dependencies.py`.
+| Method | Description |
+|---|---|
+| `query()` | Semantic-only QA (vector search → LLM) |
+| `hybrid_query()` | Hybrid QA (BM25 + vector → RRF → LLM) |
+| `search_docs()` | BM25-heavy document search, no LLM |
+| `ingest_documents()` | Ingest a file path into the vector store |
+| `get_filenames()` | Return all tracked file metadata records |
+| `test_queries()` | Batch retrieval evaluation (MRR, precision, noise) |
+| `test_classifier()` | Batch classifier accuracy evaluation |
+| `delete_database()` | Drop the entire ChromaDB collection |
+#### `HybridRetrievalService`
+Stateless per-request service created inline by `RAGService`.
+| Method | Description |
+|---|---|
+| `retrieve(query)` | Full hybrid retrieval pipeline; returns `List[RetrievalResult]` |
+| `_vector_rank()` | Chroma similarity search + classifier filter |
+| `_bm25_rank()` | BM25 over candidate pool |
+| `_reciprocal_rank_fusion()` | Merge both ranked lists via RRF |
+| `_apply_title_boost()` | Word-level title match score bonus |
+**`RetrievalResult` dataclass:**
+```python
+@dataclass
+class RetrievalResult:
+    document: Document
+    fused_score: float
+    bm25_rank: Optional[int]
+    vector_rank: Optional[int]
+    title_boost: float
+```
+#### `Classifier`
+Loaded at startup from a pickled pipeline (`chatbot_classifier.pkl`).
+| Method | Description |
+|---|---|
+| `predict(queries)` | Returns list of `{type, category, topic, intent, *_conf}` dicts |
+| `predict_with_filter(queries)` | Returns a ChromaDB-compatible filter dict or `None` |
+| `expand_abbreviations(text)` | Regex-based abbreviation expansion |
+| `get_features(queries)` | Build `[SentenceTransformer embedding \| TF-IDF]` feature matrix |
+| `train_models(df)` | Train 4 LogisticRegression classifiers (offline use) |
+#### `IngestionService`
+| Method | Description |
+|---|---|
+| `ingest(file_path)` | Load + chunk a file; returns `List[Document]` |
+| `handle_json_docs()` | Intent-aware chunking for structured JSON data |
+| `handle_text_docs()` | Recursive character splitting for unstructured text |
+| `get_records()` | Delegate to `FileService.get_records()` |
+| `delete_record(filename)` | Remove a file's metadata record |
+| `path_record(path, metadata)` | Patch ingestion stats after indexing |
+#### `FileService`
+| Method | Description |
+|---|---|
+| `read_file(path)` | Load file content; dispatches by extension |
+| `write_file(path, content, metadata)` | Persist file to `data/documents/` |
+| `patch_metadata(path, metadata)` | Merge new fields into existing record |
+| `get_records()` | Return all ingestion records dict |
+| `delete_record(filename)` | Remove a record from `<collection>.json` |
+#### `VectorStore`
+Thin wrapper around `langchain_chroma.Chroma`.
+| Method | Description |
+|---|---|
+| `get()` | Retrieve all documents |
+| `get_by_id(ids)` | Retrieve specific documents by ID |
+| `add_documents(docs)` | Embed + insert, skipping empty chunks |
+| `update_document(id, doc)` | Delete then re-insert with same ID |
+| `delete(ids)` | Remove documents by ID list |
+| `similarity_search_with_score()` | Wrapped Chroma search |
+### Utilities (`app/utils/`)
+#### `preprocessing.py`
+| Function | Description |
+|---|---|
+| `preprocess(text)` | spaCy POS filter + lemmatize + stopword removal → joined string |
+| `normalize(text)` | Tokenize + strip blanks (lightweight, no POS) |
+| `preprocess_query(query)` | Applies `normalize()` to user queries |
+| `preprocess_documents(docs)` | Applies `preprocess()` to a document list in-place |
+| `preprocess_filename(path)` | Sanitize filename (remove special chars, lowercase) |
+#### `document_helpers.py`
+| Function | Description |
+|---|---|
+| `get_references_v2(docs, threshold)` | Convert `RetrievalResult` list → references dict + context string |
+| `get_references(docs, threshold)` | Same for raw `(Document, distance)` tuples (used by `query()`) |
+| `build_metadata(path)` | Parse YAML frontmatter from `.md`/`.txt` files |
+| `create_documents(chunks, ...)` | Attach standard metadata (UUID, timestamps, indices) to chunks |
+| `create_documents_from_text(text)` | Full pipeline: frontmatter parse → split → metadata attach |
+| `clean_metadata(metadata)` | Serialize datetime, coerce non-allowed types to string |
+#### `model_factory.py`
+| Function | Description |
+|---|---|
+| `get_embedding_model()` | Returns `GoogleGenerativeAIEmbeddings` |
+| `get_gemini_model()` | Returns `ChatGoogleGenerativeAI` |
+| `get_local_model()` | Returns `ChatLlamaCpp` (GGUF, CPU inference) |
+| `get_llm_model(provider)` | Dispatches to Gemini or Local with fallback logic |
+### API Routes (`app/api/routes/`)
+#### `rag.py` — prefix `/api/v1/rag`
+| Method | Endpoint | Description |
+|---|---|---|
+| GET | `/` | Health check |
+| POST | `/` | Semantic query |
+| POST | `/hybrid_query` | Hybrid RAG query (primary endpoint) |
+| POST | `/similarity_search` | Hybrid retrieval, no LLM response |
+| POST | `/search` | BM25-heavy document search |
+| POST | `/test` | Batch retrieval evaluation |
+| POST | `/test_classifier` | Classifier accuracy evaluation |
+| GET | `/test_classifier_dataset` | Run built-in test dataset, cache result |
+#### `vector_store.py` — prefix `/api/v1/vector`
+| Method | Endpoint | Description |
+|---|---|---|
+| GET | `/` | List all documents (paginated, filterable) |
+| GET | `/filenames` | List ingested file records |
+| GET | `/{id}` | Get single document by ChromaDB ID |
+| POST | `/` | Upload + ingest file |
+| PUT | `/{id}` | Update document content/metadata |
+| DELETE | `/ids` | Bulk delete by ID list |
+| DELETE | `/{id}` | Delete single document |
+| DELETE | `/` | Filter-based delete (filename/source/contains) |
+### Configuration (`app/core/config.py`)
+All settings are read from `.env` via Pydantic `BaseSettings`:
+```python
+class Settings(BaseSettings):
+    # Paths
+    collection_name: str = "classifier_test_1"
+    persist_directory: str = "./data/vector_stores/classifier_test_1"
+    # Chunking
+    chunk_size: int = 500
+    chunk_overlap: int = 100
+    # Retrieval
+    similarity_top_k: int = 8
+    similarity_threshold: float = 0.4
+    # LLM Provider
+    llm_provider: Literal["gemini", "local"] = "local"
+    enable_fallback: bool = True
+    # Models
+    embedding_model_name: str = "models/gemini-embedding-001"
+    gemini_model_name: str = "gemini-2.5-flash-lite"
+    local_model_name: str = "EXAONE-3.5-2.4B-Instruct-Q4_K_M.gguf"
+    # Generation
+    max_output_tokens: int = 2048
+    local_max_tokens: int = 512
+    # Auth
+    google_api_key: str  # required — must be in .env
+```
+---
+## 6. Evaluation & Metrics
+### Retrieval Evaluation (`test_queries` / `POST /api/v1/rag/test`)
+Tests each (question, expected_document, expected_chunk_index) triple against `hybrid_query`:
+| Metric | Formula | Interpretation |
+|---|---|---|
+| **Hit Rate** | `hits / total` | % of questions where the exact chunk was retrieved |
+| **Top-1 Hit Rate** | `rank==1 hits / total` | % of questions where exact chunk was top result |
+| **MRR** | `mean(1/rank)` | Mean Reciprocal Rank; higher = correct result ranked earlier |
+| **Doc Precision** | `correct_source_chunks / all_chunks` | How many retrieved chunks came from the right document |
+| **Doc Recall** | `1 if any correct_source_chunk else 0` | Did we retrieve at least one chunk from the right document? |
+| **Doc Noise** | `wrong_source_chunks / all_chunks` | Proportion of off-topic chunks in the result set |
+| **Error Rate** | `1 - hit_rate` | Miss rate for exact chunk retrieval |
+**Test Input Schema:**
+```python
+class TestRequestSchema(BaseModel):
+    tests: List[Test]   # question + document + chunk_index
+    k: int = 5
+    threshold: float = 0.4
+```
+### Classifier Evaluation (`test_classifier` / `POST /api/v1/rag/test_classifier`)
+Evaluates predictions for all 4 classification fields (`type`, `category`, `topic`, `intent`):
+| Metric | Notes |
+|---|---|
+| **Accuracy** | `sklearn.accuracy_score` |
+| **Precision (macro)** | `zero_division=0` |
+| **Recall (macro)** | `zero_division=0` |
+| **F1 Macro** | Unweighted average across classes |
+| **F1 Weighted** | Class-frequency weighted |
+| **Classification Report** | Full per-class breakdown (`output_dict=True`) |
+A bundled test dataset is stored in `app/utils/tests.py` as `classifier_test_dataset` and can be executed via `GET /api/v1/rag/test_classifier_dataset`. Results are **memoized** on the `RAGService.evaluation` dict for the lifetime of the server process.
+---
+## 7. Known Limitations
+### Technical Debt
+- **`preprocess_query` is incomplete.** The function body contains an LLM-powered query-rewriting block that is commented out. Currently it just calls `normalize()` (tokenize only), so no stopword removal or lemmatization is applied to user queries (only to stored documents).
+- **`search_docs` does not honour `filename` as a metadata filter in Chroma.** The filter is applied in Python post-retrieval, which is inefficient for large collections.
+- **Count intent is synthetic.** The `"Total <topic>: N"` chunk is auto-generated during ingestion rather than taken from the source document. If the source data changes, stale count chunks can remain indexed.
+- **`VectorStore.get_dict()` has a `print(type(rows))`** debug statement left in production code.
+- **`FileService.__init__` docstring** contains a stray backtick character.
+### Planned but Unimplemented
+- **Query rewriting via local LLM** — skeleton is commented out in `preprocess_query()`.
+- **Semantic caching** — no query result memoization at the API layer.
+- **Re-ranker** — no cross-encoder re-ranking step; relies only on RRF + boosting.
+- **`topic` field is not included in the ChromaDB hard filter** — only `type` + `intent` are hard-anchored; `category` and `topic` are soft `$or` hints.
+### Performance Bottlenecks
+- **Local LLM (LlamaCpp)** is CPU-only with `n_ctx=8096` and `n_threads=4`. Response latency is high (~10–30s) on low-RAM systems.
+- **Classifier uses `SentenceTransformer` + `TF-IDF` features** — inference runs on every request with no caching of query embeddings.
+- **BM25 corpus is rebuilt from scratch per request** — `BM25Retriever.from_documents()` is called inside `_bm25_rank()` each time.
+- **`classifier_test_dataset` in `app/utils/tests.py`** is very large (the file is about 1.8MB) and is loaded at import time.
+- **The memoized evaluation** in `rag_service.evaluation` is not thread-safe if the server runs with multiple workers.
+---
+## 8. File Structure
+```
+VGEC-RAG-Chatbot/
+│
+├── app/                            # Application package
+│   ├── main.py                     # FastAPI app, router mounting, CORS middleware
+│   ├── core/
+│   │   ├── config.py               # Pydantic Settings (all tuneable params)
+│   │   └── paths.py                # Path constants helper
+│   │
+│   ├── api/
+│   │   ├── dependencies.py         # lru_cache singleton for RAGService
+│   │   ├── routes/
+│   │   │   ├── rag.py              # /rag endpoints (query, test, classifier)
+│   │   │   ├── vector_store.py     # /vector endpoints (CRUD for ChromaDB)
+│   │   │   └── settings.py         # /settings endpoints
+│   │   └── schemas/
+│   │       ├── requests.py         # RAGRequest, PaginationParams, etc.
+│   │       └── tests.py            # TestRequestSchema, TestClassifierReqSchema
+│   │
+│   ├── services/
+│   │   ├── rag_service.py          # RAGService (main orchestrator)
+│   │   ├── hybrid_retrieval.py     # HybridRetrievalService + RRF logic
+│   │   ├── classifier_service.py   # Classifier class + singleton clf
+│   │   ├── ingestion_service.py    # IngestionService (chunking pipeline)
+│   │   ├── file_service.py         # FileService (file I/O + metadata JSON)
+│   │   ├── vector_store.py         # VectorStore (thin ChromaDB wrapper)
+│   │   ├── text_splitter.py        # TextSplitter (RecursiveCharacter + variants)
+│   │   └── document_loader.py      # (legacy loader, not in primary path)
+│   │
+│   ├── utils/
+│   │   ├── preprocessing.py        # preprocess(), normalize(), preprocess_query()
+│   │   ├── document_helpers.py     # get_references_v2(), build_metadata(), create_documents()
+│   │   ├── model_factory.py        # get_llm_model(), get_embedding_model()
+│   │   ├── constants.py            # stopwords list, short_words_mappings
+│   │   ├── embeddings.py           # (thin embedding util)
+│   │   ├── llm_models.py           # (thin LLM util)
+│   │   └── tests.py                # classifier_test_dataset (large, 1.8MB)
+│   │
+│   └── prompts/
+│       └── __init__.py             # SYSTEM_PROMPT, wrap_exaone()
+│
+├── ml_models/
+│   ├── classifier/
+│   │   └── chatbot_classifier.pkl  # Pickled pipeline (models, tfidf, label encoders, etc.)
+│   ├── embeddings/                 # (Local embedding model weights, if any)
+│   └── llm/
+│       └── EXAONE-3.5-2.4B-*.gguf # Local LLM weights
+│
+├── data/
+│   ├── department_data/            # Source JSON files per department
+│   │   ├── computer_eng.json
+│   │   ├── civil.json
+│   │   └── ...
+│   ├── documents/                  # Persistent copies of ingested files
+│   ├── vector_stores/
+│   │   └── classifier_test_1/      # ChromaDB persist directory
+│   ├── classifier_test_1.json      # Ingestion metadata registry (FileService records)
+│   └── other_data/                 # Misc data files
+│
+├── temp/                           # Staging area for uploaded files (auto-cleared)
+├── scripts/                        # Offline scripts (training, testing)
+├── tests/                          # Test files
+│
+├── requirements.txt                # Pinned production dependencies
+├── .env                            # Runtime secrets (google_api_key, etc.)
+├── .env.example                    # Template for .env
+└── CODEBASE_DOCUMENTATION.md       # This file
+```
+---
+*End of documentation.*

College_Overview2.md ADDED Viewed

	@@ -0,0 +1,29 @@

+---
+title: Vishwakarma Government Engineering College
+source_url: https://www.vgecg.ac.in/index.php
+domain: https://www.vgecg.ac.in
+pathname: /index.php
+visited: 2026-02-15T12:55:40.751Z
+topic: College Overview
+---
+# College Statistics
+Description:
+This page provides some key statistics about Vishwakarma Government Engineering College.
+- **Publication:** 48046+
+- **Research Labs:** 13+
+- **Courses:** 12+
+- **Highest Package (Lacs):** 23+
+Source:
+- https://www.vgecg.ac.in/index.php
+Keywords:
+- college statistics
+- VGEC overview
+- publications
+- research labs
+- courses
+- placements

DOCUMENTATION_PLAN.md ADDED Viewed

	@@ -0,0 +1,277 @@

+# VGEC RAG Chatbot — Software Documentation Plan
+> Based on IEEE/Industry Standard | Updated: 2026-03-25
+> Reference: `CODEBASE_DOCUMENTATION.md` covers most of Phase 5 already — reuse it.
+---
+## DIAGRAMS FIRST — Priority Order
+> Do all diagrams before writing any prose. Diagrams take the most time and are referenced throughout.
+| # | Diagram | Phase Used In | Tool | Status |
+|---|---|---|---|---|
+| 1 | High-Level Architecture (Component Diagram) | Phase 5 | Draw.io / Mermaid | [ ] |
+| 2 | Data Flow — Query Path | Phase 5 | Draw.io (DFD Level 2) | [ ] |
+| 3 | Data Flow — Ingestion Path | Phase 5 | Draw.io (DFD Level 2) | [ ] |
+| 4 | Hierarchical Taxonomy Tree (Type→Category→Topic) | Phase 5 | Tree diagram / Mermaid | [ ] |
+| 5 | Filter Decision Flowchart (Strict→Partial→Fallback) | Phase 5 | Flowchart / Draw.io | [ ] |
+| 6 | Hybrid Retrieval Sequence (Vector→BM25→RRF→Boost) | Phase 5 | Sequence diagram / Flow | [ ] |
+| 7 | Use Case Diagram (Student, Faculty, Admin actors) | Phase 4 | Draw.io / PlantUML | [ ] |
+| 8 | System Context Diagram / Level 0 DFD | Phase 2 | Draw.io | [ ] |
+| 9 | Class Diagram (simplified — RAGService + helpers) | Phase 6 | Draw.io / UML | [ ] |
+| 10 | Activity Diagram — Chunking Process | Phase 6 | Activity flow / Draw.io | [ ] |
+| 11 | MRR Bar Chart — Your RAG vs Traditional | Phase 7 | matplotlib / Excel | [ ] |
+| 12 | Noise Rate Bar Chart — Comparison | Phase 7 | matplotlib / Excel | [ ] |
+| 13 | Classifier Confusion Matrix (per field) | Phase 7 | Seaborn heatmap | [ ] |
+| 14 | Deployment Diagram (Express → FastAPI → ChromaDB) | Phase 8 | Draw.io | [ ] |
+| 15 | Future Roadmap / Gantt-style Timeline | Phase 9 | Draw.io / simple table | [ ] |
+---
+## Phase 1 — Front Matter
+**Est. time: 1–2 hrs | No diagrams needed**
+- [ ] Title Page
+  - Project: VGEC RAG Chatbot
+  - Subtitle: Retrieval-Augmented Generation System for Academic Queries
+  - Name, Roll No., Department, Submission Date
+  - Guide name, College name
+- [ ] Abstract (150–200 words)
+  - Problem: Students struggle to find accurate VGEC info scattered across the website
+  - Solution: RAG-based chatbot with hierarchical classification + hybrid retrieval
+  - Key results: MRR, noise reduction *(fill placeholders after deployment)*
+  - Tech: FastAPI, ChromaDB, Gemini, Logistic Regression classifier
+- [ ] Table of Contents *(auto-generate at end — structure it now)*
+- [ ] List of Figures *(auto-generate at end)*
+- [ ] List of Abbreviations
+  - RAG, BM25, RRF, LLM, MRR, API, VGEC, HOD, etc.
+---
+## Phase 2 — Introduction
+**Est. time: 2–3 hrs | Diagrams needed: System Context Diagram (Diagram #8)**
+- [ ] 2.1 Background
+  - Current state: Static website, PDFs, manual queries to admin office
+  - Pain points: Information scattered, no natural language interface
+- [ ] 2.2 Problem Statement
+  - Lack of intelligent query system for institutional data
+  - Need for domain-specific (VGEC) accurate retrieval
+- [ ] 2.3 Objectives
+  - Build RAG pipeline with >75% MRR
+  - Implement metadata classification for pre-filtering
+  - Provide REST API for frontend integration
+  - Deploy with a secure Express gateway
+- [ ] 2.4 Scope
+  - **In scope:** Department data (faculty, labs, syllabus, HOD, intake), REST API, classification, evaluation
+  - **Out of scope:** Real-time website scraping, admissions processing, multimedia
+> **Reuse from:** `CODEBASE_DOCUMENTATION.md` Section 1 (Project Overview)
+---
+## Phase 3 — Literature Review / Related Work
+**Est. time: 2–3 hrs | Diagrams needed: Evolution timeline (simple horizontal flow)**
+- [ ] 3.1 Traditional Chatbots
+  - Rule-based (ALICE, ELIZA) — rigid, no context
+  - Keyword matching chatbots — no semantic understanding
+- [ ] 3.2 Modern RAG Systems
+  - OpenAI GPT-4 + vector DB (generic, not domain-specific)
+  - LlamaIndex / LangChain baseline RAG — no metadata filtering
+- [ ] 3.3 Hybrid Search Systems
+  - Elasticsearch (BM25 only), Cohere (vector only)
+  - RRF as the standard fusion method (reference paper)
+- [ ] 3.4 Your Differentiation
+  - Hierarchical classifier (Type→Category→Topic→Intent) for pre-filtering
+  - Hybrid retrieval (BM25 + Vector + RRF) vs pure semantic search
+  - Domain-specific ingestion strategy (intent-aware JSON chunking)
+---
+## Phase 4 — System Analysis & Requirements
+**Est. time: 3–4 hrs | Diagrams needed: Use Case Diagram (#7), Level 1 DFD**
+- [ ] 4.1 Functional Requirements
+  - FR1: Ingest structured JSON and unstructured documents (PDF, MD, TXT)
+  - FR2: Classify queries into metadata filters (type, category, topic, intent)
+  - FR3: Retrieve relevant chunks with configurable similarity threshold
+  - FR4: Generate contextual answers using Gemini or local LLM
+  - FR5: Provide CRUD operations on vector store via REST API
+  - FR6: Rate-limit and authenticate requests via Express gateway
+- [ ] 4.2 Non-Functional Requirements
+  - Performance: <5s response (cloud), <30s (local LLM)
+  - Accuracy: MRR >0.75
+  - Security: Admin routes protected by JWT, Python API never publicly exposed
+  - Scalability: Support 10,000+ chunks in ChromaDB
+- [ ] 4.3 Use Case Diagram *(Diagram #7)*
+  - Actors: Student, Faculty, Admin
+  - Student use cases: Submit query, View answer, View references
+  - Admin use cases: Ingest document, Delete document, Run evaluation, Change settings
+- [ ] 4.4 Level 1 DFD
+  - Major processes: Ingest, Classify, Retrieve, Generate, Evaluate
+---
+## Phase 5 — System Design
+**Est. time: 4–6 hrs | MOST MARKS, MOST DIAGRAMS**
+**Diagrams needed: #1, #2, #3, #4, #5, #6**
+> **Reuse heavily from:** `CODEBASE_DOCUMENTATION.md` Sections 2, 3, 4
+- [ ] 5.1 Architecture Design
+  - [ ] High-Level Component Diagram *(Diagram #1)*
+  - [ ] Data Flow — Ingestion Path *(Diagram #3)*
+  - [ ] Data Flow — Query Path *(Diagram #2)*
+  - [ ] Technology Stack Table (already in CODEBASE_DOCUMENTATION.md Section 1)
+- [ ] 5.2 Database Design
+  - [ ] Vector DB Metadata Schema (field table — already in CODEBASE_DOCUMENTATION.md Section 3)
+  - [ ] Source JSON Schema (already documented)
+  - [ ] File Tracking Registry Schema (FileService JSON records)
+- [ ] 5.3 Algorithm Design
+  - [ ] Hierarchical Taxonomy Tree *(Diagram #4)* (Type → Category → Topic → Intent)
+  - [ ] Filter Decision Flowchart *(Diagram #5)* (confidence thresholds → Strict/Partial/Fallback)
+  - [ ] Hybrid Retrieval Sequence *(Diagram #6)* (Vector → BM25 → RRF formula → Boost → Threshold)
+  - [ ] Chunking Strategy (JSON intent-aware vs RecursiveCharacterTextSplitter)
+  - [ ] RRF Formula — document with the actual equation:
+    ```
+    score(d) = bm25_weight * 1/(rrf_k + rank_bm25)
+             + vector_weight * 1/(rrf_k + rank_vec)
+    ```
+- [ ] 5.4 Interface Design
+  - [ ] API Endpoint Table — /rag and /vector routes (already in CODEBASE_DOCUMENTATION.md Section 5)
+  - [ ] Request/Response JSON examples (sample curl or Postman output)
+  - [ ] Express Gateway design (rate limit + auth + concurrency queue)
+---
+## Phase 6 — Implementation
+**Est. time: 2–3 hrs | Diagrams needed: Directory tree, Class Diagram (#9), Activity Diagram (#10)**
+> **Reuse heavily from:** `CODEBASE_DOCUMENTATION.md` Section 5 and Section 8
+- [ ] 6.1 Directory Structure (already in CODEBASE_DOCUMENTATION.md Section 8)
+- [ ] 6.2 Module Descriptions (already in CODEBASE_DOCUMENTATION.md Section 5)
+- [ ] 6.3 Key Code Snippets *(do NOT paste full files — only algorithm excerpts)*
+  - [ ] Filter construction logic (`_build_filter` method)
+  - [ ] RRF scoring loop
+  - [ ] Intent-aware JSON chunking (`handle_json_docs`)
+  - [ ] Classifier prediction + threshold gating
+- [ ] 6.4 Configuration
+  - [ ] `.env` variables table (already in CODEBASE_DOCUMENTATION.md Section 5)
+  - [ ] Hyperparameter table (BM25 weights, thresholds, chunk size)
+- [ ] 6.5 Express Gateway Implementation
+  - [ ] Rate limiting configuration
+  - [ ] JWT auth middleware snippet
+  - [ ] Concurrency queue (`p-limit`) snippet
+---
+## Phase 7 — Testing & Evaluation
+**Est. time: 3–4 hrs | Diagrams needed: #11 (MRR bar chart), #12 (noise chart), #13 (confusion matrix)**
+> ⚠️ PLACEHOLDER — fill real numbers and screenshots AFTER deployment
+- [ ] 7.1 Test Plan
+  - [ ] Unit tests: Classifier accuracy per field (run `/test_classifier_dataset`)
+  - [ ] Integration tests: End-to-end hybrid query
+  - [ ] Performance: Measure average latency (cloud vs local)
+- [ ] 7.2 Results
+  - [ ] Comparison Table: Traditional pure-vector RAG vs Your Hybrid RAG
+    - Metrics: MRR, Hit Rate, Top-1 Hit Rate, Noise Rate, Latency
+  - [ ] MRR Bar Chart by query intent type *(Diagram #11)*
+  - [ ] Noise Rate comparison *(Diagram #12)*
+  - [ ] Classifier Confusion Matrix per field *(Diagram #13)*
+- [ ] 7.3 Sample Query Demonstrations
+  - Choose 3–5 representative queries, show:
+    - Input question
+    - Classifier output (type, category, topic, intent + confidences)
+    - Retrieved chunks with scores
+    - Final LLM answer
+---
+## Phase 8 — Deployment
+**Est. time: 1–2 hrs | Diagrams needed: Deployment diagram (#14)**
+> ⚠️ PLACEHOLDER — fill AFTER actual deployment
+- [ ] 8.1 System Requirements
+  - Hardware: 8GB RAM, 4-core CPU (local LLM) OR Google API key (Gemini)
+  - Software: Python 3.9+, Node.js 18+, ChromaDB
+- [ ] 8.2 Deployment Architecture *(Diagram #14)*
+  - Frontend → Express Gateway → FastAPI → ChromaDB
+- [ ] 8.3 Installation Steps
+  - Clone → `pip install -r requirements.txt` → Set `.env` → Run ingestion → Start API
+  - Express: `npm install` → Set `.env` → `node server.js`
+- [ ] 8.4 Screenshots *(fill after deployment)*
+  - [ ] Swagger UI (`/docs`)
+  - [ ] Sample chatbot interaction
+  - [ ] Admin panel
+  - [ ] Classification test panel
+---
+## Phase 9 — Future Scope & Conclusion
+**Est. time: 1–2 hrs | Diagrams needed: Roadmap (#15)**
+- [ ] 9.1 Future Enhancements
+  - Dynamic LLM switching via admin UI (ModelManager architecture)
+  - Cross-encoder re-ranking step (once resources become available)
+  - Query result caching layer
+  - Automated metadata prediction during ingestion (classifier-assisted)
+  - Website scraping for real-time data updates
+- [ ] 9.2 Known Limitations (already in CODEBASE_DOCUMENTATION.md Section 7)
+  - Local LLM latency (CPU-bound, no GPU)
+  - BM25 corpus rebuilt per request
+  - No real-time data — static knowledge base
+- [ ] 9.3 Conclusion
+  - Successfully built domain-specific RAG with hybrid retrieval
+  - Hierarchical classification reduces noise and improves precision
+  - Secure deployment with Express gateway protects the inference server
+---
+## Phase 10 — References & Appendices
+**Est. time: 1–2 hrs | No diagrams needed**
+- [ ] 10.1 References
+  - LangChain documentation
+  - ChromaDB documentation
+  - Original RRF paper (Cormack et al., 2009)
+  - Gemini API documentation
+  - VGEC official website (data source)
+  - BM25 (Robertson & Zaragoza, 2009)
+  - Sentence Transformers (Reimers & Gurevych, 2019)
+- [ ] 10.2 Appendix A — MASTER_INDEX full taxonomy
+- [ ] 10.3 Appendix B — Full API documentation (export from Swagger `/docs`)
+- [ ] 10.4 Appendix C — Sample classifier training data
+- [ ] 10.5 Appendix D — Sample department JSON format
+---
+## Execution Timeline
+| Phase | When | Priority |
+|---|---|---|
+| **All Diagrams** | Start NOW (before writing prose) | 🔴 Critical |
+| Phase 1–3 (Intro, Lit Review) | Day 1 | Must have |
+| Phase 4–5 (Design) | Day 2–3 | 🔴 Critical — most marks |
+| Phase 6 (Implementation) | Day 4 | Must have |
+| Phase 7 (Testing) | After deployment — Day 5 | 🔴 Critical — proof |
+| Phase 8 (Deployment) | After deployment | Must have |
+| Phase 9–10 (Future, Refs) | Day 6 | Finish strong |
+| Final PDF export + proofread | Last | Required |
+---
+## Reuse Map — What's Already Written
+| Documentation Section | Already in |
+|---|---|
+| System Architecture (components, data flow) | `CODEBASE_DOCUMENTATION.md` Section 2 |
+| Tech Stack Table | `CODEBASE_DOCUMENTATION.md` Section 1 |
+| Metadata Schema / Taxonomy | `CODEBASE_DOCUMENTATION.md` Section 3 |
+| Retrieval Pipeline steps | `CODEBASE_DOCUMENTATION.md` Section 4 |
+| All class/method descriptions | `CODEBASE_DOCUMENTATION.md` Section 5 |
+| Metrics definitions | `CODEBASE_DOCUMENTATION.md` Section 6 |
+| Known Limitations | `CODEBASE_DOCUMENTATION.md` Section 7 |
+| File Structure Tree | `CODEBASE_DOCUMENTATION.md` Section 8 |

Dockerfile ADDED Viewed

	@@ -0,0 +1,66 @@

+# ─────────────────────────────────────────────────────────────────────────────
+# VGEC RAG Chatbot — Dockerfile for Hugging Face Spaces
+# ─────────────────────────────────────────────────────────────────────────────
+# HF Spaces requirements:
+#   • Port MUST be 7860
+#   • GOOGLE_API_KEY must be set as a Space Secret in HF UI
+# ─────────────────────────────────────────────────────────────────────────────
+FROM python:3.11-slim
+# ── System dependencies ───────────────────────────────────────────────────────
+# build-essential  → needed by chromadb (hnswlib C extension)
+# libgomp1         → needed by sentence-transformers / scikit-learn OpenMP
+# git              → needed by some pip packages that install from git
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential \
+    libgomp1 \
+    git \
+    && rm -rf /var/lib/apt/lists/*
+# ── Working directory ─────────────────────────────────────────────────────────
+WORKDIR /app
+# ── Python dependencies ───────────────────────────────────────────────────────
+# Copy requirements first so Docker caches this layer separately from source code.
+# Any requirements change rebuilds from here; source code changes don't.
+COPY requirements.txt .
+# Install CPU-only PyTorch FIRST (prevents pip from pulling 2+ GB GPU wheels
+# when sentence-transformers later requests torch as a dependency).
+RUN pip install --no-cache-dir \
+    torch==2.2.2 \
+    --index-url https://download.pytorch.org/whl/cpu
+# Install the rest of the requirements.
+# llama-cpp-python is intentionally excluded — Gemini-only deployment.
+RUN pip install --no-cache-dir -r requirements.txt
+# Download the spaCy English model at build time so it's baked into the image.
+RUN python -m spacy download en_core_web_sm
+# ── Application source ────────────────────────────────────────────────────────
+COPY . .
+# ── Environment variables ─────────────────────────────────────────────────────
+# Tell Python not to buffer stdout/stderr (so logs appear in real time on HF).
+ENV PYTHONUNBUFFERED=1
+ENV PYTHONDONTWRITEBYTECODE=1
+# LLM mode — overrides the config.py default; HF Spaces will use Gemini API.
+# GOOGLE_API_KEY is NOT set here — it must be added as a HF Space Secret.
+ENV LLM_PROVIDER=gemini
+ENV ENABLE_FALLBACK=false
+# Point sentence-transformers cache inside /app so it's predictable.
+ENV SENTENCE_TRANSFORMERS_HOME=/app/ml_models/embeddings
+ENV HF_HOME=/app/.cache/huggingface
+# ── Port ──────────────────────────────────────────────────────────────────────
+# HF Spaces requires exactly port 7860.
+EXPOSE 7860
+# ── Startup ───────────────────────────────────────────────────────────────────
+# No --reload (dev-only flag).
+# --workers 1 keeps RAM usage predictable on the free tier (2 vCPU, 16 GB RAM).
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1"]

LOCAL_MODEL_TRUNCATION_FIX.md ADDED Viewed

	@@ -0,0 +1,136 @@

+# Local Model Truncation Fix
+## 🐛 Problem
+The local model was cutting off responses mid-sentence, like:
+```
+"...applications for various purposes such as"
+```
+## 🔍 Root Cause
+The `ChatLlamaCpp` model configuration was **missing the `max_tokens` parameter**.
+Without this parameter:
+- The model defaults to a very low token generation limit
+- Responses get truncated mid-sentence
+- No warning or error is shown
+## ✅ Solution Applied
+### 1. Added `max_tokens` to Local Model Configuration
+**File:** `app/utils/model_factory.py`
+**Before:**
+```python
+model = ChatLlamaCpp(
+    model_path=str(model_file),
+    n_ctx=4096,
+    n_batch=512,
+    n_threads=4,
+    temperature=0.05,
+    # ❌ Missing max_tokens!
+)
+```
+**After:**
+```python
+model = ChatLlamaCpp(
+    model_path=str(model_file),
+    n_ctx=4096,
+    n_batch=512,
+    n_threads=4,
+    max_tokens=settings.local_max_tokens,  # ✅ FIXED!
+    temperature=0.05,
+)
+```
+### 2. Increased Gemini Token Limit
+**Before:** `max_output_tokens=512` (too low)
+**After:** `max_output_tokens=settings.max_output_tokens` (2048)
+### 3. Made Settings Configurable
+**File:** `app/core/config.py`
+Added:
+```python
+# Generation Settings
+max_output_tokens: int = 2048  # Max tokens for Gemini responses
+local_max_tokens: int = 2048   # Max tokens for local model responses
+```
+## 📊 Impact
+### Before:
+- **Gemini**: 512 max tokens (~350-400 words)
+- **Local**: Unknown (probably ~100-200 tokens)
+- **Result**: Truncated responses
+### After:
+- **Gemini**: 2048 max tokens (~1400-1500 words)
+- **Local**: 2048 max tokens (~1400-1500 words)
+- **Result**: Complete, full responses ✅
+## 🎯 Expected Behavior Now
+1. **Local model should complete sentences** instead of cutting off
+2. **Responses can be up to ~1500 words** before hitting the limit
+3. **Both models have equal response length capacity**
+## ⚙️ How to Adjust
+If you want even longer responses, edit `app/core/config.py`:
+```python
+# For longer responses (up to ~3000 words)
+max_output_tokens: int = 4096
+local_max_tokens: int = 4096
+# For shorter responses (to save processing time)
+max_output_tokens: int = 1024
+local_max_tokens: int = 1024
+```
+## 🧪 Test It
+Try asking the same question again. The local model should now:
+1. ✅ Complete full sentences
+2. ✅ Provide detailed answers
+3. ✅ Not cut off mid-word
+## 📝 Additional Notes
+### Why 2048 tokens?
+- Good balance between completeness and speed
+- Covers most Q&A scenarios
+- Prevents overly long responses
+### What is a "token"?
+- A token ≈ 0.75 words on average
+- 2048 tokens ≈ 1500 words
+- 4096 tokens ≈ 3000 words
+### Parameters Explained:
+- `n_ctx=4096`: Total context window (input + output)
+- `max_tokens=2048`: Maximum output only
+- This means: max ~2048 input + 2048 output = 4096 total
+### Other Fixes Applied:
+- Added comments to all parameters for clarity
+- Made token limits configurable via settings
+- Ensured both models have consistent behavior

MARKDOWN_FIX_SUMMARY.md ADDED Viewed

	@@ -0,0 +1,118 @@

+# Markdown Splitting Fix - Summary
+## Problem
+The markdown files with `---` section delimiters were being split at every `#` header, creating many small chunks with insufficient context.
+### Example Issue:
+```
+# Faculty of the Information & Communication Technology Department
+```
+This header alone was becoming a separate chunk because the default markdown splitter splits on headers.
+## Solution Implemented
+### 1. Created New Splitter Method: `for_markdown_with_sections()`
+**Location:** `app/services/text_splitter.py`
+**Custom Separators Priority:**
+1. `\n---\n` - Section delimiters (HIGHEST PRIORITY)
+2. `\n\n\n` - Triple newlines
+3. `\n\n` - Paragraphs
+4. `\n` - Single newlines
+5. `. ` - Sentences
+6. ` ` - Words
+7. `""` (empty string) - Characters (last resort)
+This ensures sections stay together and headers aren't split separately.
+### 2. Updated RAG Service
+**Location:** `app/services/rag_service.py` (lines 77-82)
+**Changed from:**
+```python
+markdown_splitter = self.text_splitter.for_markdown(
+    chunk_size=chunk_size,
+    chunk_overlap=chunk_overlap
+)
+```
+**Changed to:**
+```python
+markdown_splitter = TextSplitter.for_markdown_with_sections(
+    chunk_size=chunk_size,
+    chunk_overlap=chunk_overlap
+)
+```
+### 3. Updated Document Helpers
+**Location:** `app/utils/document_helpers.py` (lines 161-167)
+Added auto-detection for markdown with sections:
+```python
+# Use section-aware splitter if text contains markdown section delimiters
+if "\n---\n" in text or text.startswith("---\n"):
+    splitter = TextSplitter.for_markdown_with_sections()
+else:
+    splitter = TextSplitter()
+```
+## Expected Results
+### Before (with `for_markdown()`):
+- **Many small chunks** - Headers split separately
+- Example: "# Faculty..." becomes its own 50-character chunk
+- Poor context for RAG retrieval
+### After (with `for_markdown_with_sections()`):
+- **Fewer, more meaningful chunks** - Sections kept together
+- Headers stay with their content
+- Better context for RAG retrieval
+- Reduced number of chunks overall
+## How to Use
+### For File Upload (Already Applied):
+When you upload a `.md` file via the POST endpoint, it will automatically:
+1. Detect it's a markdown file
+2. Use `for_markdown_with_sections()` splitter
+3. Keep sections together
+### For Raw Text Upload:
+When posting raw text with `---` delimiters:
+1. The system auto-detects section delimiters
+2. Applies the section-aware splitter
+3. Preserves semantic structure
+## Configuration
+You can still adjust chunk size in `app/core/config.py`:
+```python
+chunk_size: int = 768  # Adjust as needed
+chunk_overlap: int = 200  # Adjust overlap
+```
+## Next Steps
+Try uploading your markdown file again. You should see:
+- ✅ Fewer total chunks
+- ✅ Each chunk contains header + related content
+- ✅ Better semantic coherence
+- ✅ Improved RAG retrieval quality

README.md CHANGED Viewed

@@ -1,10 +1,65 @@
----
-title: Vgecbot
-emoji: 🦀
-colorFrom: pink
-colorTo: red
-sdk: docker
-pinned: false
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

+# RAG (Retrieval-Augmented Generation) Project
+## Services
+### Available Services
+1. **Document Loader** (`services.document_loader`)
+   - Load PDF documents
+   - Support for single and multiple file loading
+   - Lazy loading support
+2. **Vector Store** (`services.VectorStore`)
+   - Similarity search
+   - Document management (add, update, delete)
+   - Metadata filtering
+3. **Text Splitter** (`services.TextSplitter`) ✅
+   - Recursive character text splitting
+   - Language-specific splitting (20+ languages)
+   - See [docs/TEXT_SPLITTER.md](docs/TEXT_SPLITTER.md) for full documentation
+4. **RAG Service** (`services.RAGService`) ✅ **NEW**
+   - Integrates Document Loader, Text Splitter, Vector Store
+   - Powered by **Google Gemini** LLM
+   - Creates a complete RAG pipeline with retrieval & generation
+## Quick Start
+```python
+from services import document_loader, TextSplitter, VectorStore
+from libs import ROOT_PATH
+# Load documents
+pdf_path = ROOT_PATH / "document.pdf"
+doc_obj = document_loader(filepath=pdf_path)
+documents = doc_obj.load()
+# Split into chunks
+splitter = TextSplitter(chunk_size=1000, chunk_overlap=200)
+chunks = splitter.split_documents(documents)
+# Add to vector store
+# vector_store.add_documents(chunks)
+```
+## Examples
+Run the TextSplitter examples:
+```bash
+python examples_text_splitter.py
+```
+## Tasks
+- [x] Document Loader
+- [ ] Multiple PDF loader
+- [ ] if txt then txt loader
+- [ ] preprocessing
+  - [ ] stop_words removal
+  - [ ] punctuations
+  - [ ] lowercasing
+  - [ ] lemmatization
+- [x] Recursive TextSplitter ✅
+- [ ] Assign Them Metadata properly!

WHY_LOCAL_NOT_WORKING.md ADDED Viewed

	@@ -0,0 +1,112 @@

+# Why Local Model Isn't Working - Diagnosis
+## 🐛 Problems Found:
+### 1. **LRU Cache Keeps Old Model** (PRIMARY ISSUE)
+**File:** `app/api/dependencies.py` (line 13)
+```python
+@lru_cache()  # ❌ This caches the RAG service FOREVER
+def get_rag_service() -> RAGService:
+    llm_model = get_llm_model()  # Model initialized ONCE
+    ...
+```
+**Impact:**
+- Model is loaded when server FIRST starts
+- Even if you change config, the OLD model stays in memory
+- `@lru_cache()` never clears until server is fully restarted
+- Auto-reload doesn't clear the cache!
+### 2. **Missing Fallback Trigger**
+**File:** `app/utils/model_factory.py`
+The fallback logic EXISTS but it's not being triggered because:
+- The Gemini model initialization happens at startup (cached)
+- The error happens during model.invoke() (at query time)
+- But fallback only works during get_llm_model() (at init time)
+### 3. **Missing max_output_tokens** (FIXED)
+You deleted it from config.py which caused AttributeError.
+✅ I restored it.
+## ✅ Solutions:
+### **Quick Fix: Full Server Restart**
+Stop the server completely (Ctrl+C) and start it again:
+```bash
+# Kill the server
+Ctrl + C
+# Restart
+uvicorn main:app --reload
+```
+This will clear the LRU cache and load the local model.
+### **Permanent Fix: Remove or Fix LRU Cache**
+You have 2 options:
+#### Option A: Remove LRU Cache (Simplest)
+Models will be reinitialized on each request (slightly slower but settings-aware):
+```python
+# Remove @lru_cache()
+def get_rag_service() -> RAGService:
+    logger.info("Initializing RAG service...")
+    llm_model = get_llm_model()
+    ...
+```
+#### Option B: Make Cache Settings-Aware
+Cache based on current settings:
+```python
+def get_rag_service_key():
+    return (settings.llm_provider, settings.gemini_model_name, settings.local_model_name)
+@lru_cache(maxsize=2)
+def _cached_llm_model(provider, gemini_name, local_name):
+    return get_llm_model(provider)
+def get_rag_service() -> RAGService:
+    key = get_rag_service_key()
+    llm_model = _cached_llm_model(*key)
+    ...
+```
+### **Better Fix: Dynamic Model Loading**
+Make the RAG service check settings on each request and switch models if needed.
+## 📋 Action Items:
+1. ✅ **Fixed:** Restored `max_output_tokens` in config.py
+2. ⚠️ **TODO:** Full server restart (Ctrl+C then restart)
+3. ⚠️ **TODO:** Test with local model
+4. ⚠️ **TODO:** Consider removing `@lru_cache()` from dependencies.py
+## What's Happening Now:
+Right now, your server has:
+- ✅ config.py says `llm_provider = "local"`
+- ✅ max_output_tokens restored
+- ❌ BUT old Gemini model still in memory (cached)
+- ❌ Fallback can't help because model is already loaded
+**The cached Gemini model is still being used for all requests!**
+## 🎯 Next Step:
+**RESTART THE SERVER** (full stop + start, not just reload)

app/__init__.py ADDED Viewed

File without changes

app/api/__init__.py ADDED Viewed

	@@ -0,0 +1,5 @@

+# We removed the top-level imports of routes here to prevent circular dependencies.
+# This file now only provides the base structure if needed.
+# If you want to use the api_router elsewhere, import it and register routes
+# in the file where you initialize the FastAPI app (main.py).

app/api/dependencies.py ADDED Viewed

	@@ -0,0 +1,41 @@

+"""
+FastAPI dependencies for dependency injection.
+"""
+from functools import lru_cache
+# from app.services.rag_service import RAGService  # MOVED INSIDE FUNCTION TO PREVENT CIRCULAR IMPORT
+from app.utils.model_factory import get_llm_model, get_embedding_model, get_local_model
+from app.core.config import settings
+import logging
+logger = logging.getLogger(__name__)
+@lru_cache()
+def get_rag_service():
+    from app.services.rag_service import RAGService
+    """
+    Get RAG service instance (singleton).
+    This is cached so the same instance is reused across requests.
+    Models are initialized once and reused.
+    Returns:
+        RAGService: Configured RAG service
+    """
+    logger.info("Initializing RAG service...")
+    # Initialize models
+    llm_model = get_llm_model()
+    embedding_model = get_embedding_model()
+    # Create RAG service
+    rag_service = RAGService(
+        model=llm_model,
+        collection_name=settings.collection_name,
+        persist_directory=settings.persist_directory,
+        embedding_model=embedding_model,
+        k=settings.similarity_top_k
+    )
+    logger.info("RAG service initialized successfully")
+    return rag_service

app/api/routes/__init__.py ADDED Viewed

	@@ -0,0 +1,5 @@

+from .rag import router as rag_router
+from .vector_store import router as vector_router
+from .settings import router as settings_router
+__all__ = ["rag_router", "vector_router", "settings_router"]

app/api/routes/rag.py ADDED Viewed

	@@ -0,0 +1,186 @@

+from fastapi import APIRouter, Depends, HTTPException
+from app.api.schemas.requests import RAGRequest
+from app.api.dependencies import get_rag_service
+from app.api.schemas.tests import TestRequestSchema, TestClassifierReqSchema
+from app.services.rag_service import RAGService
+from app.utils.tests import classifier_test_dataset
+import logging
+logger = logging.getLogger(__name__)
+router = APIRouter()
+@router.get("/")
+def health_check():
+    """Health check endpoint."""
+    return {"status": "healthy", "service": "RAG"}
+@router.post("/")
+def query_rag(
+    request: RAGRequest,
+    rag_service: RAGService = Depends(get_rag_service)  # ✅ Dependency injection!
+):
+    """
+    Query the RAG system with a question.
+    Args:
+        request: RAG request with question and history
+        rag_service: Injected RAG service instance
+    Returns:
+        Answer with references and metadata
+    """
+    try:
+        response = rag_service.query(
+            question=request.question,
+            history=request.history or [],
+            k=request.k,
+            threshold=request.threshold,
+            include_llm_response=request.include_llm_response
+        )
+        return response
+    except Exception as e:
+        logger.error(f"RAG query failed: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+@router.post("/hybrid_query")
+def hybrid_query(
+    request: RAGRequest,
+    rag_service: RAGService = Depends(get_rag_service)  # ✅ Dependency injection!
+):
+    """
+    Query the RAG system with a question.
+    Args:
+        request: RAG request with question and history
+        rag_service: Injected RAG service instance
+    Returns:
+        Answer with references and metadata
+    """
+    try:
+        response = rag_service.hybrid_query(
+            question=request.question,
+            history=request.history or [],
+            k=request.k,
+            threshold=request.threshold,
+            include_llm_response=request.include_llm_response
+        )
+        return response
+    except Exception as e:
+        logger.error(f"RAG query failed: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+@router.post("/test")
+def test_queries(
+    request: TestRequestSchema,
+    query_delay: float = 1.0,           # seconds between queries (Gemini 100 RPM limit)
+    rag_service: RAGService = Depends(get_rag_service)
+):
+    """
+    Run batch retrieval evaluation.
+    - query_delay: sleep between queries to respect Gemini embedding rate limit.
+      Free tier = 100 RPM → 1.0s delay safe for up to 150 queries (~2.5 min).
+      Set to 0.0 to disable (only if you have a paid API key).
+    """
+    try:
+        response = rag_service.test_queries(
+            tests=request,
+            query_delay=query_delay
+        )
+        return response
+    except Exception as e:
+        logger.error(f"Test Execution failed: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+@router.post("/test_classifier")
+def test_classifier(
+    request: TestClassifierReqSchema,
+    rag_service: RAGService = Depends(get_rag_service)
+):
+    try:
+        if(request.tests is None):
+            raise HTTPException(status_code=400, detail="No tests provided")
+        response = rag_service.test_classifier(
+            tests=request
+        )
+        return response
+    except Exception as e:
+        logger.error(f"Test classifier Execution failed: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+@router.get("/test_classifier_dataset")
+def test_classifier_dataset(
+    rag_service: RAGService = Depends(get_rag_service)
+):
+    try:
+        if(len(rag_service.evaluation.keys()) > 0):
+            return rag_service.evaluation
+        req = TestClassifierReqSchema(tests=classifier_test_dataset)
+        response = rag_service.test_classifier(
+            tests=req
+        )
+        rag_service.evaluation = response["evaluation"]
+        return rag_service.evaluation
+    except Exception as e:
+        logger.error(f"Test classifier Execution failed: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+@router.post("/similarity_search")
+def similarity_search(
+    request: RAGRequest,
+    rag_service: RAGService = Depends(get_rag_service)  # ✅ Dependency injection!
+):
+    """
+    Query the RAG system with a question.
+    Args:
+        request: RAG request with question and history
+        rag_service: Injected RAG service instance
+    Returns:
+        Answer with references and metadata
+    """
+    try:
+        response = rag_service.hybrid_query(
+            question=request.question,
+            history=request.history or [],
+            k=request.k,
+            threshold=request.threshold,
+            include_llm_response=False,
+        )
+        return response
+    except Exception as e:
+        logger.error(f"RAG query failed: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+@router.post("/search")
+def search(
+    request: RAGRequest,
+    rag_service: RAGService = Depends(get_rag_service)  # ✅ Dependency injection!
+):
+    """
+    Query the RAG system with a question.
+    Args:
+        request: RAG request with question and history
+        rag_service: Injected RAG service instance
+    Returns:
+        Answer with references and metadata
+    """
+    try:
+        response = rag_service.search_docs(
+            question=request.question,
+            k=request.k
+        )
+        return response
+    except Exception as e:
+        logger.error(f"RAG query failed: {e}")
+        raise HTTPException(status_code=500, detail=str(e))

app/api/routes/settings.py ADDED Viewed

	@@ -0,0 +1,186 @@

+from fastapi import APIRouter, HTTPException, status
+from app.core.config import settings
+from app.api.schemas.settings import SettingsUpdate, SettingsResponse
+import logging
+logger = logging.getLogger(__name__)
+router = APIRouter()
+@router.get("/", response_model=SettingsResponse)
+def get_settings():
+    """
+    Get current application settings.
+    Returns all configurable settings including RAG parameters,
+    model configuration, and API settings.
+    """
+    return SettingsResponse(
+        # Paths (read-only)
+        root_path=str(settings.root_path),
+        model_path=str(settings.model_path),
+        data_path=str(settings.data_path),
+        # API Settings
+        api_title=settings.api_title,
+        api_version=settings.api_version,
+        cors_origins=settings.cors_origins,
+        # RAG Settings
+        chunk_size=settings.chunk_size,
+        chunk_overlap=settings.chunk_overlap,
+        similarity_top_k=settings.similarity_top_k,
+        similarity_threshold=settings.similarity_threshold,
+        collection_name=settings.collection_name,
+        persist_directory=settings.persist_directory,
+        # Model Settings
+        llm_provider=settings.llm_provider,
+        enable_fallback=settings.enable_fallback,
+        embedding_model_name=settings.embedding_model_name,
+        gemini_model_name=settings.gemini_model_name,
+        local_model_name=settings.local_model_name,
+    )
+@router.patch("/", response_model=SettingsResponse)
+def update_settings(updates: SettingsUpdate):
+    """
+    Update application settings at runtime.
+    Only provided fields will be updated. Omitted fields remain unchanged.
+    **Note:** Changes are runtime-only and will be lost on server restart.
+    To persist changes, update the `.env` file.
+    **Warning:** Some changes (like CORS origins) may require server restart
+    to take full effect.
+    """
+    updated_fields = []
+    # Update RAG settings
+    if updates.chunk_size is not None:
+        settings.chunk_size = updates.chunk_size
+        updated_fields.append("chunk_size")
+    if updates.chunk_overlap is not None:
+        settings.chunk_overlap = updates.chunk_overlap
+        updated_fields.append("chunk_overlap")
+    if updates.similarity_top_k is not None:
+        settings.similarity_top_k = updates.similarity_top_k
+        updated_fields.append("similarity_top_k")
+    if updates.similarity_threshold is not None:
+        settings.similarity_threshold = updates.similarity_threshold
+        updated_fields.append("similarity_threshold")
+    # Update Model settings
+    if updates.llm_provider is not None:
+        settings.llm_provider = updates.llm_provider
+        updated_fields.append("llm_provider")
+        logger.info(f"LLM provider changed to: {updates.llm_provider}")
+    if updates.enable_fallback is not None:
+        settings.enable_fallback = updates.enable_fallback
+        updated_fields.append("enable_fallback")
+    if updates.gemini_model_name is not None:
+        settings.gemini_model_name = updates.gemini_model_name
+        updated_fields.append("gemini_model_name")
+    if updates.local_model_name is not None:
+        settings.local_model_name = updates.local_model_name
+        updated_fields.append("local_model_name")
+    # Update API settings
+    if updates.cors_origins is not None:
+        settings.cors_origins = updates.cors_origins
+        updated_fields.append("cors_origins")
+        logger.warning("CORS origins updated. Server restart may be required for full effect.")
+    logger.info(f"Settings updated: {', '.join(updated_fields)}")
+    # Return updated settings
+    return get_settings()
+@router.post("/reset")
+def reset_settings():
+    """
+    Reset all settings to defaults from .env file.
+    This reloads settings from the environment file and discards
+    any runtime changes.
+    **Warning:** This will restart the settings object and may cause
+    temporary service interruption.
+    """
+    try:
+        # Reload settings from .env
+        from app.core.config import Settings
+        new_settings = Settings()
+        # Update the global settings object
+        for key, value in new_settings.dict().items():
+            setattr(settings, key, value)
+        logger.info("Settings reset to defaults from .env")
+        return {
+            "message": "Settings reset to defaults",
+            "status": "success"
+        }
+    except Exception as e:
+        logger.error(f"Failed to reset settings: {e}")
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"Failed to reset settings: {str(e)}"
+        )
+@router.get("/rag")
+def get_rag_settings():
+    """
+    Get only RAG-related settings.
+    Returns chunk sizes, similarity parameters, and vector store configuration.
+    """
+    return {
+        "chunk_size": settings.chunk_size,
+        "chunk_overlap": settings.chunk_overlap,
+        "similarity_top_k": settings.similarity_top_k,
+        "similarity_threshold": settings.similarity_threshold,
+        "collection_name": settings.collection_name,
+        "persist_directory": settings.persist_directory,
+    }
+@router.get("/models")
+def get_model_settings():
+    """
+    Get only model-related settings.
+    Returns LLM provider, model names, and fallback configuration.
+    """
+    return {
+        "llm_provider": settings.llm_provider,
+        "enable_fallback": settings.enable_fallback,
+        "embedding_model_name": settings.embedding_model_name,
+        "gemini_model_name": settings.gemini_model_name,
+        "local_model_name": settings.local_model_name,
+    }
+@router.get("/api")
+def get_api_settings():
+    """
+    Get only API-related settings.
+    Returns API metadata and CORS configuration.
+    """
+    return {
+        "api_title": settings.api_title,
+        "api_version": settings.api_version,
+        "cors_origins": settings.cors_origins,
+    }

app/api/routes/vector_store.py ADDED Viewed

	@@ -0,0 +1,311 @@

+from fastapi import APIRouter, UploadFile, File, HTTPException, Depends
+from langchain_core.documents import Document
+from app.api.dependencies import get_rag_service
+from app.core.config import settings
+from app.api.schemas.requests import (
+    deleteDocs,
+    DocumentType,
+    PaginationParams,
+    DocumentFilters,
+    DeleteFilters,
+)
+from fastapi.responses import JSONResponse
+from pathlib import Path
+from typing import Optional, List, Dict, Any
+from fastapi import Path as Params
+from app.services.rag_service import RAGService
+from app.services.ingestion_service import ingestion_service
+import os
+import shutil
+import math
+# Temp directory for uploaded files before ingestion
+UPLOAD_TEMP_PATH = settings.root_path / "temp"
+router = APIRouter()
+# ---------------------------------------------------------------------------
+# Shared helpers
+# ---------------------------------------------------------------------------
+def _unpack_chroma_result(result: dict):
+    """Unpack a raw ChromaDB result dict into parallel lists."""
+    ids   = result.get("ids", [])
+    docs  = result.get("documents", [])
+    metas = result.get("metadatas", [])
+    return ids, docs, metas
+def _apply_doc_filters(
+    ids: List[str],
+    docs: List[str],
+    metas: List[Dict[str, Any]],
+    filters: DocumentFilters,
+) -> List[Dict]:
+    """Filter a Chroma result set by DocumentFilters and return shaped dicts."""
+    filtered = []
+    for i in range(len(ids)):
+        doc_text = docs[i]
+        meta = metas[i] if metas else {}
+        if filters.filename and meta.get("source_file") != filters.filename:
+            continue
+        if filters.source and meta.get("source") != filters.source:
+            continue
+        if filters.contains and filters.contains.lower() not in doc_text.lower():
+            continue
+        filtered.append({"id": ids[i], "content": doc_text, "metadata": meta})
+    return filtered
+def _save_upload(file: UploadFile) -> Path:
+    """Save an uploaded file to the temp directory and return its path."""
+    UPLOAD_TEMP_PATH.mkdir(parents=True, exist_ok=True)
+    file_path = UPLOAD_TEMP_PATH / file.filename
+    with open(file_path, "wb") as buffer:
+        shutil.copyfileobj(file.file, buffer)
+    return file_path
+# ---------------------------------------------------------------------------
+# GET /filenames  ← must be before GET /{id} to avoid route conflict
+# ---------------------------------------------------------------------------
+@router.get("/filenames")
+def list_filenames(rag_service: RAGService = Depends(get_rag_service)):
+    """Return a list of unique ingested filenames."""
+    return rag_service.get_filenames()
+# ---------------------------------------------------------------------------
+# GET /
+# ---------------------------------------------------------------------------
+@router.get("/")
+def list_documents(
+    params: PaginationParams = Depends(),
+    filters: DocumentFilters = Depends(),
+    rag_service: RAGService = Depends(get_rag_service),
+):
+    """
+    List all documents with pagination and optional filters.
+    Query Parameters:
+    - page: Page number (default: 1)
+    - limit: Items per page (default: 10, max: 100)
+    - order: Sort order – "asc" or "desc" (default: "desc")
+    - filename: Filter by source_file metadata
+    - source: Filter by source metadata path
+    - contains: Filter by text content (case-insensitive)
+    """
+    ids, docs, metas = [], [], []
+    if filters.contains:
+        documents = rag_service.search_docs(
+            question=filters.contains,
+            k=params.limit,
+            filename = filters.filename
+        )
+        filtered = documents
+        print(filtered)
+    else:
+        ids, docs, metas = _unpack_chroma_result(rag_service.database.get())
+        filtered = _apply_doc_filters(ids, docs, metas, filters)
+    # Sort by creation date
+    reverse = params.order == "desc"
+    filtered.sort(
+        key=lambda x: x.get("metadata", {}).get("creationdate", ""),
+        reverse=reverse,
+    )
+    # Paginate
+    total_docs = len(filtered)
+    total_pages = math.ceil(total_docs / params.limit) if total_docs > 0 else 0
+    start = (params.page - 1) * params.limit
+    paginated = filtered[start : start + params.limit]
+    return {
+        "page": params.page,
+        "limit": params.limit,
+        "total_docs": total_docs,
+        "total_pages": total_pages,
+        "order": params.order,
+        "data": paginated,
+        "status": 200,
+    }
+# ---------------------------------------------------------------------------
+# GET /{id}
+# ---------------------------------------------------------------------------
+@router.get("/{id}")
+def get_document(
+    id: str = Params(...),
+    rag_service: RAGService = Depends(get_rag_service),
+):
+    """Fetch a single document by its ChromaDB ID."""
+    if not id:
+        raise HTTPException(status_code=400, detail="Document ID is required")
+    result = rag_service.database.get_by_id(ids=[id])
+    ids, docs, metas = _unpack_chroma_result(result)
+    data = [
+        {"id": ids[i], "document": docs[i], "metadata": metas[i] if metas else {}}
+        for i in range(len(ids))
+    ]
+    return {"data": data, "status": 200}
+# ---------------------------------------------------------------------------
+# POST /  (file upload + ingestion)
+# ---------------------------------------------------------------------------
+SUPPORTED_EXTENSIONS = {".md", ".pdf", ".json", ".txt"}
+@router.post("/")
+def upload_document(
+    file: UploadFile = File(...),
+    title: Optional[str] = None,
+    rag_service: RAGService = Depends(get_rag_service),
+):
+    """
+    Upload and ingest a document file into the vector store.
+    Supported types: .md, .pdf, .json, .txt
+    """
+    file_path = _save_upload(file)
+    ext = file_path.suffix.lower()
+    if ext not in SUPPORTED_EXTENSIONS:
+        file_path.unlink(missing_ok=True)
+        raise HTTPException(status_code=400, detail=f"Unsupported file type: {ext}")
+    docs = rag_service.ingest_documents(file_path)
+    file_path.unlink(missing_ok=True)
+    if not docs:
+        raise HTTPException(status_code=400, detail="No content could be extracted from the file")
+    return JSONResponse({
+        "filename": file.filename,
+        "message": f"{ext} uploaded and ingested successfully",
+        "docs_added": len(docs),
+        "status": 200,
+    })
+# ---------------------------------------------------------------------------
+# PUT /{id}
+# ---------------------------------------------------------------------------
+@router.put("/{id}")
+def update_document(
+    doc: DocumentType,
+    id: str = Params(...),
+    rag_service: RAGService = Depends(get_rag_service),
+):
+    """Update an existing document's content and metadata by ID."""
+    if not id:
+        raise HTTPException(status_code=400, detail="Document ID is required")
+    content = doc.document.strip()
+    if not content:
+        raise HTTPException(status_code=400, detail="Document content cannot be empty")
+    updated_document = Document(
+        page_content=content,
+        metadata={**doc.metadata, "id": id},
+    )
+    rag_service.database.update_document(id, updated_document)
+    return {"status": 200, "message": f"{id} updated successfully"}
+# ---------------------------------------------------------------------------
+# DELETE /ids  (bulk delete by explicit ID list)
+# ---------------------------------------------------------------------------
+@router.delete("/ids")
+def delete_documents_by_ids(
+    body: deleteDocs,
+    rag_service: RAGService = Depends(get_rag_service),
+):
+    """Delete multiple documents by providing an explicit list of IDs."""
+    result = rag_service.database.delete(body.docs)
+    return {"message": "Documents deleted successfully", "deleted": len(body.docs), "result": result}
+# ---------------------------------------------------------------------------
+# DELETE /{id}  (single delete)
+# ---------------------------------------------------------------------------
+@router.delete("/{id}")
+def delete_document(
+    id: str = Params(...),
+    rag_service: RAGService = Depends(get_rag_service),
+):
+    """Delete a single document by its ChromaDB ID."""
+    if not id:
+        raise HTTPException(status_code=400, detail="Document ID is required")
+    result = rag_service.database.delete([id])
+    return {"message": "Document deleted successfully", "deleted": 1, "result": result, "status": 200}
+# ---------------------------------------------------------------------------
+# DELETE /  (filter-based delete)
+# ---------------------------------------------------------------------------
+@router.delete("/")
+def delete_documents_by_filter(
+    filters: DeleteFilters = Depends(),
+    rag_service: RAGService = Depends(get_rag_service),
+):
+    """
+    Delete documents matching filter criteria.
+    Query Parameters:
+    - filename: Delete documents with this source_file value
+    - source: Delete documents with this source path
+    - contains: Delete documents whose text contains this string
+    - dry_run: Preview matching docs without deleting (default: false)
+    """
+    ids, docs, metas = _unpack_chroma_result(rag_service.database.get())
+    delete_ids = []
+    for i in range(len(ids)):
+        doc_text = docs[i]
+        meta = metas[i] if metas else {}
+        if filters.source:
+            stored_source = str(Path(meta.get("source", "")).resolve())
+            input_source  = str(Path(filters.source).resolve())
+            if stored_source != input_source:
+                continue
+        if filters.filename and meta.get("source_file") != filters.filename:
+            continue
+        if filters.contains and filters.contains.lower() not in doc_text.lower():
+            continue
+        delete_ids.append(ids[i])
+    if filters.filename:
+        ingestion_service.delete_record(filters.filename)
+    if not delete_ids:
+        return {"message": "No matching documents found", "deleted": 0}
+    if filters.dry_run:
+        return {
+            "message": "Dry run – no documents deleted",
+            "matched_count": len(delete_ids),
+            "matched_ids": delete_ids,
+        }
+    result = rag_service.database.delete(delete_ids)
+    return {"message": "Documents deleted successfully", "deleted": len(delete_ids), "result": result}

app/api/schemas/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ from .tests import TestResponseSchema, TestRequestSchema

app/api/schemas/requests.py ADDED Viewed

	@@ -0,0 +1,71 @@

+from pydantic import BaseModel, Field
+from typing import List, Optional, Literal, Annotated, Dict, Any
+from app.core.config import settings
+# ✅ Query Parameter Schemas - Clean and Reusable!
+class PaginationParams(BaseModel):
+    """
+    Pagination query parameters for list endpoints.
+    ``page`` is 1-based, ``limit`` is capped at 100 and ``order`` selects
+    ascending or descending sorting.
+    """
+    page: int = Field(default=1, ge=1, description="Page number (starts at 1)")
+    limit: int = Field(default=10, ge=1, le=100, description="Items per page (max 100)")
+    order: Literal["asc", "desc"] = Field(default="desc", description="Sort order")
+class DocumentFilters(BaseModel):
+    """
+    Document filtering query parameters.
+    Every field is optional; a field left as ``None`` applies no filter.
+    """
+    filename: Optional[str] = Field(default=None, description="Filter by exact filename")
+    source: Optional[str] = Field(default=None, description="Filter by source path")
+    contains: Optional[str] = Field(default=None, description="Filter by text content (case-insensitive)")
+class DeleteFilters(BaseModel):
+    """
+    Delete operation filters with dry-run support.
+    Every filter is optional and defaults to ``None``; ``dry_run`` defaults
+    to ``False``.
+    """
+    filename: Optional[str] = Field(default=None, description="Delete documents with this filename")
+    source: Optional[str] = Field(default=None, description="Delete documents from this source")
+    contains: Optional[str] = Field(default=None, description="Delete documents containing this text")
+    dry_run: bool = Field(default=False, description="Preview deletions without executing")
+# Request Body Schemas
+class RAGRequest(BaseModel):
+    """
+    Request schema for RAG query endpoint.
+    The defaults for ``k`` and ``threshold`` are read from ``settings`` once,
+    when this class is defined; later runtime changes to those settings do
+    not alter the schema defaults.
+    """
+    # The user's question; must be non-empty.
+    question: Annotated[str, Field(min_length=1, description="Question that user wants to ask")]
+    # Earlier questions in the conversation; optional, defaults to an empty list.
+    history: Annotated[Optional[List[str]], Field(default=[], description="Previously Asked Questions")]
+    k: int = Field(default=settings.similarity_top_k, ge=1, le=100, description="Number of similar documents to retrieve")
+    threshold: float = Field(default=settings.similarity_threshold, ge=0, le=1, description="Similarity threshold")
+    include_llm_response: bool = Field(default=True, description="Whether to generate LLM answer")
+class Query(BaseModel):
+    """Query result schema."""
+    def __init__(self, question: str, answer: str):
+        self.question = question
+        self.answer = answer
+class deleteDocs(BaseModel):
+    """
+    Request schema for bulk delete by IDs.
+    ``docs`` must contain at least one ID.
+    """
+    docs: Annotated[List[str], Field(min_length=1, description="List of IDs that you want to delete!")]
+class DocumentType(BaseModel):
+    """
+    Document update schema.
+    All three fields are required.
+    """
+    # Document identifier.
+    id: str
+    # Arbitrary metadata key/value pairs.
+    metadata: Dict[str, Any]
+    # The document text.
+    document: str
+class SimilaritySearch(BaseModel):
+    """
+    Search The Best Params for Similarity Search
+    The defaults for ``k`` and ``threshold`` are read from ``settings`` once,
+    when this class is defined.
+    """
+    # The search text (required).
+    query: str
+    k: int = Field(default=settings.similarity_top_k, ge=1, le=100, description="Number of similar documents to retrieve")
+    threshold: float = Field(default=settings.similarity_threshold, ge=0, le=1, description="Similarity threshold")
+    # Earlier questions; optional, defaults to an empty list.
+    history: Annotated[Optional[List[str]], Field(default=[], description="Previously Asked Questions")]
+class TextIngestRequest(BaseModel):
+    """
+    Request schema for raw text ingestion.
+    ``text`` is required and non-empty; ``metadata`` defaults to a fresh
+    empty dict and ``filename`` to ``None``.
+    """
+    text: str = Field(..., min_length=1, description="Raw text content to ingest")
+    metadata: Dict[str, Any] = Field(default_factory=dict, description="Optional metadata (title, source, etc.)")
+    filename: Optional[str] = Field(default=None, description="Virtual filename/source for the document")

app/api/schemas/settings.py ADDED Viewed

	@@ -0,0 +1,54 @@

+from pydantic import BaseModel, Field
+from typing import List, Optional, Literal
+from app.core.config import settings
+class SettingsUpdate(BaseModel):
+    """Schema for updating application settings."""
+    # RAG Settings
+    chunk_size: Optional[int] = Field(default=settings.chunk_size, ge=100, le=5000, description="Text chunk size")
+    chunk_overlap: Optional[int] = Field(default=settings.chunk_overlap, ge=0, le=1000, description="Chunk overlap size")
+    similarity_top_k: Optional[int] = Field(default=settings.similarity_top_k, ge=1, le=20, description="Number of similar docs to retrieve")
+    similarity_threshold: Optional[float] = Field(default=settings.similarity_threshold, ge=0, le=1, description="Similarity threshold")
+    # Model Settings
+    llm_provider: Optional[Literal["gemini", "local"]] = Field(default=settings.llm_provider, description="LLM provider to use")
+    enable_fallback: Optional[bool] = Field(default=settings.enable_fallback, description="Enable fallback to alternate model")
+    gemini_model_name: Optional[str] = Field(default=settings.gemini_model_name, description="Gemini model name")
+    local_model_name: Optional[str] = Field(default=settings.local_model_name, description="Local model filename")
+    # API Settings
+    cors_origins: Optional[List[str]] = Field(default=settings.cors_origins, description="Allowed CORS origins")
+class SettingsResponse(BaseModel):
+    """
+    Schema for settings response.
+    All fields are required. Paths are exposed as strings.
+    """
+    # Paths (read-only)
+    root_path: str
+    model_path: str
+    data_path: str
+    # API Settings
+    api_title: str
+    api_version: str
+    cors_origins: List[str]
+    # RAG Settings
+    chunk_size: int
+    chunk_overlap: int
+    similarity_top_k: int
+    similarity_threshold: float
+    collection_name: str
+    persist_directory: str
+    # Model Settings
+    llm_provider: str
+    enable_fallback: bool
+    embedding_model_name: str
+    gemini_model_name: str
+    local_model_name: str
+    # Allows the model to be populated from object attributes.
+    class Config:
+        from_attributes = True

app/api/schemas/tests.py ADDED Viewed

	@@ -0,0 +1,30 @@

+from pydantic import BaseModel, Field
+from typing import List
+class Test(BaseModel):
+    question: str = Field(min_length=1, max_length=100, description="Question you want to test")
+    document: str = Field(min_length=1, description="Document name")
+    chunk_index: int = Field(default=0, min=0, description="Chunk index")
+class TestRequestSchema(BaseModel):
+    tests: List[Test] = Field(min_length=1, description="give tests to evalute")
+    k: int = Field(default=5, min=0, max=20, description="maximum number of results")
+    threshold: float = Field(default= 0.4, min=0.0, max=1.0, description="Threshold for reference")
+class TestResponse(BaseModel):
+    """Result for one test case: the case itself and a boolean outcome."""
+    # The test case this result belongs to.
+    tests: Test
+    # Boolean outcome for the case. NOTE(review): presumably whether the
+    # expected chunk was retrieved — confirm against the evaluation code.
+    answer: bool
+class TestResponseSchema(BaseModel):
+    """Collection of per-test results; must contain at least one entry."""
+    tests: List[TestResponse] = Field(min_length=1, description="test results")
+class TestClassifier(BaseModel):
+    """
+    One classifier test case: a question and the labels expected for it.
+    All fields are required, non-empty strings.
+    """
+    question: str = Field(min_length=1, description="Question you want to test")
+    type: str = Field(min_length=1, description="Type to be predicted")
+    category: str = Field(min_length=1, description="Category to be predicted")
+    topic: str = Field(min_length=1, description="Topic to be predicted")
+    intent: str = Field(min_length=1, description="Intent to be predicted")
+class TestClassifierReqSchema(BaseModel):
+    """Request body for classifier evaluation; needs at least one test case."""
+    tests: List[TestClassifier] = Field(min_length=1, description="give tests to evalute")

app/core/__init__.py ADDED Viewed

File without changes

app/core/config.py ADDED Viewed

	@@ -0,0 +1,55 @@

+from pydantic_settings import BaseSettings
+from pathlib import Path
+from typing import Literal
+class Settings(BaseSettings):
+    """
+    Application configuration, loaded from the environment and a ``.env`` file.
+    Path defaults are derived from one another in the class body, so they are
+    computed once from the declared defaults when the class is defined.
+    """
+    # Paths
+    # Project root: two directory levels above this file's directory.
+    root_path: Path = Path(__file__).resolve().parents[2]
+    core_models_path: Path = root_path / "ml_models"
+    model_path: Path = core_models_path / "llm"
+    embeddings_path: Path = core_models_path / "embeddings"
+    data_path: Path = root_path / "data"
+    documents_path: Path = data_path / "documents"
+    vector_stores_path: Path = data_path / "vector_stores"
+    classifier_path: Path = core_models_path / "classifier"
+    # API Settings
+    api_title: str = "VGEC RAG Chatbot API"
+    api_version: str = "1.0.0"
+    cors_origins: list[str] = ["*"]
+    # RAG Settings
+    chunk_size: int = 500
+    chunk_overlap: int = 100
+    similarity_top_k: int = 8
+    similarity_threshold: float = 0.4  # ✅ NEW - Filter docs by similarity score
+    collection_name: str = "classifier_test_1"
+    # Built from the default collection_name above; overriding collection_name
+    # through the environment does not change this default.
+    persist_directory: str = str(vector_stores_path / collection_name)
+    # Model Selection - ✅ NEW!
+    llm_provider: Literal["gemini", "local"] = "gemini"  # Which model to use
+    enable_fallback: bool = False  # Fallback to local if Gemini fails
+    # Model Settings
+    embedding_model_name: str = "models/gemini-embedding-001"
+    gemini_model_name: str = "gemini-2.5-flash-lite"
+    local_model_name: str = "Qwen2.5-0.5B-Instruct-Q4_K_M.gguf"
+    # Llama-3.2-3B-Instruct-Q4_K_M.gguf
+    # query_model_name: str = "Llama-3.2-1B-Instruct-Q4_K_M.gguf"
+    # Qwen2.5-0.5B-Instruct-Q4_K_M.gguf
+    # Qwen3-0.6B-Q4_K_M.gguf
+    # Vi-Qwen2-1.5B-RAG.Q4_K_M.gguf
+    # Generation Settings
+    max_output_tokens: int = 2048  # Max tokens for Gemini responses
+    local_max_tokens: int = 512   # Max tokens for local model responses
+    # Google API - ✅ Pydantic automatically reads from .env
+    google_api_key: str  # No default = required field
+    class Config:
+        env_file = ".env"
+        env_file_encoding = "utf-8"
+settings = Settings()

app/core/paths.py ADDED Viewed

	@@ -0,0 +1,10 @@

+from pathlib import Path
+# Project root: two directory levels above this file's directory.
+ROOT_PATH = Path(__file__).resolve().parents[2]
+MODEL_PATH = ROOT_PATH / "ml_models"
+LIBS_PATH = ROOT_PATH / "libs"
+data_path = ROOT_PATH / "data"
+# NOTE(review): these prints run every time the module is imported; they look
+# like leftover debugging output — confirm whether they can be removed.
+print(ROOT_PATH)
+print(MODEL_PATH)
+print(LIBS_PATH)

app/main.py ADDED Viewed

	@@ -0,0 +1,21 @@

+from fastapi import FastAPI, APIRouter
+from fastapi.middleware.cors import CORSMiddleware
+from app.core.config import settings
+from app.api.routes import rag, vector_store, settings as settingsRouter
+app = FastAPI()
+# Include individual routers
+API_PREFIX = "/api/v1"
+app.include_router(rag.router, prefix=f"{API_PREFIX}/rag", tags=["RAG"])
+app.include_router(vector_store.router, prefix=f"{API_PREFIX}/vector", tags=["Vector Store"])
+app.include_router(settingsRouter.router, prefix=f"{API_PREFIX}/settings", tags=["Settings"])
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=settings.cors_origins,
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)

app/models/__init__.py ADDED Viewed

File without changes

app/prompts/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ from .system_prompts import SYSTEM_PROMPT, QUESTION_WRITER_SYSTEM_PROMPT, wrap_exaone

app/prompts/system_prompts.py ADDED Viewed

	@@ -0,0 +1,112 @@

+# SYSTEM_PROMPT = """
+# You are VGEC-Assistant, a polite and helpful information retrieval chatbot for Vishwakarma Government Engineering College (VGEC).
+# You MUST answer the user's question using ONLY the information inside the given CONTEXT. The CONTEXT is the only source of truth.
+# you have to help the users and guide them to answer based on the given context, dont guess but provide answer or guide them in any way you fit.
+# Output Format:
+# - Always respond in plain text as complete sentences.
+# - Do not add extra explanation or new facts.
+# - Keep responses concise and courteous.
+# - Do NOT use outside knowledge.
+# - Do NOT guess.
+# - Always respond in markdown format.
+# ---
+# HISTORY:
+# {history}
+# ---
+# CONTEXT:
+# {context}
+# ---
+# QUESTION:
+# {question}
+# ---
+# ANSWER:
+# """
+# Answer-generation prompt template. It carries three placeholders —
+# {history}, {context} and {question} — to be filled in by the caller.
+# NOTE(review): presumably rendered with str.format or a prompt template;
+# confirm at the call site.
+SYSTEM_PROMPT = """
+You are VGEC-Assistant, a helpful chatbot for Vishwakarma Government Engineering College (VGEC).
+Answer the user's question using ONLY the information in the given CONTEXT.
+If the answer can be logically inferred from the context, provide the answer clearly.
+If the answer is not present in the context, say: "Sorry, I couldn't find that in the provided information."
+Guidelines:
+- Keep the response short and clear.
+- Do not repeat the context.
+- Do not guess or make assumptions.
+- Answer in Markdown Format.
+---
+HISTORY:
+{history}
+---
+CONTEXT:
+{context}
+---
+QUESTION:
+{question}
+---
+ANSWER:
+"""
+def wrap_exaone(prompt):
+    return f"""[|system|]
+You are a helpful AI assistant. Answer only from the given context. If unsure, say "I don't know".
+[|endofturn|]
+[|user|]
+{prompt.strip()}
+[|endofturn|]
+[|assistant|]
+"""
+# Query-rewriting prompt template: instructs the model to expand the listed
+# department abbreviations and output only the rewritten query. It carries a
+# single {query} placeholder to be filled in by the caller.
+QUESTION_WRITER_SYSTEM_PROMPT = """You are a query rewriting assistant for Vishwakarma Government Engineering College (VGEC).
+STRICT RULES:
+1. Expand abbreviations using ONLY this mapping:
+   - IT = Information Technology Department
+   - ICT = Information and Communication Technology Department
+   - CE = Computer Engineering Department
+   - EC = Electronics and Communication Engineering Department
+   - IC = Instrumentation and Control Engineering Department
+   - PE = Power Electronics Department
+   - ME = Mechanical Engineering Department
+   - Civil = Civil Engineering Department
+   - CSE = Computer Science & Engineering (Data Science) Department
+   - DS = Computer Science & Engineering (Data Science) Department
+   - ACPC = Admission Committee for Professional Courses (administrative, NOT a department)
+   - STS = Student Section Portal (administrative, NOT a department)
+2. CRITICAL: If query has NO department abbreviation, do NOT add any department.
+3. Output ONLY the rewritten query. No quotes, no prefixes, no explanations.
+GOOD EXAMPLES:
+User: "ds fees?"
+Rewritten: What are the fees for the Computer Science & Engineering (Data Science) Department?
+User: "cse block?"
+Rewritten: Which block houses the Computer Science & Engineering (Data Science) Department?
+User: "fees"
+Rewritten: What are the fees?
+User: "admission"
+Rewritten: What is the admission process?
+User: "acpc registration"
+Rewritten: What is the ACPC registration process?
+BAD EXAMPLES (NEVER DO THIS):
+User: "fees"
+Bad: What are the fees for the Computer Science & Engineering (Data Science) Department?
+User: "placement"
+Bad: What are the placement statistics for the Mechanical Engineering Department?
+Query: {query}
+Rewritten:"""

app/services/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ # This file is intentionally empty to prevent circular imports.
2	+ # Import services directly from their modules.

app/services/classifier_service.py ADDED Viewed

	@@ -0,0 +1,337 @@

+import re
+import numpy as np
+import pickle
+from sentence_transformers import SentenceTransformer
+from sklearn.linear_model import LogisticRegression
+from sklearn.model_selection import train_test_split, cross_val_score
+from app.core.config import settings
+from typing import List, Optional
+def load_pipeline(path):
+    with open(path, "rb") as f:
+        pipeline = pickle.load(f)
+    return pipeline
+class Classifier:
+    def __init__(
+        self,
+        tfidf,
+        abbreviations,
+        master_index,
+        le_type,
+        le_category,
+        le_topic,
+        le_intent,
+        models=None,
+        df=None,
+    ):
+        self.tfidf = tfidf
+        self.abbreviations = abbreviations
+        self.master_index = master_index
+        self.le_type = le_type
+        self.le_category = le_category
+        self.le_topic = le_topic
+        self.le_intent = le_intent
+        model_path = settings.embeddings_path / "mdbr-leaf-mt"
+        if model_path.exists():
+            self.embedding_model = SentenceTransformer(str(model_path))
+        else:
+            self.embedding_model = SentenceTransformer("MongoDB/mdbr-leaf-mt")
+        # Prediction thresholds: below these, the field is set to None entirely
+        self.threshold = {
+            "type": 0.4,
+            "category": 0.4,
+            "topic": 0.5,
+            "intent": 0.6
+        }
+        # Filter thresholds: above these, the field is used as a hard ChromaDB filter
+        # Kept separate so you can tune "when to predict" vs "when to filter" independently
+        self.filter_threshold = {
+            "type": 0.65,
+            "category": 0.65,
+            "topic": 0.70,
+        }
+        # If trained models are passed
+        if models is not None:
+            self.models = models
+        else:
+            if df is None:
+                raise ValueError("Either provide trained models or provide df to train.")
+            self.models = self.train_models(df)
+    def _build_filter(self, result):
+        # If type confidence doesn't clear the filter bar, the entire filter
+        # is unreliable — return None so retrieval does a full scan instead.
+        if result.get("type_conf", 0) < self.filter_threshold["type"]:
+            return None
+        # --- Hard AND anchors (always reliable) ---
+        hard_conditions = []
+        hard_conditions.append({"type": result["type"]})
+        # intent — handles special case for "count" to include "detail"
+        intent = result.get("intent") or "detail"
+        if intent == "count":
+            hard_conditions.append({"$or": [{"intent": "count"}, {"intent": "detail"}]})
+        else:
+            hard_conditions.append({"intent": intent})
+        # --- Soft OR hints (category / topic) ---
+        # A document only needs to match ONE of these to pass.
+        # This avoids dropping valid docs that are tagged with category but
+        # not topic (or vice-versa), while still keeping retrieval directional.
+        soft_conditions = []
+        if result.get("category") and result.get("category_conf", 0) >= self.filter_threshold["category"]:
+            soft_conditions.append({"category": result["category"]})
+        else:
+            soft_conditions.append({"category": "general"})
+        if result.get("topic") and result.get("topic_conf", 0) >= self.filter_threshold["topic"]:
+            soft_conditions.append({"topic": result["topic"]})
+        else:
+            soft_conditions.append({"topic": "general"})
+        # Build final filter
+        # Case 1: No soft hints — filter on hard anchors only (broad query like "list all departments")
+        if not soft_conditions:
+            if len(hard_conditions) == 1:
+                return hard_conditions[0]
+            return {"$and": hard_conditions}
+        # Case 2: One soft hint — add it directly to the AND (no $or needed)
+        if len(soft_conditions) == 1:
+            return {"$and": hard_conditions + soft_conditions}
+        # Case 3: Both category and topic are confident — combine as $or inside the AND
+        # Final shape: type AND intent AND (category OR topic)
+        return {"$and": hard_conditions + [{"$or": soft_conditions}]}
+    def predict_with_filter(self, queries):
+        filters = self.predict(queries)[0]
+        return self._build_filter(filters)
+    def expand_abbreviations(self, text):
+        text = text.lower().strip()
+        for abbr, full in self.abbreviations.items():
+            pattern = r'\b' + re.escape(abbr.lower()) + r'\b'  # ← lowercase the key too
+            text = re.sub(pattern, full, text)
+        return text
+    def get_features(self, queries):
+        queries_clean = [self.expand_abbreviations(q) for q in queries]
+        embeddings = self.embedding_model.encode(
+            queries_clean, show_progress_bar=False
+        )
+        if not hasattr(self.tfidf, "vocabulary_"):
+            tfidf_features = self.tfidf.fit_transform(queries_clean).toarray()
+        else:
+            tfidf_features = self.tfidf.transform(queries_clean).toarray()
+        return np.hstack([embeddings, tfidf_features])
+    def train_single(self, X, y, field, C=0.01):
+        X_train, X_test, y_train, y_test = train_test_split(
+            X, y,
+            test_size=0.2,
+            random_state=42,
+            stratify=y
+        )
+        clf = LogisticRegression(
+            C=C,
+            penalty="l2",
+            solver="lbfgs",
+            max_iter=2000,
+            class_weight="balanced",
+            random_state=42
+        )
+        clf.fit(X_train, y_train)
+        train_acc = clf.score(X_train, y_train)
+        test_acc = clf.score(X_test, y_test)
+        cv_scores = cross_val_score(clf, X, y, cv=5)
+        print(f"\n{field.upper()}:")
+        print(f"Train: {train_acc:.3f} | Test: {test_acc:.3f} | CV: {cv_scores.mean():.3f}")
+        return clf
+    def train_models(self, df):
+        X = self.get_features(df["question"].tolist())
+        self.models["type"] = self.train_single(
+            X, df["type"].values, "type", C=0.01
+        )
+        self.models["category"] = self.train_single(
+            X, df["category"].values, "category", C=0.005
+        )
+        self.models["topic"] = self.train_single(
+            X, df["topic"].values, "topic", C=0.005
+        )
+        self.models["intent"] = self.train_single(
+            X, df["intent"].values, "intent", C=0.005
+        )
+        return self.models
+    def predict(self, queries: List[str], enforce_constraints=True):
+        X = self.get_features(queries)
+        results = []
+        for i, query in enumerate(queries):
+            res = {"question": query}
+            # ---------- TYPE ----------
+            type_proba = self.models["type"].predict_proba([X[i]])[0]
+            type_classes = self.models["type"].classes_
+            type_idx = np.argmax(type_proba)
+            type_pred = type_classes[type_idx]
+            res["type"] = self.le_type.inverse_transform([type_pred])[0]
+            res["type_conf"] = float(type_proba[type_idx])
+            # ---------- CATEGORY ----------
+            category_proba = self.models["category"].predict_proba([X[i]])[0]
+            category_classes = self.models["category"].classes_
+            if enforce_constraints:
+                category_labels = self.le_category.inverse_transform(category_classes)
+                allowed = set(self.master_index[res["type"]]["categories"])
+                filtered = [
+                    (label, prob)
+                    for label, prob in zip(category_labels, category_proba)
+                    if label in allowed
+                ]
+                if filtered:
+                    best_category, best_prob = max(filtered, key=lambda x: x[1])
+                else:
+                    idx = np.argmax(category_proba)
+                    best_category = category_labels[idx]
+                    best_prob = category_proba[idx]
+                res["category"] = best_category
+                res["category_conf"] = float(best_prob)
+            else:
+                idx = np.argmax(category_proba)
+                pred = category_classes[idx]
+                res["category"] = self.le_category.inverse_transform([pred])[0]
+                res["category_conf"] = float(category_proba[idx])
+            # ---------- TOPIC ----------
+            topic_proba = self.models["topic"].predict_proba([X[i]])[0]
+            topic_classes = self.models["topic"].classes_
+            if enforce_constraints:
+                topic_labels = self.le_topic.inverse_transform(topic_classes)
+                allowed = set(self.master_index[res["type"]]["topics"])
+                filtered = [
+                    (label, prob)
+                    for label, prob in zip(topic_labels, topic_proba)
+                    if label in allowed
+                ]
+                if filtered:
+                    best_topic, best_prob = max(filtered, key=lambda x: x[1])
+                else:
+                    idx = np.argmax(topic_proba)
+                    best_topic = topic_labels[idx]
+                    best_prob = topic_proba[idx]
+                res["topic"] = best_topic
+                res["topic_conf"] = float(best_prob)
+            else:
+                idx = np.argmax(topic_proba)
+                pred = topic_classes[idx]
+                res["topic"] = self.le_topic.inverse_transform([pred])[0]
+                res["topic_conf"] = float(topic_proba[idx])
+            # ---------- INTENT ----------
+            intent_proba = self.models["intent"].predict_proba([X[i]])[0]
+            intent_classes = self.models["intent"].classes_
+            intent_idx = np.argmax(intent_proba)
+            intent_pred = intent_classes[intent_idx]
+            res["intent"] = self.le_intent.inverse_transform([intent_pred])[0]
+            res["intent_conf"] = float(intent_proba[intent_idx])
+            if res["type_conf"] < self.threshold["type"]:
+                res["type"] = None
+                res["type_conf"] = 0
+            if res["category_conf"] < self.threshold["category"]:
+                res["category"] = None
+                res["category_conf"] = 0
+            if res["topic_conf"] < self.threshold["topic"]:
+                res["topic"] = None
+                res["topic_conf"] = 0
+            if res["intent_conf"] < self.threshold["intent"]:
+                res["intent"] = None
+                res["intent_conf"] = 0
+            print("=" * 50)
+            print(query)
+            print(f"Type: {res['type']}, {res['type_conf']}")
+            print(f"Category: {res['category']}, {res['category_conf']}")
+            print(f"Topic: {res['topic']}, {res['topic_conf']}")
+            print(f"Intent: {res['intent']}, {res['intent_conf']}")
+            print("=" * 50)
+            results.append(res)
+        return results
+classifier_path = settings.classifier_path / "chatbot_classifier.pkl"
+pipeline = load_pipeline(classifier_path)
+models = pipeline["models"]
+tfidf = pipeline["tfidf"]
+le_type = pipeline["le_type"]
+le_category = pipeline["le_category"]
+le_topic = pipeline["le_topic"]
+le_intent = pipeline["le_intent"]
+MASTER_INDEX = pipeline["MASTER_INDEX"]
+ABBREVIATIONS = pipeline["ABBREVIATIONS"]
+clf = Classifier(
+    tfidf=tfidf,
+    abbreviations=ABBREVIATIONS,
+    master_index=MASTER_INDEX,
+    le_type=le_type,
+    le_category=le_category,
+    le_topic=le_topic,
+    le_intent=le_intent,
+    models=models
+)

app/services/document_loader.py ADDED Viewed

	@@ -0,0 +1,34 @@

+from langchain_core.documents import Document
+from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
+from langchain_community.document_loaders import UnstructuredMarkdownLoader
+from pathlib import Path
+from datetime import datetime
+import uuid
+from typing import Optional, List
+class document_loader:
+    def __init__(self, filepath: Path, glob: str = "*.pdf"):
+        self.filepath = filepath
+        self.glob = glob
+        self.loader = PyPDFLoader
+    # loading services
+    def load(self):
+        doc_loader = PyPDFLoader(self.filepath)
+        return doc_loader.load()
+    def load_md(self):
+        return UnstructuredMarkdownLoader(self.filepath).load()
+    def lazy_load(self):
+        doc_loader = PyPDFLoader(self.filepath)
+        return doc_loader.lazy_load()
+    def load_multiple(self):
+        doc_loader = DirectoryLoader(
+            self.filepath,
+            glob=self.glob,
+            loader_cls=PyPDFLoader
+        )
+        return doc_loader.load()

app/services/file_service.py ADDED Viewed

	@@ -0,0 +1,198 @@

+from app.core.config import settings
+from typing import List, Optional, Dict, Any
+from pathlib import Path
+from langchain_community.document_loaders import PyMuPDFLoader
+import json
+from datetime import datetime
+from app.utils.document_helpers import build_metadata
+from langchain_core.documents import Document
+import uuid
+from app.utils.preprocessing import preprocess_filename
+class FileService:
+    """`
+    FileService helps manage files and their metadata.
+    It stores file information in a central JSON file (e.g., vgec_rag.json).
+    """
+    def __init__(self):
+        self.settings = settings
+        # The name of the file where we store metadata
+        self.metadata_filename = f"{self.settings.collection_name}.json"
+        # The full path to that metadata file in the data folder
+        self.metadata_path = self.settings.data_path / self.metadata_filename
+        self.file_storage_path = self.settings.data_path / "documents"
+        # Load existing metadata if it exists, otherwise start fresh
+        if self.metadata_path.exists():
+            self.records = self.load_metadata()
+        else:
+            self.records = {}
+    def load_metadata(self) -> Dict[str, Any]:
+        """Reads the metadata from the JSON file."""
+        try:
+            with open(self.metadata_path, 'r', encoding='utf-8') as f:
+                return json.load(f)
+        except (json.JSONDecodeError, Exception):
+            return {}
+    def save_metadata(self):
+        """Saves current memory records back to the JSON file."""
+        # Ensure the data directory exists
+        self.metadata_path.parent.mkdir(parents=True, exist_ok=True)
+        with open(self.metadata_path, 'w', encoding='utf-8') as f:
+            json.dump(self.records, f, indent=4)
+    def read_file(self, file_path: Path) -> Optional[str]:
+        """Reads content from a file and updates the logs."""
+        if not file_path.exists():
+            return None
+        file_name = preprocess_filename(file_path)
+        if file_name.endswith(".pdf"):
+            documents = PyMuPDFLoader(file_path).load()
+            content = "\n".join([doc.page_content for doc in documents])
+            print(content)
+            metadata = {
+                "page_count": len(documents),
+                "ext": "pdf"
+            }
+        elif file_name.endswith(".txt"):
+            split_content_metadata = build_metadata(file_path)
+            inbuilt_metadata = split_content_metadata['metadata']
+            content = split_content_metadata['content']
+            metadata = {
+                **inbuilt_metadata,
+                "ext": "txt"
+            }
+        elif file_name.endswith(".md"):
+            split_content_metadata = build_metadata(file_path)
+            inbuilt_metadata = split_content_metadata['metadata']
+            content = split_content_metadata['content']
+            metadata = {
+                **inbuilt_metadata,
+                "ext": "md"
+            }
+        elif file_name.endswith(".json"):
+            with open(file_path, 'r', encoding='utf-8') as f:
+                data = json.load(f)
+                content = json.dumps(data["content"])
+                metadata = {
+                    "id": data["id"],
+                    "title": data.get("name", data.get("title", "untitled")),
+                    "source": data["source"],
+                    "source_file": file_name or "untitled",
+                    "created_date": datetime.now().isoformat(),
+                    "type": data.get("type", "general"),
+                    "category": data.get("category", "general"),
+                    "topic": data.get("topic", "general"),
+                    "ext": "json"
+                }
+        # file_name
+        doc = Document(page_content=content, metadata=metadata)
+        doc.metadata["id"] = doc.metadata.get(
+            "id",
+            str(uuid.uuid4())
+        )
+        doc.metadata["title"] = doc.metadata.get(
+            "title",
+            file_name
+        )
+        doc.metadata["source_file"] = doc.metadata.get(
+            "source_file",
+            file_name
+        )
+        doc.metadata["updated_at"] = datetime.now().isoformat()
+        doc.metadata["created_at"] = doc.metadata.get(
+            "created_at",
+            datetime.now().isoformat()
+        )
+        # Update logs to reflect that we interacted with this file
+        self.update_logs(file_path, metadata)
+        return doc
+    def write_file(self, file_path: Path, content: str, metadata: Optional[dict] = None):
+        """Writes content to a file and saves its metadata."""
+        # Ensure the directory for the file exists
+        filename = preprocess_filename(file_path)
+        file_save_path = self.file_storage_path / filename
+        file_save_path.parent.mkdir(parents=True, exist_ok=True)
+        if filename.endswith(".pdf"):
+            with open(file_path, 'rb') as f:
+                content = f.read()
+            with open(file_save_path, 'wb') as f:
+                f.write(content)
+        elif filename.endswith(".txt"):
+            with open(file_save_path, 'w', encoding='utf-8') as f:
+                f.write(content)
+        elif filename.endswith(".md"):
+            with open(file_save_path, 'w', encoding='utf-8') as f:
+                f.write(content)
+        elif filename.endswith(".json"):
+            with open(file_save_path, 'w', encoding='utf-8') as f:
+                json.dump(content, f, indent=4)
+        else:
+            with open(file_save_path, 'w', encoding='utf-8') as f:
+                f.write(content)
+        # Update the logs with the provided metadata
+        self.update_logs(file_save_path, metadata)
+    def update_logs(self, file_path: Path, metadata: Optional[dict] = None):
+        """Helper to prepare metadata before saving."""
+        file_name = file_path.name
+        # If no metadata is provided, we try to preserve existing
+        # metadata or use an empty dict if it's new.
+        if metadata is None:
+            metadata = self.records.get(file_name, {})
+        self.manage_metadata(file_name, metadata)
+    def manage_metadata(self, file_name: str, metadata: dict):
+        """Updates the internal dictionary and saves it to the disk."""
+        self.records[file_name] = metadata
+        self.save_metadata()
+    def patch_metadata(self, file_path: Path, metadata: dict):
+        file_name = file_path.name
+        original_metadata = self.records.get(file_name, {})
+        self.manage_metadata(
+            file_name= file_name,
+            metadata= {
+                **original_metadata,
+                **metadata
+            }
+        )
+    def get_records(self) -> Dict[str, Any]:
+        """Returns all stored metadata records."""
+        return self.records
+    def get_record(self, file_name: str) -> Optional[Dict[str, Any]]:
+        """Returns metadata for a specific file."""
+        return self.records.get(file_name)
+    def delete_record(self, file_name: str) -> bool:
+        """Removes a metadata record from the JSON file."""
+        if file_name in self.records:
+            del self.records[file_name]
+            self.save_metadata()
+            return True
+        return False
+    def update_record(self, file_name: str, metadata: dict) -> bool:
+        """Updates the metadata for an existing record."""
+        if file_name in self.records:
+            self.records[file_name] = metadata
+            self.save_metadata()
+            return True
+        return False
+# Initialize a globally accessible service instance.
+# NOTE: this runs at import time, so importing the module executes
+# FileService.__init__, which reads the metadata JSON file when it exists.
+file_service = FileService()

app/services/filter-demo ADDED Viewed

	@@ -0,0 +1,197 @@

+# Anchor phrases per field ("type", "category", "topic", "intent"): every label
+# maps to a short bag of keywords. FilterClassifier encodes each phrase once
+# and gives a query the label whose phrase embedding is most similar to it.
+ANCHORS = {
+    "type": {
+        "department":  "department academic branch faculty courses engineering science",
+        "facility":    "facility building campus lab central infrastructure",
+        "service":     "service office administration student support section",
+        "hostel":      "hostel dormitory residence accommodation warden mess",
+        "library":     "library books journal reading catalog lending",
+        "placement":   "placement recruitment company offer package career tnp",
+        "research":    "research patent publication funded grant scholar",
+        "club":        "club society committee nss ncc ieee cultural extracurricular",
+        "admission":   "admission enrollment intake eligibility registration criteria",
+    },
+    "category": {
+        # Keep it short — name first, then 4-5 unique discriminative terms
+        "applied_mechanics":  "applied mechanics AM statics dynamics stress strain",
+        "chemical":           "chemical engineering ChE chemistry reaction process plant",
+        "civil":              "civil engineering CE construction structural geotechnical survey",
+        "computer":           "computer engineering CE hardware microprocessor VLSI embedded digital",
+        "cse_ds":             "computer science CSE data science AI machine learning neural network",
+        "electronics_comm":   "electronics communication ECE signal RF wireless antenna analog",
+        "electronics_inst":   "electronics instrumentation EI biomedical sensors transducer measurement",
+        "electrical":         "electrical engineering EE power motor transformer transmission drives",
+        "it":                 "information technology IT software ERP cloud database devops",
+        "ict":                "information communication technology ICT telecom fiber networking protocol",
+        "instrumentation":    "instrumentation control IC PLC SCADA automation feedback industrial",
+        "mechanical":         "mechanical engineering ME thermal manufacturing CAD CAM machining fluid",
+        "power_electronics":  "power electronics PE converter inverter MOSFET IGBT rectifier chopper",
+        "science_humanities": "science humanities SH physics mathematics english communication foundation",
+        "transport":   "transport bus route commute shuttle pickup drop campus travel",
+        "finance":     "finance fees tuition payment charges fine scholarship due",
+        "medical":     "medical health doctor dispensary clinic nurse first aid",
+        "sports":      "sports ground gym cricket football badminton court fitness",
+        "grievance":   "grievance complaint harassment redressal scst women discrimination",
+        "forms":       "forms bonafide certificate download application NOC document",
+        "ug":  "undergraduate UG BE BTech bachelor four year first year gujcet jee",
+        "pg":  "postgraduate PG ME MTech MBA MCA master two year gate",
+        "tnp":         "tnp training placement cell campus drive offer letter coordinator",
+        "patent":      "patent intellectual property invention filed granted rights",
+        "ssip":        "ssip startup student innovation gujarat seed funding incubation",
+        "funded":      "funded grant DST DRDO ISRO sponsored external research project",
+        "publication": "publication journal paper conference scopus SCI citation article",
+        "nss":        "nss national service scheme volunteer blood donation community camp",
+        "ncc":        "ncc national cadet corps army navy air force drill parade",
+        "ieee":       "ieee electrical electronics engineers student chapter symposium",
+        "iei":        "iei institution engineers india professional chapter membership",
+        "adventure":  "adventure trekking hiking outdoor camping expedition nature club",
+        "women_cell": "women cell WDC empowerment gender equality ladies committee",
+        "principal":     "principal director head institution chairman governing management",
+        "accreditation": "accreditation NBA NIRF NAAC AICTE GTU ranking grade approval",
+        "awards":        "awards achievements recognition felicitation distinction honor trophy",
+    },
+    "topic": {
+        "faculty":      "faculty professor lecturer HOD staff designation qualification phd",
+        "lab":          "lab laboratory practical equipment instruments apparatus experiment",
+        "syllabus":     "syllabus curriculum subjects units topics chapters semester GTU",
+        "timetable":    "timetable class schedule period slot timing routine batch",
+        "event":        "event fest hackathon seminar competition workshop cultural program",
+        "project":      "project final year mini SIH capstone dissertation submission",
+        "virtual_tour": "virtual tour 360 view online walkthrough campus interactive",
+        "notice":       "notice notification announcement circular bulletin update board",
+        "fees":         "fees tuition charges structure breakdown payment due scholarship",
+        "rules":        "rules regulations discipline policy conduct code guidelines norms",
+        "facilities":   "facilities amenities wifi mess canteen gym recreation available",
+        "contact":      "contact phone email address reach person call office",
+        "process":      "process procedure steps apply method eligibility criteria workflow",
+        "document":     "document certificate bonafide migration TC attestation official",
+        "route":        "route bus stop pickup drop timing schedule commute point",
+        "stats":        "statistics total number count figures record percentage ratio data",
+        "calendar":     "calendar academic dates holidays exam deadlines semester schedule",
+        "vision":       "vision mission goals objectives values purpose motto statement",
+        "induction":    "induction orientation welcome freshman new student speaker activity",
+    },
+    "intent": {
+        "list":     "list all show every what are enumerate display available options",
+        "count":    "how many total count number quantity strength size",
+        "detail":   "what is explain describe tell me about information overview",
+        "process":  "how to apply steps procedure guide method approach eligibility",
+        "greeting": "hello hi hey good morning good evening namaste greetings",
+    },
+}
+# Lower-case abbreviation -> expansion, substituted into the query by
+# FilterClassifier.handle_abbreviations before the query is embedded.
+# NOTE(review): "ce" expands to computer engineering, although ANCHORS lists
+# "CE" under both the civil and computer categories -- confirm the intent.
+ABBREVIATIONS = {
+    "ce": "computer engineering",
+    "cse": "computer science engineering",
+    "ds": "data science",
+    "it": "information technology",
+    "ict": "information communication technology",
+    "ece": "electronics communication engineering",
+    "ei": "electronics instrumentation engineering",
+    "ic": "instrumentation control",
+    "ee": "electrical engineering",
+    "pe": "power electronics",
+    "me": "mechanical engineering",
+    "am": "applied mechanics",
+}
+# For each type label: the category and topic labels that are valid with it.
+# FilterClassifier.classify clears a predicted category/topic that is not
+# listed for the predicted type.
+# A "categories" list of [None] means no category label is kept for that type.
+# NOTE(review): "result" is listed under admission topics but has no entry in
+# ANCHORS["topic"], so classify can never select it -- confirm.
+MASTER_INDEX = {
+    "department": {
+        "categories": [
+            "applied_mechanics", "chemical", "civil", "computer", "cse_ds",
+            "electronics_comm", "electronics_inst", "electrical", "it", "ict",
+            "instrumentation", "mechanical", "power_electronics", "science_humanities"
+        ],
+        "topics": [
+            "faculty", "lab", "syllabus", "timetable", "event",
+            "project", "virtual_tour", "notice", "contact", "stats"
+        ]
+    },
+    "facility":   {"categories": [None], "topics": ["lab", "facilities", "contact", "virtual_tour"]},
+    "service":    {
+        "categories": ["transport", "finance", "medical", "sports", "grievance", "forms", "principal", "accreditation", "awards"],
+        "topics": ["fees", "rules", "process", "document", "route", "contact", "calendar", "stats", "vision", "induction"]
+    },
+    "hostel":     {"categories": [None], "topics": ["fees", "rules", "facilities", "contact"]},
+    "library":    {"categories": [None], "topics": ["fees", "rules", "facilities", "contact", "calendar", "document"]},
+    "placement":  {"categories": ["tnp"], "topics": ["stats", "process", "contact", "event"]},
+    "research":   {"categories": ["patent", "ssip", "funded", "publication"], "topics": ["stats", "project", "process", "contact"]},
+    "club":       {"categories": ["nss", "ncc", "ieee", "iei", "adventure", "women_cell"], "topics": ["event", "contact", "process", "stats", "notice"]},
+    "admission":  {"categories": ["ug", "pg"], "topics": ["process", "fees", "document", "result", "calendar", "contact"]}
+}
+from sentence_transformers import SentenceTransformer
+from sklearn.metrics.pairwise import cosine_similarity
+import numpy as np
+import re
+from app.core.config import settings
+from typing import Optional
+class FilterClassifier:
+    def __init__(self, threshold: Optional[float] = None):
+        self.anchor_embeddings = {}
+        self.model = SentenceTransformer(str(settings.embeddings_path / "bge-small"))
+        self._build_anchor_embeddings()
+        self.threshold = threshold if threshold is not None else 0.4
+    def _build_anchor_embeddings(self):
+        if self.anchor_embeddings:
+            return
+        for domain, anchors in ANCHORS.items():
+            self.anchor_embeddings[domain] = {}
+            for label, text in anchors.items():
+                self.anchor_embeddings[domain][label] = self.model.encode(text)
+    def handle_abbreviations(self, query: str) -> str:
+        for abbr, full_form in ABBREVIATIONS.items():
+            query = query.replace(abbr, full_form)
+        return query
+    def classify(self, query: str) -> dict:
+        query = self.handle_abbreviations(query)
+        query_emb = self.model.encode(query)
+        result = {"type":None,"category":None,"topic":None,"year":None,"intent":None}
+        for field, value_embeddings in self.anchor_embeddings.items ():
+            scores = {
+                val: cosine_similarity([query_emb], [emb])[0][0]
+                for val, emb in value_embeddings.items()
+            }
+            print(scores, max(scores, key = scores.get))
+            best_val  = max(scores, key=scores.get)
+            best_score = scores[best_val]
+            print(best_val, best_score)
+            # print(field, result[field])
+            # Only accept if confidence is above threshold
+            if best_score > self.threshold:
+                result[field] = best_val
+        year = re.search(r"\b(20\d{2})\b", query)
+        result["year"] = int(year.group()) if year else None
+        if result["type"] is not None:
+            if (result["category"] is None
+                    or result["category"] not in MASTER_INDEX[result["type"]]["categories"]):
+                result["category"] = None
+            if (result["topic"] is None
+                    or result["topic"] not in MASTER_INDEX[result["type"]]["topics"]):
+                result["topic"] = None
+        else:
+            result["category"] = None
+            result["topic"] = None
+        return result
+# Module-level shared instance. Constructing it at import time loads the
+# SentenceTransformer model and encodes every anchor in ANCHORS.
+classifier = FilterClassifier()

app/services/filter_classifier copy.py ADDED Viewed

	@@ -0,0 +1,334 @@

+# Anchor keyword texts used by FilterClassifier, organised as
+# field -> label -> space-separated keywords. For each label the classifier
+# embeds "<label> <text>" with the sentence-transformer model and also
+# tokenizes the same string as a BM25 document.
+ANCHORS = {
+    # ─────────────────────────────────────────────────
+    # TYPE — discriminative, non-overlapping
+    # ─────────────────────────────────────────────────
+    "type": {
+        "department":   "department academic branch division faculty staff courses offered semester",
+        "facility":     "facility central infrastructure campus physical space building block floor room",
+        "service":      "service administrative office cell section support helpdesk student welfare",
+        "hostel":       "hostel dormitory residence hall accommodation mess warden boarding lodging",
+        "library":      "library books journals reading room catalog issue return lending periodicals",
+        "placement":    "placement recruitment hired campus drive offer letter company package lpa tnp",
+        "research":     "research innovation funded grant patent publication lab project scholar phd",
+        "club":         "club society student chapter committee extracurricular nss ncc ieee cultural sports",
+        "admission":    "admission enrollment application intake registration eligibility criteria merit joining",
+    },
+    # ─────────────────────────────────────────────────
+    # CATEGORY — must contain the obvious name first
+    # ─────────────────────────────────────────────────
+    "category": {
+        # ── departments ──
+        "applied_mechanics":  "applied mechanics AM statics dynamics solid mechanics fluid mechanics stress strain deformation",
+        "chemical":           "chemical engineering ChE chemistry process plant reaction distillation thermodynamics petrochemical",
+        "civil":              "civil engineering CE construction structural geotechnical surveying transportation concrete roads bridges",
+        "computer":           "computer engineering CE computer hardware microprocessor VLSI embedded systems digital circuits processor chip architecture",
+        "cse_ds":             "computer science CSE data science DS artificial intelligence machine learning neural network deep learning NLP analytics algorithm",
+        "electronics_comm":   "electronics communication ECE EC signal processing analog RF wireless antenna microwave telecommunication",
+        "electronics_inst":   "electronics instrumentation EI biomedical sensors LVDT transducer measurement calibration control systems",
+        "electrical":         "electrical engineering EE power systems generation transmission distribution motor transformer drives induction synchronous",
+        "it":                 "information technology IT software development ERP cloud computing database devops web application enterprise",
+        "ict":                "information communication technology ICT telecom networking fiber optic protocol bandwidth routing switching internet",
+        "instrumentation":    "instrumentation control IC PLC SCADA automation process control feedback loop industrial plant",
+        "mechanical":         "mechanical engineering ME mech thermal fluid manufacturing machining CAD CAM turbine heat transfer production",
+        "power_electronics":  "power electronics PE converter inverter MOSFET IGBT rectifier chopper switching drives variable frequency",
+        "science_humanities": "science humanities SH physics chemistry mathematics english communication basic science applied science foundation",
+        # ── service ──
+        "transport":    "transport bus route commute shuttle vehicle pickup drop timing campus travel conveyance",
+        "finance":      "finance fees tuition payment semester charges fine scholarship refund due bank challan",
+        "medical":      "medical health doctor dispensary clinic first aid nurse campus sick injury treatment",
+        "sports":       "sports ground gym fitness cricket football badminton volleyball court track field athletics",
+        "grievance":    "grievance complaint redressal harassment scst obc women discrimination appeal committee inquiry",
+        "forms":        "forms download bonafide certificate application document tc migration no objection NOC",
+        # ── admission ──
+        "ug":  "undergraduate UG BE BTech bachelor four year degree engineering first year admission gujcet jee lateral",
+        "pg":  "postgraduate PG ME MTech MBA MCA master two year degree admission gate mat entrance",
+        # ── placement ──
+        "tnp": "tnp training placement cell campus recruitment company drive package offer letter placement officer coordinator",
+        # ── research ──
+        "patent":      "patent intellectual property IP invention filed granted innovation protection rights",
+        "ssip":        "ssip startup student innovation project gujarat government seed funding incubation entrepreneurship",
+        "funded":      "funded sponsored externally grant DST ISRO DRDO government industry collaborative research project",
+        "publication": "publication journal paper conference proceedings scopus SCI research article citation author",
+        # ── club ──
+        "nss":        "nss national service scheme volunteer social community service blood donation camp awareness",
+        "ncc":        "ncc national cadet corps army navy air force cadet drill parade certificate b c",
+        "ieee":       "ieee institute electrical electronics engineers student chapter technical symposium paper",
+        "iei":        "iei institution engineers india professional body student chapter membership",
+        "adventure":  "adventure advanature nature trekking outdoor hiking camping expedition rock climbing club",
+        "women_cell": "women development cell WDC empowerment gender equality ladies committee harassment redressal",
+        # ── administration (listed under the "service" type in MASTER_INDEX) ──
+        "principal":      "principal director head of institution management chairman governing body top administration",
+        "accreditation":  "accreditation NBA NIRF NAAC AICTE GTU affiliation ranking approval grade score",
+        "awards":         "awards achievements recognition felicitation distinction honor national state rank trophy",
+    },
+    # ─────────────────────────────────────────────────
+    # TOPIC — aspects, clearly separated from each other
+    # ─────────────────────────────────────────────────
+    "topic": {
+        "faculty":       "faculty professor lecturer instructor assistant professor associate professor HOD teaching staff designation qualification phd",
+        "lab":           "laboratory lab practical experiment equipment instruments workshop hands-on setup apparatus bench",
+        "syllabus":      "syllabus curriculum course content subjects units topics chapters semester wise GTU prescribed",
+        "timetable":     "timetable class schedule routine period slot lecture timing weekly daily batch division",
+        "event":         "event events fest hackathon seminar workshop competition cultural technical program organized upcoming",
+        "project":       "project final year mini SIH capstone student work dissertation major submission",
+        "virtual_tour":  "virtual tour 360 degree view online walkthrough campus room infrastructure interactive map",
+        "notice":        "notice notification announcement circular update bulletin board recent latest information",
+        "fees":          "fees tuition charges amount structure semester breakdown fine late scholarship payment due",
+        "rules":         "rules regulations discipline policy code conduct guidelines norms behaviour dress restriction",
+        "facilities":    "facilities amenities available infrastructure wifi internet mess canteen gym recreation services provided",
+        "contact":       "contact phone number email address reach person call department office location",
+        "process":       "process procedure steps how to apply method eligibility criteria requirement workflow sequence",
+        "document":      "document certificate bonafide migration leaving tc attestation verification required official",
+        "route":         "route bus stop timing pickup drop point schedule commute map destination",
+        "stats":         "statistics data figures record total number count percentage ratio achievement placement pass",
+        "calendar":      "calendar academic dates holidays exam schedule important deadlines events semester start end",
+        "vision":        "vision mission goals objectives values purpose statement motto philosophy aim",
+        "induction":     "induction orientation welcome program new student freshman speaker activity schedule",
+    },
+    # ─────────────────────────────────────────────────
+    # INTENT — must be semantically far apart
+    # ─────────────────────────────────────────────────
+    "intent": {
+        "list":     "list all show every what are all available options display give me all enumerate",
+        "count":    "how many total count number quantity how much strength size",
+        "detail":   "what is explain describe tell me about information overview summary background",
+        "process":  "how to apply steps procedure method way guide eligibility criteria approach",
+        "greeting": "hello hi hey good morning good afternoon good evening how are you namaste greetings",
+    },
+}
+# Lower-case abbreviation -> full form, looked up per whitespace token by
+# FilterClassifier.handle_abbreviations.
+# NOTE(review): "ce" and "me" each appear twice below. In a dict literal the
+# last occurrence wins, so the effective values are "civil engineering" and
+# "master of engineering"; the earlier entries are dead. Confirm which
+# expansions are intended.
+ABBREVIATIONS = {
+    "ce": "computer engineering",  # NOTE(review): overridden by the later "ce" entry
+    "cse": "computer science engineering",
+    "ds": "data science",
+    "it": "information technology",
+    "ict": "information communication technology",
+    "ece": "electronics communication engineering",
+    "ei": "electronics instrumentation engineering",
+    "ic": "instrumentation control",
+    "ee": "electrical engineering",
+    "pe": "power electronics",
+    "me": "mechanical engineering",  # NOTE(review): overridden by the later "me" entry
+    "am": "applied mechanics",
+    "che": "chemical engineering",
+    "ch": "chemical",
+    "ce": "civil engineering",  # effective value for "ce"
+    "ug": "undergraduate",
+    "pg": "postgraduate",
+    "be": "bachelor of engineering",
+    "btech": "bachelor of technology",
+    "me": "master of engineering",  # effective value for "me"
+    "mtech": "master of technology",
+    "mba": "master of business administration",
+    "mca": "master of computer applications",
+    "tnp": "training and placement",
+    "nss": "national service scheme",
+    "ncc": "national cadet corps",
+    "ieee": "institute of electrical and electronics engineers",
+    "iei": "institution of engineers india",
+    "wdc": "women development cell",
+    "sip": "student innovation project",  # NOTE(review): ANCHORS uses the label "ssip"; confirm "sip" is the intended key
+    "gtu": "gujarat technological university",
+    "nba": "national board of accreditation",
+    "naac": "national assessment and accreditation council",
+    "nirf": "national institutional ranking framework",
+    "aicte": "all india council for technical education",
+    "drdo": "defence research and development organisation",
+    "isro": "indian space research organisation",
+    "dst": "department of science and technology",
+    "sih": "smart india hackathon",
+    "lpa": "lakhs per annum",
+    "noc": "no objection certificate",
+    "tc": "transfer certificate",
+    "hod": "head of department",
+    "phd": "doctor of philosophy",
+    "scada": "supervisory control and data acquisition",
+    "plc": "programmable logic controller",
+    "lvdt": "linear variable differential transformer",
+    "mosfet": "metal oxide semiconductor field effect transistor",
+    "igbt": "insulated gate bipolar transistor",
+    "vlsi": "very large scale integration",
+    "cad": "computer aided design",
+    "cam": "computer aided manufacturing",
+    "erp": "enterprise resource planning",
+    "rf": "radio frequency",
+    "nlp": "natural language processing",
+    "ai": "artificial intelligence",
+    "ml": "machine learning",
+    "scopus": "scopus",  # identity mapping: expanding it leaves the token unchanged
+    "sci": "science citation index",
+    "ip": "intellectual property",
+}
+# Valid category and topic labels for each type. FilterClassifier.classify
+# uses this to restrict category/topic matching to labels that make sense for
+# the detected type. A "categories" list of [None] marks a type without
+# categories; classify() skips category matching for it.
+MASTER_INDEX = {
+    "department": {
+        "categories": [
+            "applied_mechanics", "chemical", "civil", "computer", "cse_ds",
+            "electronics_comm", "electronics_inst", "electrical", "it", "ict",
+            "instrumentation", "mechanical", "power_electronics", "science_humanities"
+        ],
+        "topics": [
+            "faculty", "lab", "syllabus", "timetable", "event",
+            "project", "virtual_tour", "notice", "contact"
+        ]
+    },
+    "facility":   {"categories": [None], "topics": ["lab", "facilities", "contact", "virtual_tour"]},
+    "service":    {
+        "categories": ["transport", "finance", "medical", "sports", "grievance", "forms", "principal", "accreditation", "awards"],
+        "topics": ["fees", "rules", "process", "document", "route", "contact", "calendar", "stats", "vision", "induction"]
+    },
+    "hostel":     {"categories": [None], "topics": ["fees", "rules", "facilities", "contact"]},
+    "library":    {"categories": [None], "topics": ["fees", "rules", "facilities", "contact", "calendar", "document"]},
+    "placement":  {"categories": ["tnp"], "topics": ["stats", "process", "contact", "event"]},
+    "research":   {"categories": ["patent", "ssip", "funded", "publication"], "topics": ["stats", "project", "process", "contact"]},
+    "club":       {"categories": ["nss", "ncc", "ieee", "iei", "adventure", "women_cell"], "topics": ["event", "contact", "process", "stats", "notice"]},
+    # NOTE(review): "result" has no anchor under ANCHORS["topic"]; add an anchor or drop the label.
+    "admission":  {"categories": ["ug", "pg"], "topics": ["process", "fees", "document", "result", "calendar", "contact"]}
+}
+from sentence_transformers import SentenceTransformer
+from sklearn.metrics.pairwise import cosine_similarity
+import numpy as np
+import re
+from app.core.config import settings
+from typing import Optional
+from rank_bm25 import BM25Okapi
+# Per-field acceptance threshold: the best label is returned only when its
+# combined (weighted embedding + normalized BM25) score is strictly greater
+# than this value.
+FIELD_THRESHOLDS = {
+    "type": 0.25,      # Was 0.5 - too high for embedding-heavy field
+    "category": 0.5,   # Keep - BM25-heavy works well
+    "topic": 0.4,      # Was 0.5 - slight reduction
+    "intent": 0.5,     # Keep - usually clear signals
+}
+# Per-field score weights as (embedding weight, BM25 weight).
+FIELD_WEIGHTS = {
+    "type":     (0.6, 0.4),   # embedding-heavy — semantic
+    "category": (0.35, 0.65), # BM25-heavy — exact names matter most
+    "topic":    (0.55, 0.45),
+    "intent":   (0.7, 0.3),   # embedding-heavy — semantic intent
+}
+class FilterClassifier:
+    def __init__(self, threshold=None):
+        self.model = SentenceTransformer(str(settings.embeddings_path / "bge-small"))
+        self.anchor_embeddings = {}
+        self.bm25_classifiers = {}
+        self.anchor_keys = {}
+        self._build_anchor_embeddings()
+        self._build_bm25()
+        self.threshold = threshold
+    def _build_anchor_embeddings(self):
+        for field, anchors in ANCHORS.items():
+            self.anchor_embeddings[field] = {
+                label: self.model.encode(f"{label} {text}")
+                for label, text in anchors.items()
+            }
+    def _build_bm25(self):
+        for field, anchors in ANCHORS.items():
+            keys = list(anchors.keys())
+            docs = [f"{label} {text}".lower().split() for label, text in anchors.items()]
+            self.anchor_keys[field] = keys
+            self.bm25_classifiers[field] = BM25Okapi(docs)
+    def handle_abbreviations(self, query: str) -> str:
+        tokens = query.lower().split()
+        expanded = [ABBREVIATIONS.get(t, t) for t in tokens]
+        return " ".join(expanded)
+    def preprocess(self, query: str) -> str:
+        # Remove punctuation except spaces
+        query = re.sub(r'[^\w\s]', ' ', query.lower())
+        # Handle multiple spaces
+        query = re.sub(r'\s+', ' ', query).strip()
+        return query
+    def classify(self, query: str) -> dict:
+        query = self.handle_abbreviations(query)
+        query = self.preprocess(query)
+        query_emb = self.model.encode(query)
+        tokenized = query.lower().split()
+        result = {"type": None, "category": None, "topic": None, "year": None, "intent": None}
+        # 1. Classify Primary Fields (Type and Intent)
+        result["type"] = self._get_best_match("type", query_emb, tokenized)
+        result["intent"] = self._get_best_match("intent", query_emb, tokenized)
+        # 2. Extract Year (Independent)
+        year_match = re.search(r"\b(20\d{2})\b", query)
+        result["year"] = int(year_match.group()) if year_match else None
+        # 3. Cascading Classification for Category and Topic
+        if result["type"]:
+            valid_config = MASTER_INDEX.get(result["type"], {})
+            # Filtered Category
+            valid_cats = valid_config.get("categories", [])
+            if valid_cats and valid_cats != [None]:
+                result["category"] = self._get_best_match("category", query_emb, tokenized, allowed_labels=valid_cats)
+            # Filtered Topic
+            valid_topics = valid_config.get("topics", [])
+            if valid_topics:
+                result["topic"] = self._get_best_match("topic", query_emb, tokenized, allowed_labels=valid_topics)
+        return result
+    def _get_best_match(self, field: str, query_emb: np.ndarray, tokenized: list, allowed_labels: list = None) -> Optional[str]:
+        """Helper to find the best match for a field, optionally restricted to a subset of labels."""
+        keys = self.anchor_keys[field]
+        value_embeddings = self.anchor_embeddings[field]
+        # If restricted, only consider allowed labels
+        target_keys = allowed_labels if allowed_labels else keys
+        # 1. Embedding scores
+        emb_scores = {
+            val: cosine_similarity([query_emb], [value_embeddings[val]])[0][0]
+            for val in target_keys if val in value_embeddings
+        }
+        # 2. BM25 scores (subset aware)
+        raw_bm25 = self.bm25_classifiers[field].get_scores(tokenized)
+        global_bm25_max = max(raw_bm25) if len(raw_bm25) > 0 and max(raw_bm25) > 0 else 1
+        # We need to map global BM25 scores to our subset
+        subset_bm25 = {}
+        for val in target_keys:
+            if val in keys:
+                idx = keys.index(val)
+                subset_bm25[val] = raw_bm25[idx]
+        # Normalize BM25 scores using the GLOBAL maximum to keep perspective
+        normalized_bm25 = {v: s / global_bm25_max for v, s in subset_bm25.items()}
+        # 3. Combine with Weights
+        emb_w, bm25_w = FIELD_WEIGHTS[field]
+        combined = {
+            val: (emb_w * emb_scores.get(val, 0)) + (bm25_w * normalized_bm25.get(val, 0))
+            for val in target_keys
+        }
+        if not combined:
+            return None
+        best_val = max(combined, key=combined.get)
+        best_score = combined[best_val]
+        print(f"{field}: {best_val} | combined={best_score:.3f} emb={emb_scores.get(best_val,0):.3f} bm25={normalized_bm25.get(best_val,0):.3f}")
+        return best_val if best_score > FIELD_THRESHOLDS[field] else None
+# Module-level shared instance. Constructing it at import time loads the
+# SentenceTransformer model, encodes every anchor in ANCHORS and builds the
+# per-field BM25 indexes.
+classifier = FilterClassifier()

app/services/filter_classifier.py ADDED Viewed

	@@ -0,0 +1,529 @@

+ANCHORS = {
+    # ─────────────────────────────────────────────────
+    # TYPE — written as exemplar descriptions so the
+    # SentenceTransformer embeddings are maximally
+    # separated. Each entry must be clearly distinct.
+    # ─────────────────────────────────────────────────
+    "type": {
+        "department": (
+            "Which engineering department offers this course? "
+            "Tell me about the academic branch, its faculty, subjects, labs, syllabus and semester structure."
+        ),
+        "facility": (
+            "Where is this campus facility located? "
+            "Show me the central infrastructure, physical spaces, buildings, blocks, floors and rooms on campus."
+        ),
+        "service": (
+            "How do I use this administrative service? "
+            "I need help from an office, cell, section or support desk for student welfare."
+        ),
+        "hostel": (
+            "What are the hostel rules and accommodation details? "
+            "I want to know about the dormitory, residence hall, mess, warden and boarding facilities."
+        ),
+        "library": (
+            "How do I borrow books from the library? "
+            "Tell me about the reading room, book catalog, journal issue, return and lending system."
+        ),
+        "placement": (
+            "Which companies came for campus placement this year? "
+            "I want to know about recruitment drives, offer letters, packages, TNP cell and placement statistics."
+        ),
+        "research": (
+            "How do I apply for a funded research project? "
+            "Tell me about grants, patents, publications, PhD scholars and innovation at the institute."
+        ),
+        "club": (
+            "How do I join a student club or society? "
+            "Tell me about extracurricular chapters like NSS, NCC, IEEE and cultural or sports committees."
+        ),
+        "admission": (
+            "What is the admission process to join this college? "
+            "I want to know about enrollment, eligibility criteria, merit list, application and registration."
+        ),
+    },
+    # ─────────────────────────────────────────────────
+    # CATEGORY — exemplar sentences, grouped by parent
+    # type. Unique, distinctive keywords embedded in
+    # natural sentences to prevent cross-category leaks.
+    # ─────────────────────────────────────────────────
+    "category": {
+        # ── departments ──
+        "applied_mechanics": (
+            "The Applied Mechanics department covers statics, dynamics, solid mechanics, "
+            "fluid mechanics, stress-strain analysis and structural deformation."
+        ),
+        "chemical": (
+            "The Chemical Engineering department covers chemistry, thermodynamics, reaction engineering, "
+            "distillation, process plant design and petrochemical processes."
+        ),
+        "civil": (
+            "The Civil Engineering department covers structural engineering, construction, "
+            "geotechnical surveying, transportation, concrete design, roads and bridges."
+        ),
+        "computer": (
+            "The Computer Engineering department covers computer hardware, microprocessors, "
+            "VLSI design, embedded systems, digital circuits and chip architecture."
+        ),
+        "cse_ds": (
+            "The Computer Science and Data Science department covers algorithms, artificial intelligence, "
+            "machine learning, neural networks, deep learning, NLP and data analytics."
+        ),
+        "electronics_comm": (
+            "The Electronics and Communication Engineering department covers signal processing, "
+            "analog circuits, RF systems, wireless communication, antennas and microwave technology."
+        ),
+        "electronics_inst": (
+            "The Electronics and Instrumentation Engineering department covers sensors, transducers, "
+            "LVDT, biomedical instrumentation, measurement, calibration and control systems."
+        ),
+        "electrical": (
+            "The Electrical Engineering department covers power systems, generation, transmission, "
+            "distribution, motors, transformers, induction machines and synchronous drives."
+        ),
+        "it": (
+            "The Information Technology department covers software development, ERP systems, "
+            "cloud computing, databases, DevOps, web applications and enterprise solutions."
+        ),
+        "ict": (
+            "The Information and Communication Technology department covers telecom, networking, "
+            "fiber optics, routing, switching, bandwidth and internet protocols."
+        ),
+        "instrumentation": (
+            "The Instrumentation and Control department covers PLC, SCADA, automation, "
+            "process control, feedback loops and industrial control plant systems."
+        ),
+        "mechanical": (
+            "The Mechanical Engineering department covers manufacturing, machining, thermal engineering, "
+            "fluid mechanics, CAD CAM design, turbines, heat transfer and production."
+        ),
+        "power_electronics": (
+            "The Power Electronics department covers converters, inverters, MOSFETs, IGBTs, "
+            "rectifiers, choppers, variable frequency drives and switching circuits."
+        ),
+        "science_humanities": (
+            "The Science and Humanities department covers applied physics, chemistry, mathematics, "
+            "English communication and basic foundation sciences for engineering."
+        ),
+        # ── service categories ──
+        "transport": (
+            "The college transport service operates bus routes for commuting. "
+            "I want to know the bus stop, timing, pickup and drop schedule and shuttle conveyance."
+        ),
+        "finance": (
+            "The finance office handles tuition fees, semester payment, fine, scholarship, refund and bank challan."
+        ),
+        "medical": (
+            "The campus medical facility has a doctor, dispensary and clinic for first aid, "
+            "nursing and student health treatment."
+        ),
+        "sports": (
+            "The sports facility has grounds, a gym and courts for cricket, football, "
+            "badminton, volleyball, athletics and fitness activities."
+        ),
+        "grievance": (
+            "The grievance redressal cell handles student complaints about harassment, "
+            "discrimination based on SC ST OBC, gender issues and appeal inquiries."
+        ),
+        "forms": (
+            "I need to download a bonafide certificate, NOC, migration form, TC or no-objection document from the college."
+        ),
+        # ── admission categories ──
+        "ug": (
+            "Undergraduate BE BTech bachelor degree admission through GUJCET JEE. "
+            "Four year engineering program, first year intake, lateral entry eligibility."
+        ),
+        "pg": (
+            "Postgraduate ME MTech MBA MCA master degree admission through GATE MAT entrance exam. "
+            "Two year program eligibility and registration process."
+        ),
+        # ── placement categories ──
+        "tnp": (
+            "The Training and Placement cell organizes campus recruitment drives. "
+            "Companies visit for interviews, offer letters and placement packages are coordinated by the placement officer."
+        ),
+        # ── research categories ──
+        "patent": (
+            "A patent was filed for a new invention idea. "
+            "Intellectual property protection, granted innovation rights for student or faculty work."
+        ),
+        "ssip": (
+            "The SSIP scheme funds student startup and innovation projects. "
+            "Gujarat government provides seed money, incubation and entrepreneurship support."
+        ),
+        "funded": (
+            "This is a sponsored research project funded by DST, ISRO or DRDO. "
+            "External grant, industry collaboration, government funded research work."
+        ),
+        "publication": (
+            "A research paper was published in a Scopus or SCI journal. "
+            "Conference proceedings, citation, article authorship and research publication record."
+        ),
+        # ── club categories ──
+        "nss": (
+            "NSS National Service Scheme organizes volunteer activities, blood donation camps, "
+            "social awareness programs and community service for students."
+        ),
+        "ncc": (
+            "NCC National Cadet Corps trains cadets in army, navy and air force drills, "
+            "parade, and issues B and C certificates."
+        ),
+        "ieee": (
+            "The IEEE student chapter organizes technical symposiums, paper presentations "
+            "and seminars for electrical and electronics engineering students."
+        ),
+        "iei": (
+            "The IEI Institution of Engineers India student chapter is a professional body "
+            "offering membership and extracurricular technical activities."
+        ),
+        "adventure": (
+            "The adventure club organizes trekking, hiking, camping, outdoor expeditions "
+            "and rock climbing activities in nature."
+        ),
+        "women_cell": (
+            "The Women Development Cell promotes gender equality, ladies empowerment, "
+            "and handles harassment redressal for female students and staff."
+        ),
+        # ── administration categories ──
+        "principal": (
+            "The principal is the head of the institution. "
+            "The director, chairman and governing body manage top-level college administration."
+        ),
+        "accreditation": (
+            "The college has NBA, NAAC, NIRF rankings and AICTE GTU affiliation. "
+            "Accreditation grade, approval score and institutional ranking details."
+        ),
+        "awards": (
+            "The college has received awards and recognition at national and state level. "
+            "Students and faculty have achieved distinctions, trophies and rank honors."
+        ),
+    },
+    # ─────────────────────────────────────────────────
+    # TOPIC — aspects of a subject. Written as distinct
+    # question fragments to separate overlapping terms.
+    # ─────────────────────────────────────────────────
+    "topic": {
+        "faculty": (
+            "Who are the faculty members? I want to know about professors, lecturers, "
+            "HOD designation, teaching staff qualification and PhD details."
+        ),
+        "lab": (
+            "Where is the laboratory? I want to know about practical experiments, "
+            "equipment, instruments, workshop setup and apparatus in the lab."
+        ),
+        "syllabus": (
+            "What is the course syllabus? Show me the curriculum, subjects, units, "
+            "chapters and semester-wise GTU prescribed course content."
+        ),
+        "timetable": (
+            "What is the class timetable? I need the lecture schedule, period slots, "
+            "weekly routine and batch division timing."
+        ),
+        "event": (
+            "What events are coming up? Tell me about the fest, hackathon, seminar, "
+            "workshop, cultural program or technical competition organized."
+        ),
+        "project": (
+            "Tell me about student projects. I want to know about final year projects, "
+            "mini projects, SIH capstone work and dissertation submissions."
+        ),
+        "virtual_tour": (
+            "Can I take a virtual tour of the campus? "
+            "Show me the 360-degree online walkthrough, interactive map of rooms and infrastructure."
+        ),
+        "notice": (
+            "Are there any new notices? Show me the latest announcements, circulars, "
+            "bulletin board updates and recent notifications."
+        ),
+        "fees": (
+            "What are the fees? I want the tuition fee structure, semester charges, "
+            "fine, late fee, scholarship and payment due breakdown."
+        ),
+        "rules": (
+            "What are the rules? Tell me about college regulations, discipline policy, "
+            "code of conduct, dress code and behaviour guidelines."
+        ),
+        "facilities": (
+            "What facilities are available? Tell me about amenities like WiFi, mess, canteen, "
+            "gym, recreation areas and other campus services provided."
+        ),
+        "contact": (
+            "How do I contact them? I need the phone number, email address, "
+            "office location and person to reach at the department."
+        ),
+        "process": (
+            "What is the process to apply? Tell me the step-by-step procedure, "
+            "method, eligibility requirement and workflow sequence."
+        ),
+        "document": (
+            "What documents do I need? I need a bonafide certificate, migration form, "
+            "leaving certificate, TC or official attestation and verification."
+        ),
+        "route": (
+            "What is the bus route? Tell me the bus stop, pickup and drop point, "
+            "commute map, schedule and destination timing."
+        ),
+        "stats": (
+            "What are the statistics? Show me data, figures, total numbers, pass percentage, "
+            "ratio, achievements and placement records."
+        ),
+        "calendar": (
+            "What does the academic calendar look like? Show me exam dates, holidays, "
+            "semester start and end, and important event deadlines."
+        ),
+        "vision": (
+            "What is the vision and mission of the college? "
+            "Tell me the goals, objectives, values, motto and philosophy statement."
+        ),
+        "induction": (
+            "When is the induction program? Tell me about the orientation, welcome program, "
+            "freshman schedule, speaker list and new student activities."
+        ),
+    },
+    # ─────────────────────────────────────────────────
+    # INTENT — semantically distant action patterns.
+    # Use strong, distinct phrasing to avoid overlap.
+    # ─────────────────────────────────────────────────
+    "intent": {
+        "list": (
+            "List all available options. Show me every item. "
+            "Give me a complete enumeration. Display all choices."
+        ),
+        "count": (
+            "How many are there? What is the total count? "
+            "Tell me the number, quantity and strength."
+        ),
+        "detail": (
+            "What is this? Explain it to me. Describe and tell me about it. "
+            "I want an overview, summary and background information."
+        ),
+        "process": (
+            "How do I do this? What are the steps? "
+            "Guide me through the procedure, method and eligibility requirements."
+        ),
+        "greeting": (
+            "Hello! Hi, good morning, good evening. "
+            "How are you? Namaste. Hey, greetings to you."
+        ),
+    },
+}
+ABBREVIATIONS = {  # lower-case token -> space-separated expansion; applied per whitespace token by FilterClassifier.handle_abbreviations
+    # Department abbreviations
+    "ce": "computer civil engineering",  # NOTE(review): mixes "computer" and "civil" - presumably because "CE" is ambiguous; confirm
+    "cse": "computer science engineering",
+    "ds": "data science",
+    "it": "information technology",  # NOTE(review): queries are lower-cased before lookup, so the pronoun "it" is expanded too; confirm this is acceptable
+    "ict": "information communication technology",
+    "ece": "electronics communication engineering",
+    "ei": "electronics instrumentation engineering",
+    "ic": "instrumentation control",
+    "ee": "electrical engineering",
+    "pe": "power electronics",
+    "me": "mechanical master engineering",  # NOTE(review): mixes "mechanical" and "master" - presumably ME the branch vs. ME the degree; confirm
+    "am": "applied mechanics",
+    "che": "chemical engineering",
+    "ch": "chemical",
+    # Degree and admission-level abbreviations
+    "ug": "undergraduate",
+    "pg": "postgraduate",
+    "be": "bachelor engineering",
+    "btech": "bachelor technology",
+    "mtech": "master technology",
+    "mba": "master business administration",
+    "mca": "master computer applications",
+    # Organisations, cells and accreditation bodies
+    "tnp": "training placement",
+    "nss": "national service scheme",
+    "ncc": "national cadet corps",
+    "ieee": "institute electrical electronics engineers",
+    "iei": "institution engineers india",
+    "wdc": "women development cell",
+    "sip": "student innovation project",
+    "gtu": "gujarat technological university",
+    "nba": "national board accreditation",
+    "naac": "national assessment accreditation council",
+    "nirf": "national institutional ranking framework",
+    "aicte": "all india council technical education",
+    "drdo": "defence research development organisation",
+    "isro": "indian space research organisation",
+    "dst": "department science technology",
+    # General campus vocabulary
+    "sih": "smart india hackathon",
+    "lpa": "lakhs per annum",
+    "noc": "no objection certificate",
+    "tc": "transfer certificate",
+    "hod": "head department",
+    "phd": "doctor philosophy",
+    # Technical terms
+    "scada": "supervisory control data acquisition",
+    "plc": "programmable logic controller",
+    "lvdt": "linear variable differential transformer",
+    "mosfet": "metal oxide semiconductor field effect transistor",
+    "igbt": "insulated gate bipolar transistor",
+    "vlsi": "very large scale integration",
+    "cad": "computer aided design",
+    "cam": "computer aided manufacturing",
+    "erp": "enterprise resource planning",
+    "rf": "radio frequency",
+    "nlp": "natural language processing",
+    "ai": "artificial intelligence",
+    "ml": "machine learning",
+    "ip": "intellectual property",
+}
+MASTER_INDEX = {  # per document type: the category and topic labels FilterClassifier.classify() keeps; any other prediction is reset to None
+    "department": {
+        "categories": [
+            "applied_mechanics", "chemical", "civil", "computer", "cse_ds",
+            "electronics_comm", "electronics_inst", "electrical", "it", "ict",
+            "instrumentation", "mechanical", "power_electronics", "science_humanities"
+        ],
+        "topics": [
+            "faculty", "lab", "syllabus", "timetable", "event",
+            "project", "virtual_tour", "notice", "contact"
+        ]
+    },
+    "facility":   {"categories": [None], "topics": ["lab", "facilities", "contact", "virtual_tour"]},  # [None]: no named categories, so classify() clears any predicted category for this type
+    "service":    {
+        "categories": ["transport", "finance", "medical", "sports", "grievance", "forms", "principal", "accreditation", "awards"],
+        "topics": ["fees", "rules", "process", "document", "route", "contact", "calendar", "stats", "vision", "induction"]
+    },
+    "hostel":     {"categories": [None], "topics": ["fees", "rules", "facilities", "contact"]},
+    "library":    {"categories": [None], "topics": ["fees", "rules", "facilities", "contact", "calendar", "document"]},
+    "placement":  {"categories": ["tnp"], "topics": ["stats", "process", "contact", "event"]},
+    "research":   {"categories": ["patent", "ssip", "funded", "publication"], "topics": ["stats", "project", "process", "contact"]},
+    "club":       {"categories": ["nss", "ncc", "ieee", "iei", "adventure", "women_cell"], "topics": ["event", "contact", "process", "stats", "notice"]},
+    "admission":  {"categories": ["ug", "pg"], "topics": ["process", "fees", "document", "result", "calendar", "contact"]}  # NOTE(review): ANCHORS["topic"] has no "result" anchor, so classify() can never predict it - confirm whether an anchor is missing
+}
+from sentence_transformers import SentenceTransformer
+from sklearn.metrics.pairwise import cosine_similarity
+import numpy as np
+import re
+from app.core.config import settings
+from typing import Optional
+from rank_bm25 import BM25Okapi
+FIELD_THRESHOLDS = {  # per field: classify() keeps the best label only if its combined score is strictly greater than this value
+    "type": 0.25,      # Lowered from 0.5, which was too high for this embedding-heavy field
+    "category": 0.5,   # Kept at 0.5 - the BM25-heavy blend works well here
+    "topic": 0.4,      # Lowered slightly from 0.5
+    "intent": 0.5,     # Kept at 0.5 - intent signals are usually clear
+}
+FIELD_WEIGHTS = {  # per field: (embedding_weight, bm25_weight) used by classify() to blend cosine similarity with max-normalised BM25
+    "type":     (0.6, 0.4),   # embedding-heavy - semantic match matters most
+    "category": (0.35, 0.65), # BM25-heavy - exact names matter most
+    "topic":    (0.55, 0.45),
+    "intent":   (0.7, 0.3),   # embedding-heavy - semantic intent
+}
+class FilterClassifier:
+    """Classify a query into ``type`` / ``category`` / ``topic`` / ``intent`` labels plus a year.
+    Every field in ``ANCHORS`` is scored against its labelled anchor texts with a
+    weighted blend of sentence-embedding cosine similarity and max-normalised
+    BM25 (weights from ``FIELD_WEIGHTS``). The best label of a field is kept only
+    when its blended score is strictly greater than ``FIELD_THRESHOLDS[field]``.
+    """
+    def __init__(self, threshold=None):
+        """Load the embedding model and pre-compute anchor embeddings and BM25 indexes.
+        Parameters
+        ----------
+        threshold:
+            Stored as ``self.threshold``. ``classify`` does not read it; the
+            per-field values in ``FIELD_THRESHOLDS`` are used instead.
+        """
+        self.model = SentenceTransformer(str(settings.embeddings_path / "bge-small"))
+        self.anchor_embeddings = {}
+        self.bm25_classifiers = {}
+        self.anchor_keys = {}
+        self.threshold = threshold
+        self._build_anchor_embeddings()
+        self._build_bm25()
+    def _build_anchor_embeddings(self):
+        """Encode ``"<label> <anchor text>"`` for every label of every field."""
+        for field, anchors in ANCHORS.items():
+            self.anchor_embeddings[field] = {
+                label: self.model.encode(f"{label} {text}")
+                for label, text in anchors.items()
+            }
+    def _build_bm25(self):
+        """Build one BM25 index per field over the lower-cased, whitespace-split anchor texts."""
+        for field, anchors in ANCHORS.items():
+            # anchor_keys keeps the label order so BM25 score i maps back to label i.
+            self.anchor_keys[field] = list(anchors)
+            corpus = [f"{label} {text}".lower().split() for label, text in anchors.items()]
+            self.bm25_classifiers[field] = BM25Okapi(corpus)
+    def handle_abbreviations(self, query: str) -> str:
+        """Lower-case ``query`` and replace each whitespace token found in ``ABBREVIATIONS``.
+        Tokens are matched exactly, so a token with attached punctuation
+        (``"cse?"``) is left untouched; run ``preprocess`` first to avoid that.
+        """
+        return " ".join(ABBREVIATIONS.get(token, token) for token in query.lower().split())
+    def preprocess(self, query: str) -> str:
+        """Lower-case ``query``, turn punctuation into spaces and collapse whitespace runs."""
+        # Anything that is neither a word character nor whitespace becomes a space.
+        query = re.sub(r'[^\w\s]', ' ', query.lower())
+        return re.sub(r'\s+', ' ', query).strip()
+    def classify(self, query: str) -> dict:
+        """Return the predicted filter labels for ``query``.
+        Returns
+        -------
+        dict
+            Keys ``type``, ``category``, ``topic``, ``year`` and ``intent``. A label
+            is ``None`` when no anchor beat the field threshold. ``category`` and
+            ``topic`` are also ``None`` when ``type`` is ``None``, when ``type`` has
+            no ``MASTER_INDEX`` entry, or when the label is not listed for that type.
+            ``year`` is the first standalone 20xx number in the query, as an ``int``.
+        """
+        # Strip punctuation BEFORE expanding abbreviations: the expansion matches
+        # whole tokens, so "CSE?" or "(TNP)" would otherwise never be expanded.
+        query = self.handle_abbreviations(self.preprocess(query))
+        query_emb = self.model.encode(query)
+        tokenized = query.split()
+        result = {"type": None, "category": None, "topic": None, "year": None, "intent": None}
+        for field, value_embeddings in self.anchor_embeddings.items():
+            keys = self.anchor_keys[field]
+            # Embedding scores
+            emb_scores = {
+                val: cosine_similarity([query_emb], [emb])[0][0]
+                for val, emb in value_embeddings.items()
+            }
+            # BM25 scores, scaled so the best anchor gets 1.0; when nothing
+            # matches (max <= 0) divide by 1 to avoid a division by zero.
+            raw_bm25 = self.bm25_classifiers[field].get_scores(tokenized)
+            top_bm25 = max(raw_bm25)
+            bm25_max = top_bm25 if top_bm25 > 0 else 1
+            bm25_scores = {key: score / bm25_max for key, score in zip(keys, raw_bm25)}
+            # Combine
+            emb_w, bm25_w = FIELD_WEIGHTS[field]
+            combined = {
+                val: emb_w * emb_scores[val] + bm25_w * bm25_scores.get(val, 0)
+                for val in keys
+            }
+            best_val = max(combined, key=combined.get)
+            best_score = combined[best_val]
+            print(f"{field}: {best_val} | combined={best_score:.3f} emb={emb_scores[best_val]:.3f} bm25={bm25_scores.get(best_val,0):.3f}")
+            if best_score > FIELD_THRESHOLDS[field]:
+                result[field] = best_val
+        year = re.search(r"\b(20\d{2})\b", query)
+        result["year"] = int(year.group()) if year else None
+        # category/topic are only meaningful relative to a known type. Using
+        # .get() also covers a type label that has no MASTER_INDEX entry, which
+        # previously raised KeyError.
+        allowed = MASTER_INDEX.get(result["type"])
+        if allowed is None:
+            result["category"] = None
+            result["topic"] = None
+        else:
+            if result["category"] not in allowed["categories"]:
+                result["category"] = None
+            if result["topic"] not in allowed["topics"]:
+                result["topic"] = None
+        return result
+classifier = FilterClassifier()  # module-level instance: the model is loaded and all anchors are encoded when this module is imported

app/services/hybrid_retrieval.py ADDED Viewed

	@@ -0,0 +1,354 @@

+"""
+Hybrid Retrieval Service
+========================
+Combines BM25 (keyword) and vector (semantic) retrieval using
+Reciprocal Rank Fusion (RRF) for stable, well-tested score merging.
+Strategy:
+  1. Run vector similarity search → fetches top-k candidates from ChromaDB.
+  2. Those same candidate documents become the BM25 corpus (no second DB call).
+  3. Fuse both ranked lists using RRF.
+  4. Apply an optional title-match boost post-fusion.
+  5. Return the top-k results.
+Why RRF instead of EnsembleRetriever?
+  - EnsembleRetriever depends on langchain_classic which is unstable.
+  - RRF is score-agnostic: it only uses rank order, so you never need to
+    normalise BM25 scores against cosine distances.
+  - It's the standard fusion method in production hybrid search systems
+    (used by Elasticsearch, Cohere, etc.).
+"""
+from __future__ import annotations
+from dataclasses import dataclass, field
+from typing import List, Optional, Dict
+from langchain_community.retrievers import BM25Retriever
+from langchain_core.documents import Document
+from app.utils.preprocessing import preprocess
+from app.services.classifier_service import clf
+# ---------------------------------------------------------------------------
+# Configuration
+# ---------------------------------------------------------------------------
+@dataclass
+class HybridRetrievalConfig:
+    """Tunable parameters for the hybrid (BM25 + vector) retrieval pipeline."""
+    # Size of the candidate pool: passed as k to the vector search; BM25 then re-ranks that same pool
+    candidate_k: int = 15
+    # Number of results retrieve() returns after fusion, boosting and sorting
+    top_k: int = 5
+    # Per-list weights applied to each 1/(rrf_k + rank) term; intended to sum to 1.0 (not enforced)
+    bm25_weight: float = 0.6
+    vector_weight: float = 0.4
+    # RRF constant – larger k smooths rank differences (standard default: 60)
+    rrf_k: int = 60
+    # BM25 hyperparameters, forwarded as bm25_params to BM25Retriever.from_documents
+    bm25_k1: float = 1.5   # term frequency saturation
+    bm25_b: float = 0.5    # length normalisation
+    # Title-match boost: added to the fused score (before the final x10 rescale) for each query word found in the title
+    title_boost_per_word: float = 0.1
+    # Minimum score a result needs to be returned; compared AFTER retrieve() multiplies fused scores by 10 (0.0 disables)
+    score_threshold: float = 0.0
+# ---------------------------------------------------------------------------
+# Result type
+# ---------------------------------------------------------------------------
+@dataclass
+class RetrievalResult:
+    """A single retrieved document together with the scores that ranked it."""
+    document: Document
+    fused_score: float  # weighted RRF sum; retrieve() later applies filter boosts, adds title_boost and multiplies by 10
+    bm25_rank: Optional[int] = None    # rank in BM25 list (1-indexed), None if absent
+    vector_rank: Optional[int] = None  # rank in vector list (1-indexed), None if absent
+    title_boost: float = 0.0  # amount added to fused_score by _apply_title_boost
+# ---------------------------------------------------------------------------
+# Core service
+# ---------------------------------------------------------------------------
+class HybridRetrievalService:
+    """
+    Hybrid retrieval that fuses BM25 and vector search results via RRF.
+    The vector store supplies the candidate pool, BM25 re-ranks that same pool,
+    both rankings are fused, and the fused scores are then adjusted by
+    classifier-filter boosts and a title-match boost.
+    Usage
+    -----
+    ::
+        service = HybridRetrievalService(vector_db=rag.db)
+        results = service.retrieve(query="Faculties of Computer Department")
+        for r in results:
+            print(r.fused_score, r.document.page_content[:80])
+    """
+    def __init__(
+        self,
+        vector_db,
+        config: Optional[HybridRetrievalConfig] = None,
+    ):
+        """
+        Parameters
+        ----------
+        vector_db:
+            A LangChain-compatible vector store (e.g., Chroma instance from
+            ``RAGService.db``) that supports ``similarity_search_with_score``.
+        config:
+            Optional configuration object. Defaults to ``HybridRetrievalConfig()``.
+        """
+        self.vector_db = vector_db
+        self.cfg = config or HybridRetrievalConfig()
+        self.classifier = clf
+        # Latest classifier prediction; replaced by _vector_rank on every query.
+        self.raw_filters = {}
+    # ------------------------------------------------------------------
+    # Public API
+    # ------------------------------------------------------------------
+    def retrieve(
+        self,
+        query: str,
+    ) -> List[RetrievalResult]:
+        """
+        Run hybrid retrieval and return ranked results.
+        Vector search runs once to fetch the candidate pool from ChromaDB.
+        Those documents are immediately reused as the BM25 corpus, so there
+        is no redundant database call.
+        Parameters
+        ----------
+        query:
+            The raw user query (abbreviation expansion is applied internally).
+        Returns
+        -------
+        List[RetrievalResult]
+            At most ``cfg.top_k`` results sorted by descending fused score.
+            Returned scores are the fused scores multiplied by 10, and each
+            document's ``page_content`` is the original, unprocessed text.
+        """
+        processed_query = self.classifier.expand_abbreviations(query)
+        print("Processed Query: ", processed_query)
+        # Step 1: Single vector search — produces both the ranking AND the candidate pool
+        vector_ranking = self._vector_rank(processed_query)
+        if not vector_ranking:
+            return []
+        # Step 2: Rewrite each candidate as "<title>: <preprocessed text>" for BM25,
+        # keeping the untouched text in metadata so it can be restored in step 7.
+        candidate_docs = []
+        for doc, _score, _rank in vector_ranking:
+            doc.metadata["original_content"] = doc.page_content
+            # A missing or None title must not abort retrieval; _apply_title_boost
+            # already treats the title as optional.
+            title = doc.metadata.get("title") or ""
+            doc.page_content = f"{title}: {preprocess(doc.page_content)}"
+            candidate_docs.append(doc)
+        # Step 3: BM25 search over the same candidate pool (no extra DB call)
+        bm25_ranking = self._bm25_rank(processed_query, candidate_docs)
+        # Step 4: Fuse both rankings via RRF
+        fused = self._reciprocal_rank_fusion(bm25_ranking, vector_ranking)
+        # Step 5: Boost candidates whose metadata agrees with the classifier
+        self._apply_filter_boost(fused)
+        # Step 6: Additive boost for query words that occur in the title
+        boosted = self._apply_title_boost(fused, processed_query)
+        # Step 7: Rescale, restore the original text, then filter, sort and cut to top-k
+        for r in boosted:
+            r.fused_score = r.fused_score * 10
+            r.document.page_content = r.document.metadata["original_content"]
+            r.document.metadata["original_content"] = ""
+        kept = [r for r in boosted if r.fused_score >= self.cfg.score_threshold]
+        kept.sort(key=lambda r: r.fused_score, reverse=True)
+        return kept[: self.cfg.top_k]
+    # ------------------------------------------------------------------
+    # Private helpers
+    # ------------------------------------------------------------------
+    def _apply_filter_boost(self, results: List[RetrievalResult]) -> None:
+        """
+        Multiply fused scores in place for documents matching the classifier.
+        ``self.raw_filters`` is read as ``{field: label, "<field>_conf": confidence}``.
+        For each of type/category/topic/intent present in a document's metadata
+        and equal to the predicted label:
+          - confidence > 0.90 and the field is not "intent": multiply by the
+            field's entry in the table below;
+          - otherwise, confidence > 0.7: multiply by 1.05.
+        """
+        filter_boost = {
+            "type": 1.10,
+            "category": 1.20,
+            "topic": 1.20,
+            "intent": 1.05
+        }
+        for result in results:
+            metadata = result.document.metadata
+            for name, strong_boost in filter_boost.items():
+                if name not in metadata:
+                    continue
+                predicted = self.raw_filters.get(name)
+                if predicted is None or metadata[name] != predicted:
+                    continue
+                confidence = self.raw_filters.get(f"{name}_conf", 0)
+                if confidence > 0.90 and name != "intent":
+                    result.fused_score *= strong_boost
+                elif confidence > 0.7:
+                    result.fused_score *= 1.05
+    def _bm25_rank(
+        self,
+        processed_query: str,
+        candidate_docs: List[Document],
+    ) -> List[tuple[Document, float, int]]:
+        """
+        Run BM25 over the candidate pool.
+        Returns a list of (document, raw_bm25_score, rank) tuples,
+        ordered by descending score (rank is 1-indexed).
+        """
+        retriever = BM25Retriever.from_documents(
+            candidate_docs,
+            bm25_params={"k1": self.cfg.bm25_k1, "b": self.cfg.bm25_b},
+        )
+        # Tokenise the query with the retriever's own preprocess function so it
+        # matches how the corpus was tokenised.
+        tokens = retriever.preprocess_func(processed_query)
+        raw_scores = retriever.vectorizer.get_scores(tokens)
+        # Pair each document with its BM25 score and sort descending
+        scored = sorted(
+            zip(retriever.docs, raw_scores),
+            key=lambda pair: pair[1],
+            reverse=True,
+        )
+        return [(doc, score, rank) for rank, (doc, score) in enumerate(scored, start=1)]
+    def _vector_rank(
+        self,
+        processed_query: str,
+    ) -> List[tuple[Document, float, int]]:
+        """
+        Run vector similarity search against ChromaDB.
+        The classifier's filter is applied first; if there is no filter, or the
+        filtered search returns nothing, an unfiltered search is used instead.
+        As a side effect ``self.raw_filters`` is replaced with the classifier's
+        raw prediction for this query.
+        Returns a list of (document, similarity_score, rank) tuples in the
+        order the store returned them (rank is 1-indexed).
+        Chroma returns (document, distance); we convert to similarity = 1 - distance.
+        """
+        self.raw_filters = {}
+        filters = self.classifier.predict_with_filter([processed_query])
+        self.raw_filters = self.classifier.predict([processed_query])[0]
+        raw_results = []
+        if filters:
+            raw_results = self.vector_db.similarity_search_with_score(
+                processed_query, k=self.cfg.candidate_k, filter=filters
+            )
+        if not raw_results:
+            # No filter, or the filter matched nothing: fall back to a plain search.
+            raw_results = self.vector_db.similarity_search_with_score(
+                processed_query, k=self.cfg.candidate_k
+            )
+        return [
+            (doc, 1.0 - distance, rank)
+            for rank, (doc, distance) in enumerate(raw_results, start=1)
+        ]
+    def _reciprocal_rank_fusion(
+        self,
+        bm25_ranking: List[tuple[Document, float, int]],
+        vector_ranking: List[tuple[Document, float, int]],
+    ) -> List[RetrievalResult]:
+        """
+        Fuse two ranked lists using Reciprocal Rank Fusion (RRF).
+        RRF score for a document d:
+            score(d) = w_bm25 * 1/(k + rank_bm25(d))
+                     + w_vec  * 1/(k + rank_vec(d))
+        Documents not present in a list are simply omitted from that term.
+        The full ``page_content`` string is the deduplication key, so two
+        candidates with identical text are merged into one result.
+        """
+        rrf_k = self.cfg.rrf_k
+        # content key -> RetrievalResult; BM25 entries are inserted first.
+        fused: Dict[str, RetrievalResult] = {}
+        sources = (
+            (bm25_ranking, self.cfg.bm25_weight, "bm25_rank"),
+            (vector_ranking, self.cfg.vector_weight, "vector_rank"),
+        )
+        for ranking, weight, rank_attr in sources:
+            for doc, _score, rank in ranking:
+                key = doc.page_content
+                entry = fused.get(key)
+                if entry is None:
+                    entry = fused[key] = RetrievalResult(document=doc, fused_score=0.0)
+                entry.fused_score += weight * (1.0 / (rrf_k + rank))
+                setattr(entry, rank_attr, rank)
+        return list(fused.values())
+    def _apply_title_boost(
+        self,
+        results: List[RetrievalResult],
+        processed_query: str,
+    ) -> List[RetrievalResult]:
+        """
+        Boost fused score for documents whose title contains query words.
+        Each distinct query word found in the preprocessed title adds
+        ``cfg.title_boost_per_word`` to the score; the total is also stored in
+        ``result.title_boost``. Results are modified in place and returned.
+        """
+        query_words = set(processed_query.lower().split())
+        for result in results:
+            raw_title = result.document.metadata.get("title") or ""
+            title = preprocess(raw_title.lower())
+            if not title:
+                continue
+            # NOTE(review): `word in title` is a substring test on the whole title
+            # string, so short words can match inside longer ones - confirm intended.
+            boost = sum(
+                self.cfg.title_boost_per_word
+                for word in query_words
+                if word and word in title
+            )
+            result.title_boost = boost
+            result.fused_score += boost
+        return results

app/services/ingestion_service.py ADDED Viewed

	@@ -0,0 +1,95 @@

+from pathlib import Path
+from app.services.file_service import file_service
+from app.services.text_splitter import TextSplitter
+from langchain_core.documents import Document
+from app.utils.preprocessing import normalize
+import json
+from typing import List
+from fastapi import HTTPException
+class IngestionService:
+    """Read a file through ``file_service`` and turn it into chunk ``Document`` objects."""
+    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
+        """Store the chunking parameters and build the ``TextSplitter`` used for text files."""
+        self.chunk_size = chunk_size
+        self.chunk_overlap = chunk_overlap
+        self.text_splitter = TextSplitter(chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap)
+    def load_file(self, file_path: Path):
+        """Read ``file_path``, write it back through ``file_service`` and return its chunks.
+        Files whose metadata ``ext`` is ``"json"`` go through ``handle_json_docs``;
+        everything else goes through ``handle_text_docs``.
+        """
+        document = file_service.read_file(file_path)
+        metadata = document.metadata
+        # Save the file's content and metadata before chunking.
+        file_service.write_file(file_path, document.page_content, metadata)
+        if metadata["ext"] == "json":
+            return self.handle_json_docs(document, metadata)
+        return self.handle_text_docs(document, file_path, metadata)
+    def ingest(self, file_path: Path):
+        """Return the chunk documents for ``file_path`` (thin wrapper around ``load_file``)."""
+        return self.load_file(file_path)
+    def get_records(self):
+        """Return whatever ``file_service.get_records()`` returns."""
+        return file_service.get_records()
+    def delete_record(self, filename: str):
+        """Delete the record for ``filename`` via ``file_service``."""
+        return file_service.delete_record(filename)
+    def path_record(self, file_path: Path, metadata:dict):
+        """Patch the stored metadata of ``file_path`` (delegates to ``file_service.patch_metadata``)."""
+        file_service.patch_metadata(file_path, metadata)
+    def handle_json_docs(self, document: Document, metadata: dict) -> List[Document]:
+        """Build chunk documents from a JSON document.
+        Expected content shape::
+            { key: { "list": [str, ...], "detail": str }, ... }
+        For every key:
+          - a non-empty ``"list"`` yields one numbered-list chunk (intent ``"list"``)
+            followed by a ``"Total <key>: <n>"`` chunk (intent ``"count"``). Blank
+            entries are dropped before numbering and counting, so the numbering
+            is contiguous and the total matches the listed items;
+          - a non-blank ``"detail"`` / ``"details"`` yields one chunk with intent
+            ``"detail"``; a missing (``None``) or blank value is skipped.
+        Other intent names are ignored. ``chunk_index`` increases by one per
+        emitted chunk across the whole document.
+        """
+        docs = []
+        json_data = json.loads(document.page_content)
+        count = 0
+        for key, value in json_data.items():
+            for intent, intent_content in value.items():
+                if intent == "list":
+                    items = [item.strip() for item in (intent_content or []) if item.strip()]
+                    if not items:
+                        continue
+                    chunk = "".join(f"{number}. {item}\n" for number, item in enumerate(items, start=1))
+                    docs.append(Document(page_content=chunk, metadata={**metadata, "topic": key, "intent": intent, "chunk_index": count}))
+                    count += 1
+                    docs.append(Document(page_content=f"Total {key}: {len(items)}", metadata={**metadata, "topic": key, "intent": "count", "chunk_index": count}))
+                    count += 1
+                elif intent in ("detail", "details"):
+                    text = (intent_content or "").strip()
+                    if not text:
+                        continue
+                    docs.append(Document(page_content=text, metadata={**metadata, "topic": key, "intent": "detail", "chunk_index": count}))
+                    count += 1
+        return docs
+    def handle_text_docs(self, document: Document, file_path: Path, metadata: dict) -> List[Document]:
+        """Split a text document into normalised chunks.
+        Each chunk's metadata merges the splitter's metadata, the file metadata,
+        ``source`` (the file name) and ``chunk_index`` (position in the splitter
+        output). Chunks that are empty after ``normalize`` are dropped, so the
+        returned indexes can have gaps.
+        """
+        docs = []
+        for idx, piece in enumerate(self.text_splitter.split_documents([document])):
+            content = normalize(piece.page_content)
+            # Skip chunks that normalise to nothing but whitespace.
+            if not content.strip():
+                continue
+            docs.append(
+                Document(
+                    page_content=content,
+                    metadata={
+                        **piece.metadata,
+                        **metadata,
+                        "source": file_path.name,
+                        "chunk_index": idx
+                    }
+                )
+            )
+        return docs
+ingestion_service = IngestionService()  # shared module-level instance built with the default chunk_size=1000 and chunk_overlap=200

app/services/rag_service.py ADDED Viewed

	@@ -0,0 +1,483 @@

+from pathlib import Path
+import time
+# LangChain Imports
+from fastapi import HTTPException
+from langchain_core.prompts import  PromptTemplate
+from langchain_chroma import Chroma
+from langchain.messages import HumanMessage, AIMessage, SystemMessage
+from typing import Optional, List
+from app.api.schemas.tests import TestRequestSchema, TestClassifierReqSchema
+from sklearn.metrics import accuracy_score, f1_score, classification_report, precision_score, recall_score
+# Locals
+from app.services.text_splitter import TextSplitter
+from app.services.vector_store import VectorStore
+from app.utils.preprocessing import normalize, preprocess_documents, preprocess_query
+from app.utils.document_helpers import get_references, create_documents, build_metadata, get_references_v2
+from app.prompts import SYSTEM_PROMPT, wrap_exaone
+from app.core.config import settings
+from app.services.hybrid_retrieval import HybridRetrievalService, HybridRetrievalConfig
+from app.services.ingestion_service import IngestionService
+from app.services.classifier_service import clf
+def format_history(history: list[str]) -> str:
+    """Render alternating chat turns as ``"User: ..."`` / ``"Assistant: ..."`` lines.
+    Messages at even positions (0, 2, ...) are labelled ``User`` and those at
+    odd positions ``Assistant``; the lines are joined with newlines. An empty
+    history yields an empty string.
+    """
+    speakers = ("User", "Assistant")
+    return "\n".join(
+        f"{speakers[position % 2]}: {turn}"
+        for position, turn in enumerate(history)
+    )
+class RAGService:
+    def __init__(
+        self,
+        model,
+        collection_name: str = None,
+        persist_directory: str = None,
+        embedding_model = None,
+        k: int = None,
+    ):
+        """Wire up the LLM, the Chroma collection and the helper services.
+        Any of ``collection_name``, ``k`` and ``persist_directory`` that is
+        falsy falls back to the matching value in ``settings``.
+        """
+        # Models
+        self.model = model
+        self.embedding_model = embedding_model
+        # Settings-backed defaults
+        self.collection_name = collection_name if collection_name else settings.collection_name
+        self.k = k if k else settings.similarity_top_k
+        self.persist_directory = persist_directory if persist_directory else settings.persist_directory
+        self.evaluation = {}
+        # Vector database and the wrapper used for writes
+        chroma_options = {
+            "collection_name": self.collection_name,
+            "embedding_function": self.embedding_model,
+            "persist_directory": self.persist_directory,
+        }
+        self.db = Chroma(**chroma_options)
+        self.database = VectorStore(self.db)
+        self.text_splitter = TextSplitter()
+        self.template = PromptTemplate.from_template(SYSTEM_PROMPT)
+        # Plain similarity retriever returning the top ``self.k`` documents
+        search_options = {"k": self.k}
+        self.retriever = self.db.as_retriever(search_type="similarity", search_kwargs=search_options)
+    def get_filenames(self):
+        """Return the ingestion records reported by a fresh ``IngestionService``."""
+        return IngestionService().get_records()
+    def ingest_documents(self, filepath: str, chunk_size: int = None, chunk_overlap: int = None):
+        """Chunk the file at ``filepath``, add the chunks to the vector store and record stats.
+        Falsy ``chunk_size`` / ``chunk_overlap`` fall back to ``settings``.
+        Returns the list of chunks that were added.
+        Raises
+        ------
+        FileNotFoundError
+            If ``filepath`` does not exist.
+        HTTPException
+            Status 400 when ingestion yields no chunks, or a single chunk whose
+            content is blank.
+        """
+        started_at = time.time()
+        path = Path(filepath)
+        if not path.exists():
+            raise FileNotFoundError(f"File not found: {filepath}")
+        chunk_size = chunk_size if chunk_size else settings.chunk_size
+        chunk_overlap = chunk_overlap if chunk_overlap else settings.chunk_overlap
+        ingestion_service = IngestionService(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+        chunks = ingestion_service.ingest(path)
+        print("Chunks: ", chunks)
+        if not chunks:
+            raise HTTPException(
+                status_code=400,
+                detail=f"No text content found in '{path.name}'. The file may be image-based or empty."
+            )
+        if len(chunks) == 1 and not chunks[0].page_content.strip():
+            raise HTTPException(
+                status_code=400,
+                detail=f"Document '{path.name}' contains empty or unreadable content."
+            )
+        bytes_per_mb = 1024 * 1024
+        file_size_mb = path.stat().st_size / bytes_per_mb
+        # Add to database
+        self.database.add_documents(chunks)
+        # Rough size estimate: 768 values of 4 bytes plus chunk_size bytes, per chunk.
+        estimated_db_mb = ((768 * 4) + chunk_size) * len(chunks) / bytes_per_mb
+        record_stats = {
+            "doc_chunks": len(chunks),
+            "chunk_size": chunk_size,
+            "chunk_overlap": chunk_overlap,
+            "execution_time": time.time() - started_at,
+            "file_size": file_size_mb,
+            "db_size": estimated_db_mb
+        }
+        ingestion_service.path_record(file_path=path, metadata=record_stats)
+        print(f"✅ Added {len(chunks)} chunks from {path.name} to vector store")
+        return chunks
+    def query(
+        self,
+        question: str,
+        history: List[str] = None,
+        k: int = None,
+        threshold: float = None,
+        include_llm_response: bool = True
+    ):
+        """
+        Unified search method with optional LLM response and threshold filtering.
+        Args:
+            question: User's question
+            history: Conversation history (optional)
+            k: Number of documents to retrieve (defaults to settings)
+            threshold: Similarity threshold filter (defaults to settings)
+            include_llm_response: Whether to generate LLM answer (default: True)
+        Returns:
+            dict with 'answer' (if include_llm_response), 'references', 'context'
+        """
+        if not question or len(question.strip()) == 0:
+            return {
+                "answer": "Please provide a valid question",
+                "references": [],
+                "context": ""
+            }
+        # Use defaults from settings
+        k = k or self.k or settings.similarity_top_k
+        threshold = threshold if threshold is not None else settings.similarity_threshold
+        history = history or []
+        # Normalize and search
+        question = preprocess_query(question)
+        docs = self.database.similarity_search_with_score(query=question, k=k)
+        # print(question)
+        # Get references and context
+        ctx = get_references(docs, threshold=threshold)
+        documents = ctx.get('documents', [])
+        context = ctx.get('context', '')
+        # Filter by threshold
+        filtered_docs = documents
+        # If no docs pass threshold
+        if not filtered_docs:
+            return {
+                "answer": "No relevant documents found matching the similarity threshold",
+                "references": [],
+                "context": "",
+                "threshold_used": threshold
+            }
+        # Generate LLM response if requested
+        if include_llm_response:
+            formatted_history = format_history(history)
+            prompt = self.template.invoke({
+                "history": formatted_history,
+                "question": question,
+                "context": context
+            })
+            response = self.model.invoke(prompt)
+            answer = response.content
+        else:
+            answer = ""  # Just return context without LLM
+            context = ""
+        return {
+            "answer": answer,
+            "references": filtered_docs,
+            "context": context,
+            "threshold_used": threshold,
+            "k_used": k
+        }
+    def hybrid_query(
+        self,
+        question: str,
+        history: List[str] = None,
+        k: int = None,
+        threshold: float = None,
+        include_llm_response: bool = True
+    ):
+        """
+        Unified search method with optional LLM response and threshold filtering.
+        Args:
+            question: User's question
+            history: Conversation history (optional)
+            k: Number of documents to retrieve (defaults to settings)
+            threshold: Similarity threshold filter (defaults to settings)
+            include_llm_response: Whether to generate LLM answer (default: True)
+        Returns:
+            dict with 'answer' (if include_llm_response), 'references', 'context'
+        """
+        if not question or len(question.strip()) == 0:
+            return {
+                "answer": "Please provide a valid question",
+                "references": [],
+                "context": ""
+            }
+        # Use defaults from settings
+        threshold = threshold if threshold is not None else settings.similarity_threshold
+        history = history or []
+        query = question
+        candidate_k = 15   # how many docs vector search fetches (also the BM25 pool size)
+        final_k = k or settings.similarity_top_k        # how many results to return after fusion
+        config = HybridRetrievalConfig(
+            candidate_k=candidate_k,
+            top_k=final_k,
+            bm25_weight=0.45,
+            vector_weight=0.55,
+            rrf_k=20,
+            bm25_k1=1.2,
+            bm25_b=0.9,
+            title_boost_per_word=0.004,
+            score_threshold=threshold,
+        )
+        service = HybridRetrievalService(vector_db=self.db, config=config)
+        docs = service.retrieve(query=query)
+        ctx = get_references_v2(docs, threshold=threshold)
+        documents = ctx.get('documents', [])
+        context = ctx.get('context', 'No context available')
+        filtered_docs = documents
+        print("*"*50)
+        print("context: ", context)
+        print("*"*50)
+        if not filtered_docs:
+            return {
+                "answer": "No relevant documents found matching the similarity threshold",
+                "references": [],
+                "context": "",
+                "threshold_used": threshold
+            }
+        if include_llm_response:
+            formatted_history = format_history(history)
+            prompt = self.template.invoke({
+                "history": formatted_history,
+                "question": question,
+                "context": context
+            })
+            # if settings.local_model_name == "EXAONE-3.5-2.4B-Instruct-Q4_K_M.gguf":
+            #     prompt = wrap_exaone(prompt)
+            response = self.model.invoke(prompt)
+            answer = response.content if hasattr(response, "content") else response
+        else:
+            answer = ""  # Just return context without LLM
+            context = ""
+        return {
+            "answer": answer,
+            "references": filtered_docs,
+            "context": context,
+            "threshold_used": threshold,
+            "k_used": k
+        }
+    def search_docs(
+        self,
+        question: str,
+        k: int = 10,
+        filename: str = None
+    ):
+        """
+        Unified search method with optional LLM response and threshold filtering.
+        Args:
+            question: User's question
+            k: Number of documents to retrieve (defaults to settings)
+        Returns:
+            dict with 'answer' (if include_llm_response), 'references', 'context'
+        """
+        if not question or len(question.strip()) == 0:
+            return {
+                "answer": "Please provide a valid question",
+                "references": [],
+                "context": ""
+            }
+        # Use defaults from settings
+        threshold = settings.similarity_threshold
+        query = question
+        candidate_k = 15   # how many docs vector search fetches (also the BM25 pool size)
+        final_k = k or settings.similarity_top_k        # how many results to return after fusion
+        config = HybridRetrievalConfig(
+            candidate_k=candidate_k,
+            top_k=final_k,
+            bm25_weight=0.7,
+            vector_weight=0.3,
+            rrf_k=20,
+            bm25_k1=1.5,
+            bm25_b=0.75,
+            title_boost_per_word=0.004,
+            score_threshold=threshold,
+        )
+        service = HybridRetrievalService(vector_db=self.db, config=config)
+        docs = service.retrieve(query=query)
+        results = []
+        for doc in docs:
+            results.append({
+                "id": doc.document.id,
+                "content": doc.document.page_content,
+                "metadata": doc.document.metadata,
+                "score": doc.fused_score
+            })
+        if(filename):
+            results = [doc for doc in results if doc["metadata"]["source_file"] == filename]
+        return results
+    def test_queries(self, tests: TestRequestSchema, query_delay: float = 1.0):
+        """
+        query_delay: seconds to wait between queries.
+        Gemini free tier allows 100 embedding RPM → safe delay = 1.0s.
+        For 150 queries: ~2.5 min total.
+        """
+        results = []
+        k = tests.k
+        threshold = tests.threshold
+        for idx, test in enumerate(tests.tests):
+            question = test.question
+            document = test.document
+            chunk_index = test.chunk_index
+            response = self.hybrid_query(
+                question=question,
+                history=[],
+                k=k,
+                threshold=threshold,
+                include_llm_response=False,
+            )
+            # Respect Gemini embedding rate limit (100 RPM free tier)
+            if query_delay > 0 and idx < len(tests.tests) - 1:
+                print(f"[test_queries] {idx + 1}/{len(tests.tests)} done — sleeping {query_delay}s")
+                time.sleep(query_delay)
+            # print("*"*50)
+            # print(response)
+            # print("*"*50)
+            ans_found = False
+            ans = {"tests": test}
+            correct_source_chunks = 0
+            len_all_docs = len(response.get("references", []))
+            rank = None
+            for idx, ref in enumerate(response.get("references", [])):
+                if ref.get("source") == document:
+                    correct_source_chunks += 1
+                if (
+                    ref.get("source") == document
+                    and ref.get("chunk_index") == chunk_index
+                    and rank is None
+                ):
+                    ans_found = True
+                    rank = idx + 1
+            wrong_source_chunks = len_all_docs - correct_source_chunks
+            if len_all_docs > 0:
+                doc_precision = correct_source_chunks / len_all_docs
+                doc_noise = wrong_source_chunks / len_all_docs
+            else:
+                doc_precision = 0
+                doc_noise = 0
+            doc_recall = 1 if correct_source_chunks > 0 else 0
+            doc_error = 1 - doc_recall
+            if rank is not None:
+                mrr = 1 / rank
+            else:
+                mrr = 0
+            ans["answer"] = ans_found
+            ans["correct_source_chunks"] = correct_source_chunks
+            ans["wrong_source_chunks"] = wrong_source_chunks
+            ans["doc_precision"] = doc_precision
+            ans["doc_recall"] = doc_recall
+            ans["doc_error"] = doc_error
+            ans["mrr"] = mrr
+            ans["top_1_hit"] = 1 if rank == 1 else 0
+            ans["doc_noise"] = doc_noise
+            results.append(ans)
+        avg_doc_precision = sum([r["doc_precision"] for r in results]) / len(results)
+        avg_doc_recall = sum([r["doc_recall"] for r in results]) / len(results)
+        avg_mrr = sum([r["mrr"] for r in results]) / len(results)
+        hit_rate = sum([1 for r in results if r["answer"]]) / len(results)
+        top_1_hit_rate = sum([r["top_1_hit"] for r in results]) / len(results)
+        avg_doc_noise = sum([r["doc_noise"] for r in results]) / len(results)
+        error_rate = 1 - hit_rate
+        avg_doc_error = sum([r["doc_error"] for r in results]) / len(results)
+        return {
+            "results": results,
+            "avg_doc_precision": avg_doc_precision,
+            "avg_doc_recall": avg_doc_recall,
+            "avg_mrr": avg_mrr,
+            "hit_rate": hit_rate,
+            "top_1_hit_rate": top_1_hit_rate,
+            "avg_doc_noise": avg_doc_noise,
+            "error_rate": error_rate,
+            "avg_doc_error": avg_doc_error
+        }
+    def test_classifier(self, tests: TestClassifierReqSchema):
+        queries = [test.question for test in tests.tests]
+        result = clf.predict(queries)
+        fields = ["type", "category", "topic", "intent"]
+        evaluation = {}
+        for field in fields:
+            y_true = [getattr(t, field) if getattr(t, field) else "general" for t in tests.tests]
+            y_pred = [r[field] if r[field] else "general" for r in result]
+            evaluation[field] = {
+                "accuracy": accuracy_score(y_true, y_pred),
+                "precision": precision_score(y_true, y_pred, average="macro", zero_division=0),
+                "recall": recall_score(y_true, y_pred, average="macro", zero_division=0),
+                "f1_macro": f1_score(y_true, y_pred, average="macro", zero_division=0),
+                "f1_weighted": f1_score(y_true, y_pred, average="weighted", zero_division=0),
+                "classification_report": classification_report(y_true, y_pred, zero_division=0, output_dict=True)
+            }
+        return {
+            "evaluation": evaluation,
+            "results": result
+        }
+    def delete_database(self):
+        self.database.db.delete_collection()
+    # to close the model on destruction
+    def model_close(self):
+        client = getattr(self.model, "client", None)
+        if not client:
+            return
+        if hasattr(client, "close"):
+            client.close()
+        elif hasattr(client, "aclose"):
+            import asyncio
+            asyncio.run(client.aclose())
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.model_close()

app/services/text_splitter.py ADDED Viewed

	@@ -0,0 +1,266 @@

+from langchain_text_splitters import RecursiveCharacterTextSplitter
+from langchain_core.documents import Document
+from typing import List, Optional, Literal
+from app.core.config import settings
+class TextSplitter:
+    """
+    A service class for splitting documents into smaller chunks using recursive character text splitting.
+    This class provides flexible text splitting capabilities with support for different document types
+    and customizable chunk sizes and overlaps.
+    """
+    def __init__(
+        self,
+        chunk_size: int = None,
+        chunk_overlap: int = None,
+        length_function: callable = len,
+        is_separator_regex: bool = False,
+        separators: Optional[List[str]] = None,
+        keep_separator: bool = True
+    ):
+        """
+        Initialize the TextSplitter with configurable parameters.
+        Args:
+            chunk_size: Maximum size of chunks to return (default: from settings)
+            chunk_overlap: Overlap in characters between chunks (default: from settings)
+            length_function: Function to measure chunk length (default: len)
+            is_separator_regex: Whether separators are regex patterns (default: False)
+            separators: List of separators to split on (default: None, uses default separators)
+            keep_separator: Whether to keep separators in chunks (default: True)
+        """
+        # Use settings as defaults
+        self.chunk_size = chunk_size if chunk_size is not None else settings.chunk_size
+        self.chunk_overlap = chunk_overlap if chunk_overlap is not None else settings.chunk_overlap
+        self.length_function = length_function
+        self.is_separator_regex = is_separator_regex
+        self.keep_separator = keep_separator
+        # Use custom separators if provided, otherwise use default
+        self.separators = separators if separators is not None else [
+            "\n\n",  # Double newline (paragraphs)
+            "\n",    # Single newline
+            " ",     # Space
+            ""       # Character-level split as last resort
+        ]
+        self._initialize_splitter()
+    def _initialize_splitter(self):
+        """Initialize the RecursiveCharacterTextSplitter with current settings."""
+        self.splitter = RecursiveCharacterTextSplitter(
+            chunk_size=self.chunk_size,
+            chunk_overlap=self.chunk_overlap,
+            length_function=self.length_function,
+            is_separator_regex=self.is_separator_regex,
+            separators=self.separators,
+            keep_separator=self.keep_separator
+        )
+    def split_documents(self, documents: List[Document]) -> List[Document]:
+        """
+        Split a list of documents into smaller chunks.
+        Args:
+            documents: List of Document objects to split
+        Returns:
+            List of Document objects representing the chunks
+        """
+        return self.splitter.split_documents(documents)
+    def split_text(self, text: str) -> List[str]:
+        """
+        Split a single text string into smaller chunks.
+        Args:
+            text: Text string to split
+        Returns:
+            List of text chunks
+        """
+        return self.splitter.split_text(text)
+    def create_document(
+            self,
+            text: str,
+            metadata: dict
+    ):
+        return Document(page_content=text, metadata=metadata)
+    def create_documents(
+        self,
+        texts: List[str],
+        metadatas: Optional[List[dict]] = None
+    ) -> List[Document]:
+        """
+        Create Document objects from texts and split them into chunks.
+        Args:
+            texts: List of text strings to convert to documents
+            metadatas: Optional list of metadata dictionaries for each text
+        Returns:
+            List of Document objects representing the chunks
+        """
+        return self.splitter.create_documents(texts, metadatas)
+    def update_settings(
+        self,
+        chunk_size: Optional[int] = None,
+        chunk_overlap: Optional[int] = None,
+        separators: Optional[List[str]] = None
+    ):
+        """
+        Update splitter settings and reinitialize.
+        Args:
+            chunk_size: New chunk size (optional)
+            chunk_overlap: New chunk overlap (optional)
+            separators: New separators list (optional)
+        """
+        if chunk_size is not None:
+            self.chunk_size = chunk_size
+        if chunk_overlap is not None:
+            self.chunk_overlap = chunk_overlap
+        if separators is not None:
+            self.separators = separators
+        self._initialize_splitter()
+    @classmethod
+    def from_language(
+        cls,
+        language: Literal[
+            "cpp", "go", "java", "kotlin", "js", "ts", "php", "proto",
+            "python", "rst", "ruby", "rust", "scala", "swift", "markdown",
+            "latex", "html", "sol", "csharp", "cobol", "c", "lua", "perl"
+        ],
+        chunk_size: int = None,
+        chunk_overlap: int = None
+    ) -> 'TextSplitter':
+        """
+        Create a TextSplitter optimized for a specific programming language or format.
+        Args:
+            language: Programming language or format type
+            chunk_size: Maximum size of chunks to return (default: from settings)
+            chunk_overlap: Overlap in characters between chunks (default: from settings)
+        Returns:
+            TextSplitter instance configured for the specified language
+        """
+        # Use settings as defaults
+        chunk_size = chunk_size if chunk_size is not None else settings.chunk_size
+        chunk_overlap = chunk_overlap if chunk_overlap is not None else settings.chunk_overlap
+        splitter = RecursiveCharacterTextSplitter.from_language(
+            language=language,
+            chunk_size=chunk_size,
+            chunk_overlap=chunk_overlap
+        )
+        instance = cls(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+        instance.splitter = splitter
+        return instance
+    @classmethod
+    def for_markdown(cls, chunk_size: int = None, chunk_overlap: int = None) -> 'TextSplitter':
+        """
+        Create a TextSplitter optimized for Markdown documents.
+        Args:
+            chunk_size: Maximum size of chunks to return (default: from settings)
+            chunk_overlap: Overlap in characters between chunks (default: from settings)
+        Returns:
+            TextSplitter instance configured for Markdown
+        """
+        return cls.from_language("markdown", chunk_size, chunk_overlap)
+    @classmethod
+    def for_code(
+        cls,
+        language: str = "python",
+        chunk_size: int = None,
+        chunk_overlap: int = None
+    ) -> 'TextSplitter':
+        """
+        Create a TextSplitter optimized for code documents.
+        Args:
+            language: Programming language (default: "python")
+            chunk_size: Maximum size of chunks to return (default: from settings)
+            chunk_overlap: Overlap in characters between chunks (default: from settings)
+        Returns:
+            TextSplitter instance configured for code
+        """
+        return cls.from_language(language, chunk_size, chunk_overlap)
+    @classmethod
+    def for_markdown_with_sections(
+        cls,
+        chunk_size: int = None,
+        chunk_overlap: int = None
+    ) -> 'TextSplitter':
+        """
+        Create a TextSplitter optimized for Markdown with section delimiters (---).
+        This splitter is designed for markdown files that use '---' as section separators
+        (common in frontmatter/multi-section documents). It prioritizes keeping sections
+        together and prevents splitting on headers, which reduces the number of small chunks.
+        Args:
+            chunk_size: Maximum size of chunks to return (default: from settings)
+            chunk_overlap: Overlap in characters between chunks (default: from settings)
+        Returns:
+            TextSplitter instance with custom separators for sectioned markdown
+        """
+        # Use settings as defaults
+        chunk_size = chunk_size if chunk_size is not None else settings.chunk_size
+        chunk_overlap = chunk_overlap if chunk_overlap is not None else settings.chunk_overlap
+        # Custom separators that respect section boundaries
+        # Priority: sections -> paragraphs -> sentences -> words -> characters
+        custom_separators = [
+            "---",      # Section delimiter with newlines
+            ". ",           # Sentences
+            " ",            # Words
+        ]
+        return cls(
+            chunk_size=chunk_size,
+            chunk_overlap=chunk_overlap,
+            separators=custom_separators,
+            keep_separator=True  # Keep separators to maintain structure
+        )
+    def get_chunk_info(self, documents: List[Document]) -> dict:
+        """
+        Get information about how documents will be split.
+        Args:
+            documents: List of documents to analyze
+        Returns:
+            Dictionary containing chunk statistics
+        """
+        chunks = self.split_documents(documents)
+        chunk_sizes = [len(chunk.page_content) for chunk in chunks]
+        return {
+            "total_documents": len(documents),
+            "total_chunks": len(chunks),
+            "average_chunk_size": sum(chunk_sizes) / len(chunk_sizes) if chunk_sizes else 0,
+            "min_chunk_size": min(chunk_sizes) if chunk_sizes else 0,
+            "max_chunk_size": max(chunk_sizes) if chunk_sizes else 0,
+            "configured_chunk_size": self.chunk_size,
+            "configured_overlap": self.chunk_overlap
+        }
+# Module-level TextSplitter created at import time with all defaults
+# (chunk size and overlap are read from settings).
+text_splitter = TextSplitter()

app/services/vector_store.py ADDED Viewed

	@@ -0,0 +1,67 @@

+from langchain_core.documents import Document
+from typing import List, Optional
+import json
+class VectorStore:
+  def __init__(self, db):
+    self.db = db
+  def get(self):
+    return self.db.get()
+  def get_by_id(self, ids: list[str]):
+    return self.db.get(ids=ids)
+  def get_dict(self):
+    data = self.db.get()
+    rows = [
+      {
+        "id": id_,
+        "document": doc,
+        "metadata": meta,
+      }
+      for id_, doc, meta in zip(
+        data["ids"],
+        data["documents"],
+        data["metadatas"],
+      )
+    ]
+    print(type(rows))
+    return json.dumps(rows)
+  def similarity_search(self, query: str, filter: Optional[dict[str, str]] = None, k: Optional[int] = 5):
+    results = []
+    if(filter):
+      results = self.db.similarity_search(query, filter, k)
+    else:
+      results = self.db.similarity_search(query, k)
+    return results
+  def similarity_search_with_score(self, query: str, filter: Optional[dict[str, str]] = None, k: Optional[int] = 5):
+    results = []
+    if(filter):
+      results = self.db.similarity_search_with_score(query, filter, k)
+    else:
+      results = self.db.similarity_search_with_score(query, k)
+    return results
+  def add_documents(self, docs: List[Document], ids: Optional[List] = None):
+    result = []
+    final_docs = [doc for doc in docs if doc.page_content.strip()]
+    if(ids is not None):
+      result = self.db.add_documents(final_docs,ids)
+    else:
+      result = self.db.add_documents(final_docs)
+    return result
+  def update_document(self, document_id: str, document: Document):
+    # safest + guaranteed re-embedding
+    self.db.delete(ids=[document_id])
+    return self.db.add_documents([document], ids=[document_id])
+  def delete(self, ids: List):
+    self.db.delete(ids = ids)
+    return True

app/utils/__init__.py ADDED Viewed

	@@ -0,0 +1,5 @@

+# from .preprocessing import preprocess, normalize, preprocess_documents, preprocess_query
+# from .constants import stopwords
+# from .document_helpers import get_references, create_document, create_documents, build_metadata, clean_metadata,load_json, read_json_file, get_references_v2
+# from .llm_models import load_model
+# from .model_factory import get_embedding_model, get_llm_model, get_local_model, get_gemini_model

app/utils/constants.py ADDED Viewed

	@@ -0,0 +1,264 @@

+# Abbreviation -> full-name expansions for department and degree short forms.
+# NOTE(review): keys are case-sensitive as written ("IT", "dept", "MSc"); how lookups
+# normalise case is decided by the caller - confirm there.
+short_words_mappings = {
+    "IT": "Information Technology",
+    "BT": "Biotechnology",
+    "ECE": "Electronics and Communication Engineering",
+    "CE": "Computer Engineering",
+    "dept": "Department",
+    "ICT": "Information and Communication Technology",
+    "DS": "Data Science",
+    "CS": "Computer Science",
+    "CSE": "Computer Science and Engineering",
+    "MCA": "Master of Computer Application",
+    "MSc": "Master of Science",
+}
+# Stopword list kept as one newline-separated string (one word per line, with a
+# leading and trailing newline); consumers have to split it themselves.
+stopwords = """
+a
+an
+the
+but
+if
+then
+else
+because
+so
+of
+to
+from
+in
+on
+at
+by
+for
+with
+about
+into
+over
+under
+between
+after
+before
+during
+through
+above
+below
+up
+down
+out
+off
+again
+further
+once
+only
+some
+any
+each
+few
+more
+most
+other
+such
+very
+"""
+# stopwords = """
+# a
+# about
+# above
+# after
+# again
+# against
+# ain
+# all
+# am
+# an
+# and
+# any
+# are
+# aren
+# aren't
+# as
+# at
+# be
+# because
+# been
+# before
+# being
+# below
+# between
+# both
+# but
+# by
+# can
+# couldn
+# couldn't
+# d
+# did
+# didn
+# didn't
+# do
+# does
+# doesn
+# doesn't
+# doing
+# don
+# don't
+# down
+# during
+# each
+# few
+# for
+# from
+# further
+# had
+# hadn
+# hadn't
+# has
+# hasn
+# hasn't
+# have
+# haven
+# haven't
+# having
+# he
+# he'd
+# he'll
+# he's
+# her
+# here
+# hers
+# herself
+# him
+# himself
+# his
+# how
+# i
+# i'd
+# i'll
+# i'm
+# i've
+# if
+# in
+# into
+# is
+# isn
+# isn't
+# it
+# it'd
+# it'll
+# it's
+# its
+# itself
+# just
+# ll
+# m
+# ma
+# me
+# mightn
+# mightn't
+# more
+# most
+# mustn
+# mustn't
+# my
+# myself
+# needn
+# needn't
+# no
+# nor
+# not
+# now
+# o
+# of
+# off
+# on
+# once
+# only
+# or
+# other
+# our
+# ours
+# ourselves
+# out
+# over
+# own
+# re
+# s
+# same
+# shan
+# shan't
+# she
+# she'd
+# she'll
+# she's
+# should
+# should've
+# shouldn
+# shouldn't
+# so
+# some
+# such
+# t
+# than
+# that
+# that'll
+# the
+# their
+# theirs
+# them
+# themselves
+# then
+# there
+# these
+# they
+# they'd
+# they'll
+# they're
+# they've
+# this
+# those
+# through
+# to
+# too
+# under
+# until
+# up
+# ve
+# very
+# was
+# wasn
+# wasn't
+# we
+# we'd
+# we'll
+# we're
+# we've
+# were
+# weren
+# weren't
+# what
+# when
+# where
+# which
+# while
+# who
+# whom
+# why
+# will
+# with
+# won
+# won't
+# wouldn
+# wouldn't
+# y
+# you
+# you'd
+# you'll
+# you're
+# you've
+# your
+# yours
+# yourself
+# yourselves
+# """

app/utils/document_helpers.py ADDED Viewed

	@@ -0,0 +1,231 @@

+from langchain_core.documents import Document
+from pathlib import Path
+from typing import Optional, List
+from datetime import datetime, date
+import uuid
+import yaml
+from app.services.text_splitter import TextSplitter
+import json
+# Allowed types for metadata cleaning: values of these types are kept unchanged by
+# clean_metadata(); any other value is converted with str() (dates via isoformat()).
+ALLOWED = (str, int, float, bool, list, type(None))
+def get_references_v2(docs, threshold: float):
+    results = []
+    context = ""
+    for doc in docs:
+        _doc = doc.document
+        _similarity = doc.fused_score
+        # print(_similarity, threshold)
+        if _similarity < threshold:
+            continue
+        metadata = _doc.metadata
+        document = {
+            "title": metadata.get("title", metadata.get("name", metadata.get("topic", "untitled"))),
+            "chunk_index": metadata.get("chunk_index"),
+            "source": metadata.get("source_file", metadata.get("source", "untitled")),
+            "page_content": _doc.page_content,
+            "similarity": _similarity
+        }
+        ctx = f"""{document['title']} page_content: {document['page_content']},  from source: {document['source']}.\n\n"""
+        context += ctx
+        results.append(document)
+    return {
+        "documents": results,
+        "context": context
+    }
+def get_references(docs, threshold: float):
+    results = []
+    context = ""
+    for doc in docs:
+        _doc = doc[0]
+        _similarity = 1 - doc[1]
+        if _similarity < threshold:
+            continue
+        metadata = _doc.metadata
+        document = {
+            "title": metadata.get("title", metadata.get("name", metadata.get("topic", "untitled"))),
+            "chunk_index": metadata.get("chunk_index"),
+            "source": metadata.get("source_file", metadata.get("source", "untitled")),
+            "page_content": _doc.page_content,
+            "similarity": _similarity
+        }
+        ctx = f"""
+        page_content: {document['page_content']},  from source: {document['source']}.
+        """
+        context += ctx
+        results.append(document)
+    return {
+        "documents": results,
+        "context": context
+    }
def create_documents(
    chunks: List[str],
    filePath: Optional[Path] = None,
    built_in_metadata: Optional[dict] = None,
    title: Optional[str] = None
) -> List[Document]:
    """
    Create Document objects from text chunks with standard metadata (UUIDs, timestamps, indices).
    Works for both files (filePath provided) and raw text (filePath=None).

    Args:
        chunks: Text chunks, one Document is produced per chunk.
        filePath: Optional file the chunks came from. When it exists, its
            timestamps, name and stem supply the dates, source and default title.
        built_in_metadata: Optional extra metadata merged over the generated
            fields. ``None`` is treated as an empty dict.
        title: Optional explicit title.
    Returns:
        List[Document]: Documents in chunk order, each with ``chunk_index`` set
        to the chunk's position.
    """
    # Work on a private copy: avoids a shared mutable default and accepts None.
    extra_metadata = dict(built_in_metadata) if built_in_metadata else {}

    if filePath and filePath.exists():
        file_stat = filePath.stat()
        created_date = datetime.fromtimestamp(file_stat.st_ctime).isoformat()
        modified_date = datetime.fromtimestamp(file_stat.st_mtime).isoformat()
        source = filePath.name
        given_title = title or filePath.stem
    else:
        now = datetime.now().isoformat()
        created_date = now
        modified_date = now
        # Use existing source from metadata if available, else the file name, else empty
        source = extra_metadata.get("source", "")
        if not source and filePath:
            source = filePath.name
        given_title = title or extra_metadata.get("title", "Untitled")

    docs = []
    for i, chunk in enumerate(chunks):
        metadata = {
            "doc_id": str(uuid.uuid4()),   # unique chunk id unless overridden below
            "source": source,
            "title": given_title,
            "created_date": created_date,
            "modified_date": modified_date,
            "chunk_index": i,
        }
        # Caller-supplied metadata takes precedence over the generated fields.
        # NOTE(review): a "doc_id" in built_in_metadata is therefore shared by
        # every chunk — confirm that is intended.
        metadata.update(extra_metadata)
        # chunk_index always reflects the position in this call.
        metadata["chunk_index"] = i
        docs.append(Document(page_content=chunk, metadata=metadata))
    return docs
def create_document(
    text: str,
    metadata: dict
):
    """Wrap ``text`` and ``metadata`` in a single Document, passing both through as given."""
    document = Document(page_content=text, metadata=metadata)
    return document
def clean_metadata(metadata: dict):
    """
    Return a copy of ``metadata`` whose values are safe to store.

    datetime/date values become ISO-8601 strings, values that are instances of
    ``ALLOWED`` are kept unchanged, and anything else is converted with ``str``.
    """
    def _coerce(value):
        # Dates are checked first so they are always serialized as ISO strings.
        if isinstance(value, (datetime, date)):
            return value.isoformat()
        return value if isinstance(value, ALLOWED) else str(value)

    return {key: _coerce(value) for key, value in metadata.items()}
def read_text_file(filePath: Path):
    """Read the whole file at ``filePath`` as UTF-8 text and return it."""
    with open(filePath, "r", encoding="utf-8") as handle:
        return handle.read()
def read_json_file(filePath: Path):
    """
    Parse the JSON file at ``filePath`` and return the decoded data.

    The file is opened explicitly as UTF-8 so decoding does not depend on the
    platform's default encoding (and matches ``read_text_file``).
    """
    with open(filePath, "r", encoding="utf-8") as file:
        return json.load(file)
def build_metadata(filePath: Optional[Path] = None, content: Optional[str] = None):
    """
    Split text into YAML frontmatter metadata and body content.

    The text is read from ``filePath`` when given, otherwise ``content`` is
    used. If the text contains at least two ``---`` markers and the span
    between the first two parses to a YAML mapping (or to nothing), that span
    is returned as cleaned metadata and the text after the second marker as
    content. Otherwise the whole text is returned as content.

    Returns:
        dict: ``{"metadata": dict, "content": str}``. With a file, metadata
        ``"source"`` is always the file name.
    Raises:
        ValueError: if neither ``filePath`` nor ``content`` is provided.
    """
    if filePath:
        content = read_text_file(filePath)
    if content is None:
        raise ValueError("Either filePath or content must be provided")

    parts = content.split("---", 2)
    if len(parts) >= 3:
        frontmatter = yaml.safe_load(parts[1]) or {}
        # "---" is also used as a section delimiter, so the span between the
        # first two markers may be ordinary text. Only a mapping is metadata;
        # anything else falls through and the text is kept whole.
        # NOTE(review): text before the first "---" is discarded when the span
        # is a mapping — confirm frontmatter is always at the top.
        if isinstance(frontmatter, dict):
            frontmatter = clean_metadata(frontmatter)
            # add file name as source always
            if filePath:
                frontmatter["source"] = filePath.name
            elif "source" not in frontmatter:
                frontmatter["source"] = ""
            return {
                "metadata": frontmatter,
                "content": parts[2].strip()
            }

    # Don't enforce empty source if not provided, allows external metadata to stick
    meta = {}
    if filePath:
        meta["source"] = filePath.name
    return {
        "metadata": meta,
        "content": content.strip()
    }
def create_documents_from_text(text: str, metadata: Optional[dict] = None):
    """
    Create documents from raw text with automatic splitting and metadata enrichment.

    Frontmatter found in the text is merged with the caller's ``metadata``:
    caller keys override frontmatter keys, except ``"source"``, which is only
    taken from the caller when the text itself did not provide a non-empty one.

    Args:
        text: Raw text, optionally starting with YAML frontmatter.
        metadata: Optional extra metadata; ``None`` is treated as empty.
    Returns:
        The documents produced by ``create_documents`` for the split chunks.
    """
    metadata = metadata or {}
    text = text.strip()
    data = build_metadata(content=text)

    # 1. Smart Metadata Merge
    final_metadata = data["metadata"].copy()
    # Text without frontmatter yields no "source" key at all, so test for a
    # missing/empty value rather than for "" only; otherwise the caller's
    # source would be dropped.
    if not final_metadata.get("source") and metadata.get("source"):
        final_metadata["source"] = metadata["source"]
    # Merge regular keys
    final_metadata.update({k: v for k, v in metadata.items() if k != "source"})
    text = data["content"]

    # 2. Split text into chunks (strings)
    # Use section-aware splitter if text contains markdown section delimiters
    if "\n---\n" in text or text.startswith("---\n"):
        splitter = TextSplitter.for_markdown_with_sections()
    else:
        splitter = TextSplitter()
    chunks = splitter.split_text(text)

    # 3. Create documents using standard helper (adds IDs, indices, dates)
    return create_documents(
        chunks=chunks,
        filePath=None,
        built_in_metadata=final_metadata
    )
def load_json(filePath: Path):
    """
    Load a JSON knowledge file and turn each content entry into Documents.

    The JSON must provide ``id``, ``source`` and a ``content`` mapping of
    topic -> text. Each text is split into chunks; blank chunks are skipped
    and every remaining chunk is stored as ``"<topic>: <chunk>"`` with the
    shared file metadata plus its ``topic`` and ``chunk_index``.
    """
    data = read_json_file(filePath=filePath)
    filePath = Path(filePath)
    shared_metadata = {
        "id": data["id"],
        "title": data.get("name", data.get("title", "Untitled")),
        "source": data["source"],
        "source_file": filePath.name or "Untitled",
        "created_date": datetime.now().isoformat()
    }
    splitter = TextSplitter()
    docs = []
    for topic, body in data["content"].items():
        for position, raw_chunk in enumerate(splitter.split_text(body.strip())):
            piece = raw_chunk.strip()
            if not piece:
                # chunk_index keeps the splitter's position, so skipped blanks leave gaps
                continue
            docs.append(
                Document(
                    page_content=f"{topic}: {piece}",
                    metadata={**shared_metadata, "topic": topic, "chunk_index": position},
                )
            )
    return docs

app/utils/embeddings.py ADDED Viewed

	@@ -0,0 +1,11 @@

+from langchain_huggingface import HuggingFaceEmbeddings
+from langchain_google_genai import GoogleGenerativeAIEmbeddings
+from dotenv import load_dotenv
+load_dotenv()
def get_embedding_model():
    """Return a GoogleGenerativeAIEmbeddings client for ``models/gemini-embedding-001``."""
    return GoogleGenerativeAIEmbeddings(model="models/gemini-embedding-001")

app/utils/llm_models.py ADDED Viewed

	@@ -0,0 +1,20 @@

+from pathlib import Path
+from langchain_community.llms import LlamaCpp
+from langchain_community.chat_models import ChatLlamaCpp
+from app.core.config import settings
+model_file = Path(settings.model_path) / settings.local_model_name
def load_model():
    """Build a ChatLlamaCpp instance for the module-level ``model_file`` with fixed settings."""
    llama_options = dict(
        model_path=str(model_file),  # Direct path
        n_ctx=8192,
        n_batch=512,
        n_threads=4,
        temperature=0.05,
        top_p=0.8,
        top_k=20,
        repeat_penalty=1.1,
        f16_kv=True,
        verbose=False,
    )
    return ChatLlamaCpp(**llama_options)

app/utils/model_factory.py ADDED Viewed

	@@ -0,0 +1,164 @@

+"""
+Model factory for creating LLM and embedding models.
+Handles model switching and fallback logic.
+"""
+from typing import Optional
+from pathlib import Path
+from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
+from langchain_community.chat_models import ChatLlamaCpp
+from app.core.config import settings
+import logging
+logger = logging.getLogger(__name__)
def get_embedding_model():
    """
    Build the embedding model configured in settings (Google Generative AI).

    Returns:
        GoogleGenerativeAIEmbeddings: Embedding model instance
    Raises:
        Exception: any construction error is logged and re-raised unchanged.
    """
    try:
        model_name = settings.embedding_model_name
        embedding_client = GoogleGenerativeAIEmbeddings(
            model=model_name,
            google_api_key=settings.google_api_key,
        )
        logger.info(f"Loaded embedding model: {model_name}")
        return embedding_client
    except Exception as e:
        logger.error(f"Failed to load embedding model: {e}")
        raise
def get_gemini_model():
    """
    Build the Google Gemini chat model configured in settings.

    Returns:
        ChatGoogleGenerativeAI: Gemini model instance
    Raises:
        Exception: any construction error is logged and re-raised unchanged.
    """
    try:
        model_name = settings.gemini_model_name
        chat_model = ChatGoogleGenerativeAI(
            model=model_name,
            google_api_key=settings.google_api_key,
        )
        logger.info(f"Loaded Gemini model: {model_name}")
        return chat_model
    except Exception as e:
        logger.error(f"Failed to load Gemini model: {e}")
        raise
def get_local_model():
    """
    Get the local chat model served through llama.cpp (ChatLlamaCpp).

    The model file is ``settings.model_path / settings.local_model_name``.

    Returns:
        ChatLlamaCpp: Local model instance
    Raises:
        FileNotFoundError: If the configured model file does not exist.
        Exception: Any other construction error; every failure is logged and
            re-raised unchanged.
    """
    try:
        # Path() keeps this working whether settings.model_path is a str or a
        # Path (llm_models.py wraps it the same way).
        model_file = Path(settings.model_path) / settings.local_model_name
        if not model_file.exists():
            raise FileNotFoundError(
                f"Model file not found: {model_file}\n"
                f"Please download it to {settings.model_path}/"
            )
        model = ChatLlamaCpp(
            model_path=str(model_file),
            n_ctx=8096,           # Context window size in tokens
            n_batch=512,          # Batch size for prompt processing
            n_threads=4,          # Number of CPU threads
            max_tokens=settings.local_max_tokens,  # Maximum tokens to generate
            temperature=0.1,      # Focused output, less randomness
            top_p=0.9,
            top_k=30,
            repeat_penalty=1.05,
            f16_kv=True,          # Half-precision KV cache
            f16=True,
            verbose=True,
            chat_format="chatml",
            numa=False,
            use_mlock=False,      # Do not lock the model in RAM
            use_mmap=True,        # Memory-map the model file
        )
        logger.info(f"Loaded local model: {settings.local_model_name}")
        return model
    except Exception as e:
        logger.error(f"Failed to load local model: {e}")
        raise
def get_llm_model(provider: Optional[str] = None):
    """
    Get LLM model based on configuration with fallback support.

    Args:
        provider: Override the default provider ("gemini" or "local")
                 If None, uses settings.llm_provider
    Returns:
        LLM model instance (Gemini or Local). When the requested provider
        fails and ``settings.enable_fallback`` is true, the other provider's
        model is returned instead.
    Raises:
        ValueError: If the provider is neither "gemini" nor "local".
        Exception: The primary loader's error when fallback is disabled, or
            the fallback loader's error when that fails too.
    """
    provider = provider or settings.llm_provider
    if provider not in ("gemini", "local"):
        raise ValueError(f"Unknown provider: {provider}. Use 'gemini' or 'local'")

    if provider == "gemini":
        load_primary, load_fallback = get_gemini_model, get_local_model
        primary_label, fallback_label = "Gemini", "local"
    else:
        load_primary, load_fallback = get_local_model, get_gemini_model
        primary_label, fallback_label = "Local", "Gemini"

    # Logged (not printed) and worded as a selection: nothing is loaded yet.
    logger.info("Selected LLM provider: %s", provider)
    try:
        return load_primary()
    except Exception as e:
        logger.warning(f"{primary_label} model failed: {e}")
        if settings.enable_fallback:
            logger.info(f"Falling back to {fallback_label} model...")
            return load_fallback()
        raise

app/utils/preprocessing.py ADDED Viewed

	@@ -0,0 +1,107 @@

+from typing import List
+from .constants import stopwords, short_words_mappings
+from langchain_core.documents import Document
+from app.utils.model_factory import get_local_model
+from nltk.stem import PorterStemmer
+import spacy
+from pathlib import Path
+import re
+nlp = spacy.load('en_core_web_sm')
def lowercase(text: str):
    """
    Return ``text`` with leading and trailing whitespace removed.

    Despite its name, this function does not change letter case.
    """
    trimmed = text.strip()
    return trimmed
def tokenization(text: str):
    """
    Split ``text`` on single space characters after passing it through ``lowercase``.

    Returns an empty list for None or an empty string. Consecutive spaces
    produce empty-string tokens.
    """
    has_text = text is not None and len(text) != 0
    return lowercase(text).split(" ") if has_text else []
def stop_words_removal(text: str, short_words_mapping: bool = False):
    """
    Reduce ``text`` to a list of lowercase lemmas of its content words.

    Only tokens tagged NOUN, PROPN, VERB, NUM or ADJ are kept, and lemmas
    listed in ``stopwords`` are dropped.

    Args:
        text: Input text; empty or None yields an empty list.
        short_words_mapping: When true, a token whose lowercase form is a key
            of ``short_words_mappings`` is replaced by its expansion before
            lemmatization.
    Returns:
        list[str]: Lemmas in the order they occur.
    """
    if not text:
        return []
    kept_pos = {"NOUN", "PROPN", "VERB", "NUM", "ADJ"}
    results = []
    for token in nlp(text):
        if token.is_space or token.pos_ not in kept_pos:
            continue
        word = token.text.lower()
        if short_words_mapping and word in short_words_mappings:
            # An expansion can be several words: lemmatize all of them rather
            # than only the first token (which also failed on an empty expansion).
            lemmas = [
                expanded.lemma_
                for expanded in nlp(short_words_mappings[word])
                if not expanded.is_space
            ]
        else:
            lemmas = [token.lemma_]
        for lemma in lemmas:
            lemma = lemma.strip().lower()
            if lemma and lemma not in stopwords:
                results.append(lemma)
    return results
def space_removal(words: List[str]):
    """Strip every word and return the non-empty results, preserving order."""
    trimmed_words = (word.strip() for word in words)
    return [word for word in trimmed_words if word != ""]
def preprocess(text: str, short_words_mapping: bool = False) -> str:
    """
    Return the lemmas produced by ``stop_words_removal`` joined by single spaces.

    Raises:
        ValueError: if ``text`` is None or empty.
    """
    if text is None or len(text) == 0:
        raise ValueError("Text cannot be empty")
    return " ".join(stop_words_removal(text, short_words_mapping))
def normalize(text: str) -> str:
    """
    Tokenize ``text`` on spaces, drop blank tokens and re-join with single spaces.

    Raises:
        ValueError: if ``text`` is None or empty.
    """
    if text is None or len(text) == 0:
        raise ValueError("Text cannot be empty")
    return " ".join(space_removal(tokenization(text)))
def preprocess_document(doc: Document):
    """Replace ``doc.page_content`` in place with its preprocessed form; empty content is left untouched."""
    if doc.page_content != "":
        doc.page_content = preprocess(doc.page_content)
def preprocess_documents(docs: List[Document]):
    """Preprocess every document of ``docs`` in place, in order."""
    for document in docs:
        preprocess_document(document)
def preprocess_query(query: str) -> str:
    """
    Validate a search query and return it in normalized form.

    Raises:
        ValueError: if the query is None, empty, or only whitespace.
    """
    if query is None or not query.strip():
        raise ValueError("Query cannot be empty")
    # An LLM-based query rewrite using the local model was previously sketched
    # here as commented-out code; only ``normalize`` is applied.
    normalized_query = normalize(query)
    return normalized_query
def preprocess_filename(filePath: Path) -> str:
    """
    Build a safe, lowercase file name from the final component of ``filePath``.

    The stem and the extension are each reduced to ASCII letters, digits,
    ``_`` and ``-``. A stem that becomes empty is replaced by ``"file"``; an
    extension that becomes empty is omitted.

    Args:
        filePath: Path (or path string) whose last component is sanitized.
    Returns:
        str: ``"<safe_stem>.<safe_ext>"``, or just ``"<safe_stem>"`` when there
        is no usable extension.
    """
    # Path() lets plain strings through; .name drops any directory part.
    file_name = Path(filePath).name
    name = Path(file_name).stem
    ext = Path(file_name).suffix.lower()

    # Remove special characters but keep letters, numbers, _ and -
    safe_name = re.sub(r'[^a-zA-Z0-9_-]', '', name).lower()
    # Fallback if name becomes empty (e.g. "!!!.pdf")
    if not safe_name:
        safe_name = "file"

    # The extension gets the same filtering, so spaces or shell characters
    # cannot survive in it.
    safe_ext = re.sub(r'[^a-z0-9_-]', '', ext)
    return f"{safe_name}.{safe_ext}" if safe_ext else safe_name

app/utils/tests.py ADDED Viewed

The diff for this file is too large to render. See raw diff