Athul Nambiar committed on
Commit a61a47b · 1 Parent(s): a0bff09

Deploy from QUADRANT_RAG

Files changed (9)
  1. .gitattributes +0 -35
  2. .gitignore +1 -0
  3. .streamlit/config.toml +15 -0
  4. README.md +107 -10
  5. app.py +768 -0
  6. data/docs.json +393 -0
  7. rag_core.py +685 -0
  8. requirements.txt +7 -0
  9. scripts/push_to_hf.sh +65 -0
.gitattributes DELETED
@@ -1,35 +0,0 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1 @@
1
+ /.venv
.streamlit/config.toml ADDED
@@ -0,0 +1,15 @@
1
+ [server]
2
+ port = 7860
3
+ address = "0.0.0.0"
4
+ headless = true
5
+ enableCORS = false
6
+ enableXsrfProtection = false
7
+
8
+ [theme]
9
+ primaryColor = "#2196f3"
10
+ backgroundColor = "#ffffff"
11
+ secondaryBackgroundColor = "#f0f2f6"
12
+ textColor = "#262730"
13
+
14
+ [browser]
15
+ gatherUsageStats = false
README.md CHANGED
@@ -1,10 +1,107 @@
1
- ---
2
- title: Pyqsprag
3
- emoji: 📚
4
- colorFrom: green
5
- colorTo: indigo
6
- sdk: docker
7
- pinned: false
8
- ---
9
-
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
1
+ # 🩺 QUADRANT RAG - Medical AI Assistant
2
+
3
+ A sophisticated Retrieval-Augmented Generation (RAG) system optimized for medical education, specifically designed for MRCS, USMLE, and NEET-PG exam preparation.
4
+
5
+ ## 🚀 Features
6
+
7
+ - **🌐 Cloud-Native**: Uses Qdrant Cloud for scalable vector storage
8
+ - **🧠 Advanced AI**: Powered by OpenAI GPT-4o-mini for medical responses
9
+ - **📚 Medical-Optimized**: Specialized for Harrison's Principles and medical textbooks
10
+ - **🔍 Semantic Search**: Advanced embedding-based document retrieval
11
+ - **📖 Citation System**: Proper source attribution with page references
12
+ - **💊 Clinical Pearls**: Highlights crucial practical medical insights
13
+
14
+ ## 🛠️ Technology Stack
15
+
16
+ - **Frontend**: Streamlit (optimized for HuggingFace Spaces)
17
+ - **Vector Database**: Qdrant Cloud
18
+ - **LLM**: OpenAI GPT-4o-mini
19
+ - **Embeddings**: sentence-transformers/all-MiniLM-L6-v2 (a minimal usage sketch follows this list)
20
+ - **PDF Processing**: pypdf with medical text optimization
21
+
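As a quick illustration of the embedding step named above, here is a minimal sketch; the sample sentences are placeholders, and in the app the inputs are the chunk texts produced by `rag_core.create_chunks`:

```python
# Minimal embedding sketch; the model name is the one listed in the stack above.
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
vectors = model.encode([
    "fever of unknown origin",        # placeholder chunk text
    "anion gap metabolic acidosis",   # placeholder chunk text
])
print(vectors.shape)  # (2, 384): all-MiniLM-L6-v2 yields 384-dimensional vectors
```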
22
+ ## 🔧 Setup
23
+
24
+ ### Environment Variables
25
+
26
+ Create a `.env` file with your credentials:
27
+
28
+ ```env
29
+ # OpenAI Configuration
30
+ OPENAI_API_KEY=your-openai-api-key-here
31
+
32
+ # Qdrant Cloud Configuration
33
+ QDRANT_URL=https://your-cluster-url.qdrant.tech
34
+ QDRANT_API_KEY=your-qdrant-api-key-here
35
+ QDRANT_COLLECTION_NAME=documents
36
+
37
+ # Application Configuration
38
+ USE_MEMORY_DB=false
39
+ STREAMLIT_SERVER_PORT=7860
40
+ STREAMLIT_SERVER_ADDRESS=0.0.0.0
41
+ ```
42
+
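`app.py` reads these values with python-dotenv at startup and falls back to in-memory storage when the Qdrant credentials are absent; a minimal check, assuming the `.env` above:

```python
# Mirrors the startup checks in app.py's init_rag_system().
import os
from dotenv import load_dotenv

load_dotenv()  # reads .env from the working directory
assert os.environ.get("OPENAI_API_KEY"), "OPENAI_API_KEY is required"
print("Qdrant URL:", os.environ.get("QDRANT_URL") or "(in-memory fallback)")
```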
43
+ ### Local Development
44
+
45
+ ```bash
46
+ pip install -r requirements.txt
47
+ streamlit run app.py
48
+ ```
49
+
50
+ ### HuggingFace Spaces Deployment
51
+
52
+ 1. Upload your repository to HuggingFace Spaces
53
+ 2. Set the environment variables in your Space settings
54
+ 3. The app will automatically deploy with Streamlit (one way to script the upload in step 1 is sketched below)
55
+
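The upload in step 1 can also be scripted with the `huggingface_hub` client; a hedged sketch (the Space id is a placeholder, and the repository additionally ships `scripts/push_to_hf.sh` for this purpose):

```python
# Sketch only: pushes the working directory to a HuggingFace Space.
from huggingface_hub import HfApi

api = HfApi()  # picks up the token from `huggingface-cli login` or HF_TOKEN
api.upload_folder(
    folder_path=".",                   # this repository
    repo_id="your-username/pyqsprag",  # placeholder Space id
    repo_type="space",
)
```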
56
+ ## 📖 Usage
57
+
58
+ 1. **Upload Documents**: Use the sidebar to upload medical PDF documents
59
+ 2. **Select Active Document**: Click on a document to make it active for querying
60
+ 3. **Ask Medical Questions**: Type natural language medical queries
61
+ 4. **Review Responses**: Get structured, exam-focused answers with citations
62
+
63
+ ### Sample Questions
64
+
65
+ - "What are the diagnostic criteria for fever of unknown origin?"
66
+ - "Explain the differential diagnosis of anion gap metabolic acidosis"
67
+ - "What is the workup for neutropenic fever?"
68
+ - "Describe the MUDPILES mnemonic for metabolic acidosis"
69
+
70
+ ## 🏥 Medical Education Focus
71
+
72
+ The system is specifically optimized for:
73
+
74
+ - **Structured Learning**: Organized responses by clinical categories
75
+ - **Exam Preparation**: MRCS, USMLE, NEET-PG focused content
76
+ - **Clinical Pearls**: Highlighted practical insights
77
+ - **Evidence-Based**: Citations from authoritative medical texts
78
+ - **Differential Diagnosis**: Systematic approach to medical conditions
79
+
80
+ ## 🔐 Security & Privacy
81
+
82
+ - API keys are stored securely in environment variables
83
+ - No medical data is persistently stored
84
+ - All processing happens in secure cloud infrastructure
85
+ - Compliant with medical education standards
86
+
87
+ ## 📊 System Architecture
88
+
89
+ ```
90
+ PDF Upload → Text Extraction → Chunking → Embedding → Qdrant Cloud
91
+
92
+ User Query → Query Expansion → Vector Search → Context Retrieval
93
+
94
+ Context + Query → GPT-4o-mini → Medical Response → Citations
95
+ ```
96
+
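The same flow as a minimal sketch against the helpers `app.py` imports from `rag_core` (the PDF path is a placeholder; the chunking parameters mirror `app.py`):

```python
# End-to-end pipeline sketch using the repo's own helpers.
from rag_core import DynamicRAG, extract_pdf_pages, create_chunks

rag = DynamicRAG()                                           # Qdrant + OpenAI client
pages = extract_pdf_pages("harrison_excerpt.pdf")            # placeholder PDF path
chunks = create_chunks(pages, chunk_size=3000, overlap=200)  # as in app.py
rag.store_document("harrison_demo", chunks)                  # embed + upsert

hits = rag.search(query="workup for neutropenic fever",
                  doc_id="harrison_demo", top_k=10)          # vector search
print(rag.generate_answer("workup for neutropenic fever", hits))  # GPT-4o-mini answer
```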
97
+ ## 🤝 Contributing
98
+
99
+ This system is designed for medical education and research purposes. Contributions focused on improving medical accuracy and educational value are welcome.
100
+
101
+ ## ⚠️ Disclaimer
102
+
103
+ This tool is for educational purposes only and should not be used for actual clinical decision-making. Always consult authoritative medical sources and qualified healthcare professionals for patient care decisions.
104
+
105
+ ## 📄 License
106
+
107
+ Educational and research use only. Please respect medical education licensing and copyright requirements.
app.py ADDED
@@ -0,0 +1,768 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ QUADRANT RAG System - Enhanced UI v2 with Document Library
4
+ Professional chat interface with persistent document storage
5
+ """
6
+
7
+ import os
8
+ import streamlit as st
9
+ import json
10
+ import uuid
11
+ import time
12
+ from typing import List, Dict, Any, Optional
13
+ from pathlib import Path
14
+ from datetime import datetime, timezone
15
+ import tempfile
16
+ import base64
17
+
18
+ # Load environment variables first
20
+ from dotenv import load_dotenv
21
+ load_dotenv()
22
+
23
+ # Import RAG components
24
+ from rag_core import DynamicRAG, extract_pdf_pages, create_chunks
25
+
26
+ # Page configuration
27
+ st.set_page_config(
28
+ page_title="QUADRANT RAG - AI Document Assistant",
29
+ page_icon="🤖",
30
+ layout="wide",
31
+ initial_sidebar_state="expanded"
32
+ )
33
+
34
+ # Enhanced CSS for modern UI
35
+ st.markdown("""
36
+ <style>
37
+ /* CSS Variables for theme */
38
+ :root {
39
+ --primary-color: #5468ff;
40
+ --secondary-color: #6c63ff;
41
+ --accent-color: #00d4ff;
42
+ --background-color: #f8f9fa;
43
+ --card-background: #ffffff;
44
+ --text-primary: #2c3e50;
45
+ --text-secondary: #718096;
46
+ --border-color: #e2e8f0;
47
+ --shadow-sm: 0 2px 4px rgba(0,0,0,0.05);
48
+ --shadow-md: 0 4px 12px rgba(0,0,0,0.08);
49
+ --shadow-lg: 0 10px 30px rgba(0,0,0,0.1);
50
+ }
51
+
52
+ /* Reset and base styles */
53
+ .main {
54
+ padding: 0;
55
+ background-color: var(--background-color);
56
+ }
57
+
58
+ .stApp {
59
+ background-color: var(--background-color);
60
+ }
61
+
62
+ /* Sidebar styling */
63
+ section[data-testid="stSidebar"] {
64
+ background-color: var(--card-background);
65
+ border-right: 1px solid var(--border-color);
66
+ box-shadow: 2px 0 5px rgba(0,0,0,0.05);
67
+ }
68
+
69
+ section[data-testid="stSidebar"] .block-container {
70
+ padding: 1.5rem 1rem;
71
+ }
72
+
73
+ /* Header */
74
+ .main-header {
75
+ background: linear-gradient(135deg, var(--primary-color) 0%, var(--secondary-color) 100%);
76
+ color: white;
77
+ padding: 1.5rem 2rem;
78
+ margin: -1rem -1rem 1rem -1rem;
79
+ box-shadow: var(--shadow-md);
80
+ }
81
+
82
+ .main-header h1 {
83
+ margin: 0;
84
+ font-size: 2rem;
85
+ font-weight: 700;
86
+ }
87
+
88
+ .main-header p {
89
+ margin: 0.5rem 0 0 0;
90
+ opacity: 0.9;
91
+ font-size: 1.1rem;
92
+ }
93
+
94
+ /* Document library styles */
95
+ .doc-library-header {
96
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
97
+ margin: -1.5rem -1rem 1rem -1rem;
98
+ padding: 1.5rem 1rem;
99
+ color: white;
100
+ }
101
+
102
+ .doc-library-header h3 {
103
+ margin: 0;
104
+ font-size: 1.3rem;
105
+ font-weight: 600;
106
+ }
107
+
108
+ .doc-count {
109
+ font-size: 0.9rem;
110
+ opacity: 0.9;
111
+ margin-top: 0.25rem;
112
+ }
113
+
114
+ /* Document cards in sidebar */
115
+ .doc-card {
116
+ background: var(--card-background);
117
+ border: 1px solid var(--border-color);
118
+ border-radius: 8px;
119
+ padding: 1rem;
120
+ margin-bottom: 0.75rem;
121
+ cursor: pointer;
122
+ transition: all 0.2s ease;
123
+ }
124
+
125
+ .doc-card:hover {
126
+ transform: translateY(-2px);
127
+ box-shadow: var(--shadow-md);
128
+ border-color: var(--primary-color);
129
+ }
130
+
131
+ .doc-card.active {
132
+ border-color: var(--primary-color);
133
+ background: linear-gradient(to right, #f0f4ff 0%, #ffffff 100%);
134
+ box-shadow: var(--shadow-md);
135
+ }
136
+
137
+ .doc-card-title {
138
+ font-weight: 600;
139
+ color: var(--text-primary);
140
+ margin-bottom: 0.25rem;
141
+ font-size: 0.95rem;
142
+ }
143
+
144
+ .doc-card-meta {
145
+ font-size: 0.8rem;
146
+ color: var(--text-secondary);
147
+ display: flex;
148
+ justify-content: space-between;
149
+ align-items: center;
150
+ }
151
+
152
+ .doc-card-date {
153
+ font-size: 0.75rem;
154
+ color: var(--text-secondary);
155
+ margin-top: 0.25rem;
156
+ }
157
+
158
+ /* Chat interface */
159
+ .chat-container {
160
+ height: calc(100vh - 200px);
161
+ display: flex;
162
+ flex-direction: column;
163
+ background: var(--card-background);
164
+ border-radius: 12px;
165
+ box-shadow: var(--shadow-lg);
166
+ margin: 1rem;
167
+ overflow: hidden;
168
+ }
169
+
170
+ .chat-header {
171
+ background: linear-gradient(135deg, #f0f4ff 0%, #e8eeff 100%);
172
+ border-bottom: 1px solid var(--border-color);
173
+ padding: 1.5rem;
174
+ }
175
+
176
+ .chat-header-title {
177
+ font-size: 1.1rem;
178
+ font-weight: 600;
179
+ color: var(--text-primary);
180
+ margin: 0;
181
+ white-space: nowrap;
182
+ overflow: hidden;
183
+ text-overflow: ellipsis;
184
+ }
185
+
186
+ .chat-header-subtitle {
187
+ font-size: 0.9rem;
188
+ color: var(--text-secondary);
189
+ margin-top: 0.25rem;
190
+ }
191
+
192
+ .chat-messages {
193
+ flex: 1;
194
+ overflow-y: auto;
195
+ padding: 1.5rem;
196
+ background: #fafbfc;
197
+ }
198
+
199
+ /* Message styles */
200
+ .message {
201
+ margin-bottom: 1.5rem;
202
+ animation: fadeInUp 0.3s ease;
203
+ }
204
+
205
+ @keyframes fadeInUp {
206
+ from {
207
+ opacity: 0;
208
+ transform: translateY(10px);
209
+ }
210
+ to {
211
+ opacity: 1;
212
+ transform: translateY(0);
213
+ }
214
+ }
215
+
216
+ .message.user {
217
+ display: flex;
218
+ justify-content: flex-end;
219
+ }
220
+
221
+ .message.assistant {
222
+ display: flex;
223
+ justify-content: flex-start;
224
+ }
225
+
226
+ .message-content {
227
+ max-width: 70%;
228
+ padding: 1rem 1.25rem;
229
+ border-radius: 18px;
230
+ position: relative;
231
+ animation: scaleIn 0.2s ease;
232
+ }
233
+
234
+ @keyframes scaleIn {
235
+ from {
236
+ transform: scale(0.95);
237
+ }
238
+ to {
239
+ transform: scale(1);
240
+ }
241
+ }
242
+
243
+ .message.user .message-content {
244
+ background: linear-gradient(135deg, var(--primary-color) 0%, var(--secondary-color) 100%);
245
+ color: white;
246
+ border-bottom-right-radius: 4px;
247
+ }
248
+
249
+ .message.assistant .message-content {
250
+ background: white;
251
+ border: 1px solid var(--border-color);
252
+ color: var(--text-primary);
253
+ border-bottom-left-radius: 4px;
254
+ }
255
+
256
+ /* Avatar */
257
+ .message-avatar {
258
+ width: 36px;
259
+ height: 36px;
260
+ border-radius: 50%;
261
+ display: flex;
262
+ align-items: center;
263
+ justify-content: center;
264
+ font-weight: 600;
265
+ margin: 0 0.75rem;
266
+ }
267
+
268
+ .message.user .message-avatar {
269
+ background: linear-gradient(135deg, var(--primary-color) 0%, var(--secondary-color) 100%);
270
+ color: white;
271
+ }
272
+
273
+ .message.assistant .message-avatar {
274
+ background: linear-gradient(135deg, #f0f4ff 0%, #e8eeff 100%);
275
+ color: var(--primary-color);
276
+ }
277
+
278
+ /* Citations */
279
+ .citations {
280
+ margin-top: 0.75rem;
281
+ padding-top: 0.75rem;
282
+ border-top: 1px solid rgba(0,0,0,0.1);
283
+ }
284
+
285
+ .citation-item {
286
+ background: rgba(0,0,0,0.05);
287
+ padding: 0.5rem 0.75rem;
288
+ border-radius: 8px;
289
+ margin-top: 0.5rem;
290
+ font-size: 0.85rem;
291
+ border-left: 3px solid var(--accent-color);
292
+ }
293
+
294
+ /* Input area */
295
+ .chat-input-container {
296
+ border-top: 1px solid var(--border-color);
297
+ background: white;
298
+ padding: 1.5rem;
299
+ }
300
+
301
+ .chat-input-wrapper {
302
+ display: flex;
303
+ gap: 0.75rem;
304
+ align-items: flex-end;
305
+ }
306
+
307
+ .chat-input {
308
+ flex: 1;
309
+ background: var(--background-color);
310
+ border: 2px solid var(--border-color);
311
+ border-radius: 12px;
312
+ padding: 0.75rem 1rem;
313
+ font-size: 1rem;
314
+ transition: all 0.2s ease;
315
+ resize: none;
316
+ min-height: 50px;
317
+ }
318
+
319
+ .chat-input:focus {
320
+ outline: none;
321
+ border-color: var(--primary-color);
322
+ background: white;
323
+ }
324
+
325
+ .send-button {
326
+ background: linear-gradient(135deg, var(--primary-color) 0%, var(--secondary-color) 100%);
327
+ color: white;
328
+ border: none;
329
+ border-radius: 12px;
330
+ padding: 0.75rem 1.5rem;
331
+ font-size: 1rem;
332
+ font-weight: 600;
333
+ cursor: pointer;
334
+ transition: all 0.2s ease;
335
+ display: flex;
336
+ align-items: center;
337
+ gap: 0.5rem;
338
+ }
339
+
340
+ .send-button:hover {
341
+ transform: translateY(-2px);
342
+ box-shadow: var(--shadow-md);
343
+ }
344
+
345
+ .send-button:active {
346
+ transform: translateY(0);
347
+ }
348
+
349
+ /* Typing indicator */
350
+ .typing-indicator {
351
+ display: inline-flex;
352
+ padding: 0.75rem 1rem;
353
+ background: white;
354
+ border-radius: 18px;
355
+ border: 1px solid var(--border-color);
356
+ gap: 4px;
357
+ }
358
+
359
+ .typing-dot {
360
+ width: 8px;
361
+ height: 8px;
362
+ background: var(--primary-color);
363
+ border-radius: 50%;
364
+ animation: typing 1.4s infinite;
365
+ }
366
+
367
+ .typing-dot:nth-child(1) { animation-delay: -0.32s; }
368
+ .typing-dot:nth-child(2) { animation-delay: -0.16s; }
369
+
370
+ @keyframes typing {
371
+ 0%, 80%, 100% {
372
+ opacity: 0.5;
373
+ transform: scale(0.8);
374
+ }
375
+ 40% {
376
+ opacity: 1;
377
+ transform: scale(1);
378
+ }
379
+ }
380
+
381
+ /* Upload area */
382
+ .upload-area {
383
+ border: 2px dashed var(--primary-color);
384
+ border-radius: 12px;
385
+ padding: 3rem;
386
+ text-align: center;
387
+ background: linear-gradient(to bottom, #f0f4ff 0%, #ffffff 100%);
388
+ transition: all 0.3s ease;
389
+ margin: 1rem;
390
+ }
391
+
392
+ .upload-area:hover {
393
+ border-color: var(--secondary-color);
394
+ background: linear-gradient(to bottom, #e8eeff 0%, #f8f9ff 100%);
395
+ }
396
+
397
+ .upload-icon {
398
+ font-size: 4rem;
399
+ color: var(--primary-color);
400
+ margin-bottom: 1rem;
401
+ }
402
+
403
+ /* Empty state */
404
+ .empty-state {
405
+ text-align: center;
406
+ padding: 3rem;
407
+ color: var(--text-secondary);
408
+ }
409
+
410
+ .empty-state-icon {
411
+ font-size: 4rem;
412
+ opacity: 0.3;
413
+ margin-bottom: 1rem;
414
+ }
415
+
416
+ /* Buttons */
417
+ .stButton > button {
418
+ background: linear-gradient(135deg, var(--primary-color) 0%, var(--secondary-color) 100%);
419
+ color: white;
420
+ border: none;
421
+ padding: 0.75rem 1.5rem;
422
+ border-radius: 8px;
423
+ font-weight: 600;
424
+ transition: all 0.2s ease;
425
+ }
426
+
427
+ .stButton > button:hover {
428
+ transform: translateY(-2px);
429
+ box-shadow: var(--shadow-md);
430
+ }
431
+
432
+ /* Hide Streamlit defaults */
433
+ #MainMenu {visibility: hidden;}
434
+ footer {visibility: hidden;}
435
+
436
+ /* Scrollbar */
437
+ ::-webkit-scrollbar {
438
+ width: 8px;
439
+ height: 8px;
440
+ }
441
+
442
+ ::-webkit-scrollbar-track {
443
+ background: var(--background-color);
444
+ }
445
+
446
+ ::-webkit-scrollbar-thumb {
447
+ background: var(--border-color);
448
+ border-radius: 4px;
449
+ }
450
+
451
+ ::-webkit-scrollbar-thumb:hover {
452
+ background: var(--text-secondary);
453
+ }
454
+ </style>
455
+ """, unsafe_allow_html=True)
456
+
457
+ # Initialize session state
458
+ if 'rag_system' not in st.session_state:
459
+ st.session_state.rag_system = None
460
+ if 'current_doc' not in st.session_state:
461
+ st.session_state.current_doc = None
462
+ if 'chat_history' not in st.session_state:
463
+ st.session_state.chat_history = []
464
+ if 'all_documents' not in st.session_state:
465
+ st.session_state.all_documents = []
466
+ if 'processing' not in st.session_state:
467
+ st.session_state.processing = False
468
+ if 'waiting_for_response' not in st.session_state:
469
+ st.session_state.waiting_for_response = False
470
+
471
+ def init_rag_system():
472
+ """Initialize the RAG system"""
473
+ try:
474
+ # Check environment variables
475
+ from dotenv import load_dotenv
476
+ load_dotenv() # Reload environment variables
477
+
478
+ openai_key = os.environ.get('OPENAI_API_KEY', '')
479
+ qdrant_url = os.environ.get('QDRANT_URL', '')
480
+ qdrant_key = os.environ.get('QDRANT_API_KEY', '')
481
+
482
+ if not openai_key or openai_key == 'your-openai-api-key-here':
483
+ st.error("❌ OpenAI API key not configured. Please set OPENAI_API_KEY in your environment.")
484
+ return False
485
+
486
+ if not qdrant_url or not qdrant_key:
487
+ st.warning("⚠️ Qdrant Cloud credentials not found. Using in-memory storage.")
488
+
489
+ with st.spinner("🔄 Initializing RAG System..."):
490
+ st.session_state.rag_system = DynamicRAG()
491
+ # Load all documents from Qdrant
492
+ st.session_state.all_documents = st.session_state.rag_system.get_all_documents()
493
+
494
+ st.success("✅ RAG System initialized successfully!")
495
+ return True
496
+ except Exception as e:
497
+ st.error(f"❌ Failed to initialize RAG system: {str(e)}")
498
+ return False
499
+
500
+ def process_pdf_upload(uploaded_file) -> Optional[Dict[str, Any]]:
501
+ """Process uploaded PDF file"""
502
+ try:
503
+ st.session_state.processing = True
504
+
505
+ # Save uploaded file
506
+ temp_path = Path(tempfile.gettempdir()) / f"{uuid.uuid4().hex}.pdf"
507
+ with open(temp_path, "wb") as f:
508
+ f.write(uploaded_file.getvalue())
509
+
510
+ # Extract text
511
+ pages = extract_pdf_pages(str(temp_path))
512
+
513
+ # Create chunks
514
+ chunks = create_chunks(pages, chunk_size=3000, overlap=200)
515
+
516
+ # Generate document ID
517
+ doc_id = f"{uploaded_file.name.replace('.pdf', '')}_{int(time.time())}"
518
+
519
+ # Store in Qdrant
520
+ st.session_state.rag_system.store_document(doc_id, chunks)
521
+
522
+ # Create document info
523
+ doc_info = {
524
+ 'doc_id': doc_id,
525
+ 'title': uploaded_file.name,
526
+ 'pages': len(pages),
527
+ 'chunks': len(chunks),
528
+ 'upload_time': datetime.now(timezone.utc).isoformat()
529
+ }
530
+
531
+ # Update documents list
532
+ st.session_state.all_documents = st.session_state.rag_system.get_all_documents()
533
+
534
+ # Clean up
535
+ temp_path.unlink()
536
+
537
+ return doc_info
538
+
539
+ except Exception as e:
540
+ st.error(f"Error processing PDF: {str(e)}")
541
+ return None
542
+ finally:
543
+ st.session_state.processing = False
544
+
545
+ def query_document(question: str) -> tuple[str, List[Dict[str, Any]]]:
546
+ """Query the current document"""
547
+ try:
548
+ if not st.session_state.current_doc:
549
+ return "Please select a document first.", []
550
+
551
+ # Search in current document - increased for better coverage
552
+ search_results = st.session_state.rag_system.search(
553
+ query=question,
554
+ doc_id=st.session_state.current_doc['doc_id'],
555
+ top_k=10
556
+ )
557
+
558
+ if not search_results:
559
+ return "I couldn't find relevant information about that in the document.", []
560
+
561
+ # Generate answer
562
+ answer = st.session_state.rag_system.generate_answer(question, search_results)
563
+
564
+ # Check if the answer indicates insufficient evidence
565
+ insufficient_keywords = ["insufficient evidence", "couldn't find", "no relevant information", "cannot answer"]
566
+
567
+ # Prepare citations only if the answer has sufficient evidence
568
+ citations = []
569
+ if not any(keyword in answer.lower() for keyword in insufficient_keywords):
570
+ for i, result in enumerate(search_results[:3]):
571
+ citations.append({
572
+ 'page': result['page'],
573
+ 'text': result['text'][:150] + "..." if len(result['text']) > 150 else result['text'],
574
+ 'score': round(result['score'], 3)
575
+ })
576
+
577
+ return answer, citations
578
+
579
+ except Exception as e:
580
+ return f"Sorry, I encountered an error: {str(e)}", []
581
+
582
+ def render_sidebar():
583
+ """Render the document library sidebar"""
584
+ with st.sidebar:
585
+ # Header
586
+ st.markdown("""
587
+ <div class="doc-library-header">
588
+ <h3>📚 Document Library</h3>
589
+ <div class="doc-count">{} documents stored</div>
590
+ </div>
591
+ """.format(len(st.session_state.all_documents)), unsafe_allow_html=True)
592
+
593
+ # Upload new document
594
+ with st.expander("📤 Upload New Document", expanded=False):
595
+ uploaded_file = st.file_uploader(
596
+ "Choose a PDF file",
597
+ type=['pdf'],
598
+ label_visibility="collapsed",
599
+ disabled=st.session_state.processing
600
+ )
601
+
602
+ if uploaded_file and st.button("Upload", type="primary", use_container_width=True):
603
+ with st.spinner("Processing..."):
604
+ doc = process_pdf_upload(uploaded_file)
605
+ if doc:
606
+ st.success("✅ Document uploaded successfully!")
607
+ st.rerun()
608
+
609
+ # Document list
610
+ if st.session_state.all_documents:
611
+ st.markdown("### Your Documents")
612
+
613
+ for doc in st.session_state.all_documents:
614
+ # Check if this is the current document
615
+ is_active = (st.session_state.current_doc and
616
+ doc['doc_id'] == st.session_state.current_doc['doc_id'])
617
+
618
+ # Document card
619
+ card_class = "doc-card active" if is_active else "doc-card"
620
+
621
+ col1, col2 = st.columns([5, 1])
622
+
623
+ with col1:
624
+ if st.button(
625
+ f"📄 **{doc['title'][:30]}{'...' if len(doc['title']) > 30 else ''}**\n\n"
626
+ f"📊 {doc['pages']} pages • {doc['chunks']} chunks",
627
+ key=f"doc_{doc['doc_id']}",
628
+ use_container_width=True
629
+ ):
630
+ st.session_state.current_doc = doc
631
+ st.session_state.chat_history = []
632
+ st.rerun()
633
+
634
+ with col2:
635
+ if st.button("🗑️", key=f"del_{doc['doc_id']}",
636
+ help="Delete this document"):
637
+ if st.session_state.rag_system.delete_document(doc['doc_id']):
638
+ st.session_state.all_documents = st.session_state.rag_system.get_all_documents()
639
+ if (st.session_state.current_doc and
640
+ doc['doc_id'] == st.session_state.current_doc['doc_id']):
641
+ st.session_state.current_doc = None
642
+ st.session_state.chat_history = []
643
+ st.rerun()
644
+
645
+ else:
646
+ st.markdown("""
647
+ <div class="empty-state">
648
+ <div class="empty-state-icon">📭</div>
649
+ <p>No documents yet</p>
650
+ <p style="font-size: 0.85rem;">Upload your first PDF to get started</p>
651
+ </div>
652
+ """, unsafe_allow_html=True)
653
+
654
+ def render_chat_interface():
655
+ """Render the main chat interface"""
656
+ if not st.session_state.current_doc:
657
+ # No document selected
658
+ st.markdown("""
659
+ <div class="upload-area">
660
+ <div class="upload-icon">📚</div>
661
+ <h2>Welcome to QUADRANT RAG Medical Assistant</h2>
662
+ <p style="font-size: 1.1rem; color: #718096; margin-top: 1rem;">
663
+ Upload medical documents or select from your library to start AI-powered medical Q&A
664
+ </p>
665
+ <p style="font-size: 0.95rem; color: #a0aec0; margin-top: 0.5rem;">
666
+ ✨ Powered by OpenAI GPT-4o-mini & Qdrant Cloud • Optimized for Medical Education
667
+ </p>
668
+ </div>
669
+ """, unsafe_allow_html=True)
670
+ else:
671
+ # Chat interface
672
+ title = st.session_state.current_doc['title']
673
+ # Truncate overly long titles for cleaner UI
674
+ display_title = (title[:100] + "…") if len(title) > 100 else title
675
+ pages = st.session_state.current_doc['pages']
676
+ chunks = st.session_state.current_doc['chunks']
677
+ st.markdown(
678
+ f"""
679
+ <div class=\"chat-header\">
680
+ <div class=\"chat-header-title\" title=\"{title}\">💬 Chatting with: {display_title}</div>
681
+ <div class=\"chat-header-subtitle\">{pages} pages • {chunks} chunks • Ask anything about this document</div>
682
+ </div>
683
+ """,
684
+ unsafe_allow_html=True,
685
+ )
686
+
687
+ # New chat UI using Streamlit's native components
688
+ if not st.session_state.chat_history:
689
+ st.info("Start a conversation about your document. Ask me to explain, summarize, or find specifics.")
690
+
691
+ for msg in st.session_state.chat_history:
692
+ if msg['type'] == 'user':
693
+ with st.chat_message("user"):
694
+ st.markdown(msg['content'])
695
+ else:
696
+ with st.chat_message("assistant"):
697
+ st.markdown(msg['content'])
698
+ if msg.get('citations'):
699
+ with st.expander(f"📚 {len(msg['citations'])} Sources"):
700
+ for i, cite in enumerate(msg['citations'], 1):
701
+ st.markdown(f"**[{i}] Page {cite['page']}** (Relevance: {cite['score']:.3f})")
702
+ st.text(cite['text'][:200] + "..." if len(cite['text']) > 200 else cite['text'])
703
+ st.divider()
704
+
705
+ # Chat input and immediate handling
706
+ if prompt := st.chat_input("Ask anything about this document…"):
707
+ st.session_state.chat_history.append({'type': 'user', 'content': prompt})
708
+ with st.chat_message("assistant"):
709
+ with st.spinner("Thinking..."):
710
+ answer, citations = query_document(prompt)
711
+ st.session_state.chat_history.append({
712
+ 'type': 'assistant',
713
+ 'content': answer,
714
+ 'citations': citations if citations else None
715
+ })
716
+ st.markdown(answer)
717
+ if citations:
718
+ with st.expander(f"📚 {len(citations)} Sources"):
719
+ for i, cite in enumerate(citations, 1):
720
+ st.markdown(f"**[{i}] Page {cite['page']}** (Relevance: {cite['score']:.3f})")
721
+ st.text(cite['text'][:200] + "..." if len(cite['text']) > 200 else cite['text'])
722
+ st.divider()
723
+
724
+ # Prevent legacy UI from rendering below
725
+ return
726
+
727
+
728
+ def main():
729
+ # Configuration section for missing environment variables
730
+ openai_key = os.environ.get('OPENAI_API_KEY', '')
731
+ if not openai_key or openai_key == 'your-openai-api-key-here':
732
+ st.error("🔑 **OpenAI API Key Required**")
733
+ st.markdown("""
734
+ Please set your OpenAI API key:
735
+ 1. Add `OPENAI_API_KEY=your-key-here` to the `.env` file, OR
736
+ 2. Set it as an environment variable in your deployment platform
737
+ """)
738
+
739
+ # Quick input for testing
740
+ with st.expander("💡 Quick Setup (for testing)"):
741
+ key_input = st.text_input("Enter OpenAI API Key:", type="password")
742
+ if st.button("Set API Key") and key_input:
743
+ os.environ['OPENAI_API_KEY'] = key_input
744
+ st.success("✅ API Key set! Initializing system...")
745
+ st.rerun()
746
+ st.stop()
747
+
748
+ # Initialize system
749
+ if not st.session_state.rag_system:
750
+ if not init_rag_system():
751
+ st.stop()
752
+
753
+ # Header
754
+ st.markdown("""
755
+ <div class="main-header">
756
+ <h1>🤖 QUADRANT RAG - Document AI Assistant</h1>
757
+ <p>Powered by Qdrant Vector Database & OpenAI GPT-4o-mini</p>
758
+ </div>
759
+ """, unsafe_allow_html=True)
760
+
761
+ # Sidebar
762
+ render_sidebar()
763
+
764
+ # Main content
765
+ render_chat_interface()
766
+
767
+ if __name__ == "__main__":
768
+ main()
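`rag_core.py` (685 lines) is also added in this commit but its body is not shown above. From the call sites in `app.py`, its public surface must include at least the following; this is an inferred sketch, not the actual implementation, and the types are guesses:

```python
# Inferred from app.py's imports and call sites; real signatures may differ.
from typing import Any, Dict, List

def extract_pdf_pages(path: str) -> List[Any]:
    """Per-page text extracted from the PDF at `path` (len() gives the page count)."""
    ...

def create_chunks(pages: List[Any], chunk_size: int = 3000,
                  overlap: int = 200) -> List[Dict[str, Any]]:
    """Overlapping text chunks ready for embedding."""
    ...

class DynamicRAG:
    def store_document(self, doc_id: str, chunks: List[Dict[str, Any]]) -> None: ...
    def get_all_documents(self) -> List[Dict[str, Any]]: ...  # doc_id, title, pages, chunks
    def delete_document(self, doc_id: str) -> bool: ...
    # each search hit carries 'page', 'text', and 'score' (see query_document above)
    def search(self, query: str, doc_id: str, top_k: int = 10) -> List[Dict[str, Any]]: ...
    def generate_answer(self, question: str, results: List[Dict[str, Any]]) -> str: ...
```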
data/docs.json ADDED
@@ -0,0 +1,393 @@
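This registry keeps one entry per upload under `docs` and per-document chunk lists under `chunks`, keyed by document id. A minimal way to inspect it (a sketch, assuming the file sits at `data/docs.json`):

```python
# Inspect the document registry shipped in this commit.
import json

with open("data/docs.json") as f:
    registry = json.load(f)

print(len(registry["docs"]), "documents registered")
for doc in registry["docs"]:
    stored = len(registry["chunks"].get(doc["id"], []))  # some entries have no chunk list
    print(doc["id"], "-", doc["title"][:40], "-", doc.get("pages", "?"), "pages,", stored, "chunks")
```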
1
+ {
2
+ "docs": [
3
+ {
4
+ "id": "f3dd7bef91b9",
5
+ "title": "[https___github.com_niki-amini-naieni_CountVid](ht.pdf",
6
+ "filename": "f3dd7bef91b9.pdf",
7
+ "path": "/Users/athulnambiar/Desktop/PROJECTS/QUADRANT_RAG/uploads/f3dd7bef91b9.pdf",
8
+ "pages": 4,
9
+ "created_at": "2025-09-22T17:04:06.943407Z"
10
+ },
11
+ {
12
+ "id": "639a85dd8f1f",
13
+ "title": "AI-Powered Company Classification Tool_ Technical Architecture & Data Flow-2.pdf",
14
+ "filename": "639a85dd8f1f.pdf",
15
+ "path": "/Users/athulnambiar/Desktop/PROJECTS/QUADRANT_RAG/uploads/639a85dd8f1f.pdf",
16
+ "pages": 2,
17
+ "created_at": "2025-09-22T18:26:30.613458Z"
18
+ },
19
+ {
20
+ "id": "50f6dc85d40b",
21
+ "title": "2262041-Athul Nambiar Final-2.pdf",
22
+ "filename": "50f6dc85d40b.pdf",
23
+ "pages": 30,
24
+ "chunks": 24,
25
+ "created_at": "2025-09-23T17:16:16.652638",
26
+ "enhanced": true,
27
+ "medical_optimized": true
28
+ },
29
+ {
30
+ "id": "be191882380f",
31
+ "title": "MVP Development for AI-Powered Company Classification Tool.pdf",
32
+ "filename": "be191882380f.pdf",
33
+ "pages": 4,
34
+ "chunks": 11,
35
+ "created_at": "2025-09-23T17:37:46.782738+00:00",
36
+ "dynamic_rag": true,
37
+ "vector_db": "qdrant",
38
+ "llm_model": "gpt-4o-mini"
39
+ },
40
+ {
41
+ "id": "ff610e38e8b3",
42
+ "title": "1. Christ University Policy on Research Internships for Undergraduate and Postgraduate Students.pdf",
43
+ "filename": "ff610e38e8b3.pdf",
44
+ "pages": 4,
45
+ "chunks": 12,
46
+ "created_at": "2025-09-25T19:04:11.467669+00:00",
47
+ "dynamic_rag": true,
48
+ "vector_db": "qdrant",
49
+ "llm_model": "gpt-4o-mini"
50
+ },
51
+ {
52
+ "id": "bec87b0367ac",
53
+ "title": "Mini-project Report front page_Image and Video Analytics.pdf",
54
+ "filename": "bec87b0367ac.pdf",
55
+ "pages": 7,
56
+ "chunks": 6,
57
+ "created_at": "2025-09-25T19:16:09.040014+00:00",
58
+ "dynamic_rag": true,
59
+ "vector_db": "qdrant",
60
+ "llm_model": "gpt-4o-mini"
61
+ },
62
+ {
63
+ "id": "0fd6df206ce2",
64
+ "title": "Mini-project Report front page_Image and Video Analytics.pdf",
65
+ "filename": "0fd6df206ce2.pdf",
66
+ "pages": 7,
67
+ "chunks": 6,
68
+ "created_at": "2025-09-25T19:19:09.454543+00:00",
69
+ "dynamic_rag": true,
70
+ "vector_db": "qdrant",
71
+ "llm_model": "gpt-4o-mini"
72
+ },
73
+ {
74
+ "id": "9ef4a5805de2",
75
+ "title": "Harrison\u2019s Principles of Internal Medicine, 21st edition -- By Joseph Loscalzo, Anthony Fauci, Dennis Kasper, Stephen -- 21th edition, New York, -- 9781259644030 -- 201ae48755c6d5e6625a0d4018a77f49 -- Anna\u2019s Archive_split-merge.pdf",
76
+ "filename": "9ef4a5805de2.pdf",
77
+ "pages": 51,
78
+ "chunks": 491,
79
+ "created_at": "2025-09-25T19:21:14.181613+00:00",
80
+ "dynamic_rag": true,
81
+ "vector_db": "qdrant",
82
+ "llm_model": "gpt-4o-mini"
83
+ },
84
+ {
85
+ "id": "83c48ac2fa2f",
86
+ "title": "Harrison\u2019s Principles of Internal Medicine, 21st edition -- By Joseph Loscalzo, Anthony Fauci, Dennis Kasper, Stephen -- 21th edition, New York, -- 9781259644030 -- 201ae48755c6d5e6625a0d4018a77f49 -- Anna\u2019s Archive_split-merge.pdf",
87
+ "filename": "83c48ac2fa2f.pdf",
88
+ "pages": 51,
89
+ "chunks": 491,
90
+ "created_at": "2025-09-26T07:13:09.540473+00:00",
91
+ "dynamic_rag": true,
92
+ "vector_db": "qdrant",
93
+ "llm_model": "gpt-4o-mini"
94
+ },
95
+ {
96
+ "id": "3d5f4faa9630",
97
+ "title": "Harrison\u2019s Principles of Internal Medicine, 21st edition -- By Joseph Loscalzo, Anthony Fauci, Dennis Kasper, Stephen -- 21th edition, New York, -- 9781259644030 -- 201ae48755c6d5e6625a0d4018a77f49 -- Anna\u2019s Archive_split-merge.pdf",
98
+ "filename": "3d5f4faa9630.pdf",
99
+ "pages": 51,
100
+ "chunks": 491,
101
+ "created_at": "2025-09-26T11:37:30.064125+00:00",
102
+ "dynamic_rag": true,
103
+ "vector_db": "qdrant",
104
+ "llm_model": "gpt-4o-mini"
105
+ },
106
+ {
107
+ "id": "91e53cac0db8",
108
+ "title": "Harrison\u2019s Principles of Internal Medicine, 21st edition -- By Joseph Loscalzo, Anthony Fauci, Dennis Kasper, Stephen -- 21th edition, New York, -- 9781259644030 -- 201ae48755c6d5e6625a0d4018a77f49 -- Anna\u2019s Archive_split-merge.pdf",
109
+ "filename": "91e53cac0db8.pdf",
110
+ "pages": 51,
111
+ "chunks": 247,
112
+ "created_at": "2025-09-26T14:50:49.748305+00:00",
113
+ "dynamic_rag": true,
114
+ "vector_db": "qdrant",
115
+ "llm_model": "gpt-4o-mini"
116
+ },
117
+ {
118
+ "id": "e42111b7b18b",
119
+ "title": "Harrison\u2019s Principles of Internal Medicine, 21st edition -- By Joseph Loscalzo, Anthony Fauci, Dennis Kasper, Stephen -- 21th edition, New York, -- 9781259644030 -- 201ae48755c6d5e6625a0d4018a77f49 -- Anna\u2019s Archive_split-merge.pdf",
120
+ "filename": "e42111b7b18b.pdf",
121
+ "pages": 51,
122
+ "chunks": 247,
123
+ "created_at": "2025-09-26T14:54:02.811504+00:00",
124
+ "dynamic_rag": true,
125
+ "vector_db": "qdrant",
126
+ "llm_model": "gpt-4o-mini"
127
+ },
128
+ {
129
+ "id": "b78c65d6b361",
130
+ "title": "Harrison\u2019s Principles of Internal Medicine, 21st edition -- By Joseph Loscalzo, Anthony Fauci, Dennis Kasper, Stephen -- 21th edition, New York, -- 9781259644030 -- 201ae48755c6d5e6625a0d4018a77f49 -- Anna\u2019s Archive_split-merge.pdf",
131
+ "filename": "b78c65d6b361.pdf",
132
+ "pages": 51,
133
+ "chunks": 247,
134
+ "created_at": "2025-09-26T14:54:03.053592+00:00",
135
+ "dynamic_rag": true,
136
+ "vector_db": "qdrant",
137
+ "llm_model": "gpt-4o-mini"
138
+ },
139
+ {
140
+ "id": "6200aee12faa",
141
+ "title": "Harrison\u2019s Principles of Internal Medicine, 21st edition -- By Joseph Loscalzo, Anthony Fauci, Dennis Kasper, Stephen -- 21th edition, New York, -- 9781259644030 -- 201ae48755c6d5e6625a0d4018a77f49 -- Anna\u2019s Archive_split-merge.pdf",
142
+ "filename": "6200aee12faa.pdf",
143
+ "pages": 51,
144
+ "chunks": 247,
145
+ "created_at": "2025-09-26T15:29:57.820092+00:00",
146
+ "dynamic_rag": true,
147
+ "vector_db": "qdrant",
148
+ "llm_model": "gpt-4o-mini"
149
+ },
150
+ {
151
+ "id": "8888d36bfc6e",
152
+ "title": "Harrison\u2019s Principles of Internal Medicine, 21st edition -- By Joseph Loscalzo, Anthony Fauci, Dennis Kasper, Stephen -- 21th edition, New York, -- 9781259644030 -- 201ae48755c6d5e6625a0d4018a77f49 -- Anna\u2019s Archive_split-merge.pdf",
153
+ "filename": "8888d36bfc6e.pdf",
154
+ "pages": 51,
155
+ "chunks": 247,
156
+ "created_at": "2025-09-29T16:24:46.915516+00:00",
157
+ "dynamic_rag": true,
158
+ "vector_db": "qdrant",
159
+ "llm_model": "gpt-4o-mini"
160
+ },
161
+ {
162
+ "id": "9b2ccd8b9cc9",
163
+ "title": "Harrison\u2019s Principles of Internal Medicine, 21st edition -- By Joseph Loscalzo, Anthony Fauci, Dennis Kasper, Stephen -- 21th edition, New York, -- 9781259644030 -- 201ae48755c6d5e6625a0d4018a77f49 -- Anna\u2019s Archive_split-merge.pdf",
164
+ "filename": "9b2ccd8b9cc9.pdf",
165
+ "pages": 51,
166
+ "chunks": 247,
167
+ "created_at": "2025-09-29T16:42:35.313291+00:00",
168
+ "dynamic_rag": true,
169
+ "vector_db": "qdrant",
170
+ "llm_model": "gpt-4o-mini"
171
+ }
172
+ ],
173
+ "chunks": {
174
+ "50f6dc85d40b": [
175
+ {
176
+ "id": "50f6dc85d40b_0",
177
+ "text": "SORCOVA HEALTH DEV AN INTERNSHIP REPORT Submitted by ATHUL NAMBIAR (2262041) In partial fulfilment of the requirements for the degree of BACHELOR OF TECHNOLOGY IN COMPUTER SCIENCE AND ENGINEERING (Artificial Intelligence and Machine Learning) UNDER THE GUIDANCE OF PROF. CHAITRA P C Department of Computer Science and Engineering, School of Engineering and Technology CHRIST (Deemed to be University) Kumbalgodu, 560074 August 2025",
178
+ "page": 1,
179
+ "section_title": "Introduction",
180
+ "medical_terms": [],
181
+ "chunk_type": "general",
182
+ "relevance_score": 0.0
183
+ },
184
+ {
185
+ "id": "50f6dc85d40b_1",
186
+ "text": "Department of Computer Science and Engineering School of Engineering and Technology CHRIST (Deemed to be University) BONAFIDE CERTIFICATE This is to certify that ATHUL NAMBIAR (2262041) has successfully completed his summer internship work entitled \u201cSORCOVA HEALTH DEV\u201d in Sorcova Health located at 60 Avenue Charles de Gaulle, 92200 Neuilly-sur-Seine, France from 03-03-2025 to 31-05-2025 for a duration of 3 Months and submitted on 14-08-2025 in partial fulfillment for the award of Bachelor of Technology in Computer Science and Engineering (Artificial Intelligence and Machine Learning) during the academic year . GUIDE (Signature with Date) HEAD OF THE DEPARTMENT (Signature with Seal) EXAMINER 1 (Name & Signature with Date) Examiner 2 (Name & Signature with Date) 2024-2025",
187
+ "page": 2,
188
+ "section_title": "Introduction",
189
+ "medical_terms": [],
190
+ "chunk_type": "general",
191
+ "relevance_score": 0.0
192
+ },
193
+ {
194
+ "id": "50f6dc85d40b_2",
195
+ "text": "ABSTRACT During my internship at Socvova Health as a Fullstack Developer, I gained invaluable experience by contributing to the development of an innovative technology networking platform. I enhanced my skills in advanced backend development, server-side optimisation, and API integration while collaborating with a dynamic team to drive technological innovation and connectivity among global tech enthusiasts. This experience allowed me to apply theoretical knowledge to real-world applications, solidifying my passion for backend development and inspiring me to pursue further innovation in this field. My work involved designing and testing server-side components, optimising performance, and ensuring code quality, which prepared me for future challenges in the tech industry.This hands-on experience not only enriched my technical expertise but also emphasised the importance of innovation and collaboration in the tech industry.",
196
+ "page": 3,
197
+ "section_title": "Introduction",
198
+ "medical_terms": [],
199
+ "chunk_type": "general",
200
+ "relevance_score": 0.0
201
+ },
202
+ {
203
+ "id": "50f6dc85d40b_3",
204
+ "text": "PREFACE During my internship at Socvova Health as a Fullstack Developer from March 03, 2025, to May 31, 2025, I had the opportunity to work with a dynamic team committed to technological innovation. This experience allowed me to apply my theoretical knowledge in Computer Science Engineering to real-world backend development projects. I contributed to the development of a global technology networking platform, enhancing my skills in Node.js and server-side optimisation. Collaborating with visionary leaders and talented colleagues provided invaluable insights into the tech industry, solidifying my passion for backend development and inspiring me to pursue further innovation in this field. This report encapsulates the skills and knowledge I gained, the challenges I encountered, and the solutions I implemented during this transformative period.",
205
+ "page": 4,
206
+ "section_title": "Introduction",
207
+ "medical_terms": [],
208
+ "chunk_type": "general",
209
+ "relevance_score": 0.0
210
+ },
211
+ {
212
+ "id": "50f6dc85d40b_4",
213
+ "text": "INDEX CONTENT PAGE NO. Abstract I Preface II List of Symbols and Abbreviations III List of Figures IV List of Tables V 1 Introduction 1 1.1. Objectives 1 1.2. Company Profile 2 2 Technical Description & Implementation 3 2.1. Actual Work 3 2.1.1. Prerequisites 3 2.1.2. Responsibilities 4 2.1.3. Challenges 4 2.1.4 Implementation Description 4 2.1.5 Methodology 2.2. Learning Outcomes & Key Takeaway 3 Conclusion & Future Scope 5 3.1. Conclusion 5 3.2. Future Scope 5 4 Appendixes A Acceptance letter 7 B Completion Certificate 8 C Code and screenshots 9 5 References 10",
214
+ "page": 5,
215
+ "section_title": "Introduction",
216
+ "medical_terms": [],
217
+ "chunk_type": "conclusion",
218
+ "relevance_score": 0.0
219
+ },
220
+ {
221
+ "id": "50f6dc85d40b_5",
222
+ "text": "1.2 COMPANY PROFILE Sorcova Health is a digital health platform registered in France focused on monitoring mental well-being, managing stress, and implementing personalized prevention strategies to mitigate chronic stress and burnout. Their mission is to address chronic stress, a significant risk factor for various non-communicable diseases, by using preventative health and lifestyle medicine approaches to improve long-term health outcomes and enhance overall quality of life.",
223
+ "page": 8,
224
+ "section_title": "4. Troubleshooting and Support: Performed troubles",
225
+ "medical_terms": [],
226
+ "chunk_type": "general",
227
+ "relevance_score": 0.0
228
+ },
229
+ {
230
+ "id": "50f6dc85d40b_6",
231
+ "text": "2.1.2 RESPONSIBILITIES My responsibilities included: During my internship as a Fullstack Developer at Sorcova Health, my primary responsibilities revolved around the successful execution of assigned tasks within specified deadlines. These tasks included designing, developing, and testing server-side components, ensuring optimal performance of database and front-end requests, and integrating server-side logic with user-side components. I was also responsible for writing clean, efficient, and reusable code, developing APIs as needed, and troubleshooting technical issues independently before seeking assistance. Maintaining consistent communication with my mentor through daily reports was crucial in tracking progress and ensuring alignment with project goals. This proactive approach fostered a sense of self-reliance and resourcefulness, enabling me to contribute effectively to the development of the Sorcova Health platform. Feel free to modify or expand on this as needed to better fit your personal experiences and contributions during your internship.",
232
+ "page": 10,
233
+ "section_title": "CHAPTER 2 TECHNICAL DESCRIPTION & IMPLEMENTATION 2",
234
+ "medical_terms": [],
235
+ "chunk_type": "general",
236
+ "relevance_score": 0.0
237
+ },
238
+ {
239
+ "id": "50f6dc85d40b_7",
240
+ "text": "2.1.3 CHALLENGES I encountered several challenges that tested my problem-solving skills and adaptability. One significant challenge was managing data inconsistencies and ensuring data integrity across the platform. This involved dealing with various data formats and ensuring seamless integration with existing systems. Another challenge was optimising server-side performance to handle high traffic efficiently, which required a deep understanding of backend architecture and careful resource management. Additionally, selecting the appropriate technologies and frameworks for developing scalable backend solutions was a complex task, as it involved balancing performance, security, and maintainability. These challenges, while demanding, provided valuable learning experiences and contributed to my growth as a backend developer. Feel free to adjust or expand on any specific details based on your personal experience during the internship.",
241
+ "page": 11,
242
+ "section_title": "CHAPTER 2 TECHNICAL DESCRIPTION & IMPLEMENTATION 2",
243
+ "medical_terms": [],
244
+ "chunk_type": "general",
245
+ "relevance_score": 0.0
246
+ },
247
+ {
248
+ "id": "50f6dc85d40b_8",
249
+ "text": "2.1.4 IMPLEMENTATION DESCRIPTION During the internship at Sorcova Health, the implementation primarily focused on developing and enhancing a technology networking platform, particularly its backend functionalities and API integrations. The core of the implementation involved leveraging Node.js for server-side development, as evidenced by the server.js file, which sets up the Express.js server, handles routing for users, emails, and comments, and integrates with dbConfigure.js for database connectivity. Key aspects included creating and managing RESTful APIs for various functionalities, such as retrieving, adding, updating, and deleting announcements, with specific endpoints for managing announcement views and counts. The implementation also incorporated real-time communication through Socket.io for features like instant announcement notifications. Database interactions, likely with a SQL database based on the structure of announcement.model.js and the use of SQL queries, were central to managing user data, announcements, and their seen/unseen status.",
250
+ "page": 12,
251
+ "section_title": "CHAPTER 2 TECHNICAL DESCRIPTION & IMPLEMENTATION 2",
252
+ "medical_terms": [],
253
+ "chunk_type": "general",
254
+ "relevance_score": 0.0
255
+ },
256
+ {
257
+ "id": "50f6dc85d40b_9",
258
+ "text": "al-time communication through Socket.io for features like instant announcement notifications. Database interactions, likely with a SQL database based on the structure of announcement.model.js and the use of SQL queries, were central to managing user data, announcements, and their seen/unseen status. Furthermore, the development process involved a focus on code quality, reusability, and performance optimization for both database and front-end requests, aiming to translate Figma designs into functional web applications. API testing was conducted using Postman to ensure the robustness and correctness of the implemented endpoints.",
259
+ "page": 12,
260
+ "section_title": "CHAPTER 2 TECHNICAL DESCRIPTION & IMPLEMENTATION 2",
261
+ "medical_terms": [],
262
+ "chunk_type": "general",
263
+ "relevance_score": 0.0
264
+ },
265
+ {
266
+ "id": "50f6dc85d40b_10",
267
+ "text": "2.1.5 METHODOLOGY The development process at Sorcova Health followed an agile methodology, emphasizing iterative development and continuous feedback to translate Figma workflow designs into functional web applications. Backend development was primarily conducted using Node.js and its Express.js framework for building robust RESTful APIs. For data persistence and retrieval, SQL databases were utilized, with database interactions managed through custom models and direct SQL queries to optimize performance and ensure data integrity. Version control was rigorously maintained using Git, facilitating collaborative development, code merging, and tracking changes. Real-time communication features, such as instant announcement notifications, were implemented using Socket.io. Throughout the implementation, a strong emphasis was placed on writing clean, efficient, and reusable code, while API testing was systematically performed using Postman to validate endpoint functionality and overall system reliability.",
268
+ "page": 13,
269
+ "section_title": "CHAPTER 2 TECHNICAL DESCRIPTION & IMPLEMENTATION 2",
270
+ "medical_terms": [],
271
+ "chunk_type": "general",
272
+ "relevance_score": 0.0
273
+ },
274
+ {
275
+ "id": "50f6dc85d40b_11",
276
+ "text": "C. CODE & SCREEN SHOT server.js require(\"dotenv\").config() var cors = require(\"cors\") const express = require(\"express\") const db = require(\"./configure/dbConfigure.js\") const logger = require(\"./logger\") const bodyParser = require(\"body-parser\") const https = require(\"http\") const { readFileSync } = require(\"fs\") const path = require(\"path\") const userRoutes = require(\"./routes/user\"); const emailRoutes = require(\"./routes/email.js\"); const commentRoutes = require('./routes/comments.js'); const app = express() app.use(bodyParser.json()) app.use(bodyParser.urlencoded({ extended: false, limit: \"2gb\" })) app.use(bodyParser.json({ type: \"application/*+json\" })) const sslserver = https.createServer( { key: readFileSync(path.join(__dirname, \"certs\", \"key.pem\")), cert: readFileSync(path.join(__dirname, \"certs\", \"cert.pem\")), }, app ) global.io = require(\"socket.io\")(sslserver, { cors: { origin: \"*\", methods: [\"GET\", \"POST\"], },",
277
+ "page": 17,
278
+ "section_title": "CHAPTER 3 CONCLUSION & FUTURE SCOPE 3.1 CONCLUSION",
279
+ "medical_terms": [],
280
+ "chunk_type": "general",
281
+ "relevance_score": 0.0
282
+ },
283
+ {
284
+ "id": "50f6dc85d40b_12",
285
+ "text": "}) app.use( cors({ origin: \"*\", }) ) app.use(function (req, res, next) { res.header(\"Access-Control-Allow-Origin\", [\"*\"]) res.header( \"Access-Control-Allow-Headers\", \"Origin,X-Requested-With, Content-Type, Accept\" ) next() }) app.use(express.urlencoded({ extended: true, limit: \"2gb\" })) app.use(express.json()) app.use(\"/users\", userRoutes) app.get(\"/testing\", (req, res) => { res.send(\"Hello Sorcova Healths\").status(200) }) app.get(\"/projection\", (req, res) => { res.send(\"this is the projection page\").status(200) }) app.use(express.static(path.join(__dirname, \"build\"))) //here is important thing - no static directory, because all static :) app.get(\"/*\", function (req, res) { res.sendFile(path.join(__dirname, \"index.html\")) })",
286
+ "page": 18,
287
+ "section_title": "CHAPTER 3 CONCLUSION & FUTURE SCOPE 3.1 CONCLUSION",
288
+ "medical_terms": [],
289
+ "chunk_type": "general",
290
+ "relevance_score": 0.0
291
+ },
292
+ {
293
+ "id": "50f6dc85d40b_13",
294
+ "text": "io.on(\"connection\", (socket) => { socket.on(\"announcementAdded\", () => { console.log(\"New announcement added\") io.emit(\"newAnnouncement\", \"New Announcement!\") }) }) const port = process.env.PORT || 8080 sslserver.listen(port, () => { logger.info(`server started at port : ${port}`) }) app.use('/email',emailRoutes); app.use('/comments', commentRoutes); sslserver.setTimeout(300000) Notification Controller API const AnnouncementsModel = require(\"../models/announcement.model\"); const logger = require(\"../logger\"); exports.socketAnnounce = (req, res) => { const {password} = req.body //Sorcova Health@2023 const hash = \"\" try{ io.emit(\"newAnnouncement\", \"New Announcement!\"); res.status(200).json({status:true, msg:\"announcement emitted\"}) }catch(err){ console.log(err) res.status(500).json({status:false, msg:\"failed to emmit announcement \"}) }",
295
+ "page": 19,
296
+ "section_title": "CHAPTER 3 CONCLUSION & FUTURE SCOPE 3.1 CONCLUSION",
297
+ "medical_terms": [],
298
+ "chunk_type": "general",
299
+ "relevance_score": 0.0
300
+ },
301
+ {
302
+ "id": "50f6dc85d40b_14",
303
+ "text": "} exports.getAllAnnouncements = (req, res) => { const mad_id = req.params.id; AnnouncementsModel.getAllAnnouncements(mad_id,(err, result) => { try { if (err) { logger.error(\"Error: announcements >> error >>\", err); res.json({ response: err, status: { code: \"02\", status: \"failure\", message: \"error in fetching all announcements\", }, }); } else { res.json({ response: result, status: { code: \"00\", status: \"success\", message: \"Fetched all announcements successfully\", }, }); } } catch (err) { logger.error(\"Error: get All announcements >> error >>\", err); } }); }; exports.getAnnouncementById = (req, res) => { const announcement_id = req.params.id AnnouncementsModel.getAnnouncementById(announcement_id, (err, result) => {",
304
+ "page": 20,
305
+ "section_title": "CHAPTER 3 CONCLUSION & FUTURE SCOPE 3.1 CONCLUSION",
306
+ "medical_terms": [],
307
+ "chunk_type": "general",
308
+ "relevance_score": 0.0
309
+ },
310
+ {
311
+ "id": "50f6dc85d40b_15",
312
+ "text": "try { if (err) { logger.error(\"Error: announcements >> error >>\", err); res.json({ response: err, status: { code: \"02\", status: \"failure\", message: \"error in fetching announcement by id\", }, }); } else { res.json({ response: result, status: { code: \"00\", status: \"success\", message: \"Fetched announcement by id successfully\", }, }); } } catch (err) { logger.error(\"Error: get announcement by id >> error >>\", err); } }); }; exports.removeOutdatedAnnouncements = (req, res) => { AnnouncementsModel.removeOutdatedAnnouncements((err, result) => { try { if (err) { logger.error(\"Error: remove outdated announcements >> error >>\", err); res.json({ response: err, status: {",
313
+ "page": 21,
314
+ "section_title": "CHAPTER 3 CONCLUSION & FUTURE SCOPE 3.1 CONCLUSION",
315
+ "medical_terms": [],
316
+ "chunk_type": "general",
317
+ "relevance_score": 0.0
318
+ },
319
+ {
320
+ "id": "50f6dc85d40b_16",
321
+ "text": "code: \"02\", status: \"failure\", message: \"error in deleting expired Documents\", }, }); } else { res.json({ response: result, status: { code: \"00\", status: \"success\", message: \"Deleted outdated announcements successfully\", }, }); } } catch (err) { logger.error(\"Error: outdated announcements >> error >>\", err); } }); }; exports.deleteAnnouncementById = (req, res) => { const announcement_id = req.params.id; AnnouncementsModel.deleteAnnouncementById(announcement_id, (err, result) => { try { if (err) { logger.error(\"Error: remove outdated announcements >> error >>\", err); res.json({ response: err, status: { code: \"02\", status: \"failure\", message: \"error in deleting expired Documents\", }, });",
322
+ "page": 22,
323
+ "section_title": "CHAPTER 3 CONCLUSION & FUTURE SCOPE 3.1 CONCLUSION",
324
+ "medical_terms": [],
325
+ "chunk_type": "general",
326
+ "relevance_score": 0.0
327
+ },
328
+ {
329
+ "id": "50f6dc85d40b_17",
330
+ "text": "} else { res.json({ response: result, status: { code: \"00\", status: \"success\", message: \"Deleted outdated announcements successfully\", }, }); } } catch (err) { logger.error(\"Error: outdated announcements >> error >>\", err); } }); }; exports.getUnseenAnnouncementCount = (req, res) => { const mad_id = req.params.id; AnnouncementsModel.getUnseenAnnouncementCount(mad_id, (err, result) => { try { if (err) { logger.error(\"Error: get unseen announcements count >> error >>\", err); res.json({ response: err, status: { code: \"02\", status: \"failure\", message: \"error getting unseen announcements count\", }, }); } else { res.json({ response: result[0]??null, status: { code: \"00\", status: \"success\",",
331
+ "page": 23,
332
+ "section_title": "CHAPTER 3 CONCLUSION & FUTURE SCOPE 3.1 CONCLUSION",
333
+ "medical_terms": [],
334
+ "chunk_type": "general",
335
+ "relevance_score": 0.0
336
+ },
337
+ {
338
+ "id": "50f6dc85d40b_18",
339
+ "text": "message: \"successfully fetched unseen announcements count\", }, }); } } catch (err) { logger.error(\"Error: get unseen announcements count >> error >>\", err); } }); }; exports.markSeenAnnouncements = (req, res) => { const {announcement_ids, mad_id} = req.body AnnouncementsModel.markSeenAnnouncements(announcement_ids,mad_id, (err, result) => { try { if (err) { logger.error(\"Error: mark seen announcements >> error >>\", err); res.json({ response: err, status: { code: \"02\", status: \"failure\", message: \"error marking seen announcements\", }, }); } else { res.json({ response: result, status: { code: \"00\", status: \"success\", message: \"successfully marked seen announcements\", }, }); } } catch (err) {",
340
+ "page": 24,
341
+ "section_title": "CHAPTER 3 CONCLUSION & FUTURE SCOPE 3.1 CONCLUSION",
342
+ "medical_terms": [],
343
+ "chunk_type": "general",
344
+ "relevance_score": 0.0
345
+ },
346
+ {
347
+ "id": "50f6dc85d40b_19",
348
+ "text": "logger.error(\"Error: mark seen announcements >> error >>\", err); } }); }; DB SQL Model Schema const db = require(\"../configure/dbConfigure\"); const logger = require(\"../logger\"); const Announcements = function (announcement) { this.announcement_id = announcement.announcement_id; this.announcement_type = announcement.announcement_type; this.channel = announcement.channel; this.created_date = announcement.created_date; this.duration = announcement.duration; this.priority = announcement.priority; this.content = announcement.content; }; //old // Announcements.getAllAnnouncements = (mad_id, result) => { // var sql = `SELECT A.*,CASE WHEN Av.mad_id IS NOT NULL THEN 'seen' ELSE 'unseen' END AS seen_status // FROM announcements as A Left Join announcements_views as Av // ON A.announcement_id = Av.announcement_id And Av.mad_id = ${mad_id} // WHERE A.announcement_id NOT IN ( // SELECT announcement_id // FROM announcements // WHERE created_date < NOW() - INTERVAL 30 DAY) // ORDER BY A.created_date DESC;`;",
349
+ "page": 25,
350
+ "section_title": "CHAPTER 3 CONCLUSION & FUTURE SCOPE 3.1 CONCLUSION",
351
+ "medical_terms": [],
352
+ "chunk_type": "general",
353
+ "relevance_score": 0.0
354
+ },
355
+ {
356
+ "id": "50f6dc85d40b_20",
357
+ "text": "// db.query(sql, (err, res) => { // if (err) { // logger.error(err); // result(err, null); // } else { // result(null, res); // } // }); // }; //new - Test pending Announcements.getAllAnnouncements = (mad_id, result) => { var sql = `SELECT A.*, CASE WHEN Av.mad_id IS NOT NULL THEN 'seen' ELSE 'unseen' END AS seen_status FROM announcements AS A LEFT JOIN ( SELECT announcement_id FROM announcements_views WHERE mad_id = ${mad_id} ) AS Av ON A.announcement_id = Av.announcement_id WHERE A.created_date >= NOW() - INTERVAL 30 DAY OR Av.announcement_id IS NOT NULL ORDER BY A.created_date DESC;`; db.query(sql, (err, res) => { if (err) { logger.error(err); result(err, null); } else { result(null, res); } }); }; Announcements.getAnnouncementById = (announcement_id, result) => {",
358
+ "page": 26,
359
+ "section_title": "CHAPTER 3 CONCLUSION & FUTURE SCOPE 3.1 CONCLUSION",
360
+ "medical_terms": [],
361
+ "chunk_type": "general",
362
+ "relevance_score": 0.0
363
+ },
364
+ {
365
+ "id": "50f6dc85d40b_21",
366
+ "text": "const sql = `Select * from announcements where announcement_id = ${announcement_id}`; db.query(sql, (err, res) => { if (err) { logger.error(err); result(err, null); } else { result(null, res); } }); }; Announcements.removeOutdatedAnnouncements = (result) => { const sql = `DELETE from announcements WHERE created_date < NOW() - INTERVAL duration DAY;`; db.query(sql, (err, res) => { if (err) { logger.error(err); result(err, null); } else { result(null, res); } }); }; Announcements.deleteAnnouncementById = (announcement_id, result) => { const sql = `DELETE from announcements WHERE announcement_id = ${announcement_id}`; db.query(sql, (err, res) => { if (err) { logger.error(err); result(err, null); } else { result(null, res); } });",
367
+ "page": 27,
368
+ "section_title": "CHAPTER 3 CONCLUSION & FUTURE SCOPE 3.1 CONCLUSION",
369
+ "medical_terms": [],
370
+ "chunk_type": "general",
371
+ "relevance_score": 0.0
372
+ },
373
+ {
374
+ "id": "50f6dc85d40b_22",
375
+ "text": "}; // old // Announcements.getUnseenAnnouncementCount = (mad_id, result) => { // var sql = `SELECT count(*) as unseenCount // FROM announcements as A // WHERE A.announcement_id NOT IN ( // SELECT announcement_id // FROM announcements // WHERE created_date < NOW() - INTERVAL 30 DAY) // AND Not Exists (select * from announcements_views where announcement_id = A.announcement_id and mad_id = ${mad_id} ) // ORDER BY created_date DESC;`; // db.query(sql, (err, res) => { // if (err) { // logger.error(err); // result(err, null); // } else { // result(null, res); // } // }); // }; //new Announcements.getUnseenAnnouncementCount = (mad_id, result) => { var sql = `SELECT COUNT(*) AS unseenCount FROM announcements AS A LEFT JOIN ( SELECT announcement_id FROM announcements_views WHERE mad_id = ${mad_id} ) AS Av ON A.announcement_id = Av.announcement_id WHERE A.created_date >= NOW() - INTERVAL 30 DAY AND Av.announcement_id IS NULL;`;",
376
+ "page": 28,
377
+ "section_title": "CHAPTER 3 CONCLUSION & FUTURE SCOPE 3.1 CONCLUSION",
378
+ "medical_terms": [],
379
+ "chunk_type": "general",
380
+ "relevance_score": 0.0
381
+ },
382
+ {
383
+ "id": "50f6dc85d40b_23",
384
+ "text": "db.query(sql, (err, res) => { if (err) { logger.error(err); result(err, null); } else { result(null, res); } }); }; Announcements.markSeenAnnouncements = (announcement_ids,mad_id, result) => { var values = announcement_ids.map(announcement_id => `(${announcement_id}, ${mad_id})`).join(', '); var sql = `INSERT INTO announcements_views (announcement_id, mad_id) VALUES ${values}`; db.query(sql, (err, res) => { if (err) { logger.error(err); result(err, null); } else { result(null, res); } }); }; module.exports = Announcements;",
385
+ "page": 29,
386
+ "section_title": "CHAPTER 3 CONCLUSION & FUTURE SCOPE 3.1 CONCLUSION",
387
+ "medical_terms": [],
388
+ "chunk_type": "general",
389
+ "relevance_score": 0.0
390
+ }
391
+ ]
392
+ }
393
+ }
rag_core.py ADDED
@@ -0,0 +1,685 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ QUADRANT RAG Core Module
4
+ Clean RAG implementation without Flask dependencies
5
+ Optimized for both Streamlit and Flask integration
6
+ """
7
+
8
+ import os
9
+ import json
10
+ import uuid
11
+ import re
12
+ import time
13
+ from typing import List, Dict, Any, Optional
14
+ from pathlib import Path
15
+ from datetime import datetime, timezone
16
+
17
+ from dotenv import load_dotenv
18
+ from qdrant_client import QdrantClient
19
+ from qdrant_client.models import Distance, VectorParams, PointStruct, Filter, FieldCondition, MatchValue, PayloadSchemaType
20
+ import openai
21
+
22
+ # Load environment variables
23
+ load_dotenv()
24
+
25
+ class DynamicRAG:
26
+ """
27
+ Dynamic RAG System with Qdrant Vector Database and a configurable OpenAI chat model (default: gpt-5-mini)
28
+ Real semantic search with proper LLM responses
29
+ """
30
+
31
+ def __init__(self):
32
+ # Environment variables
33
+ self.openai_api_key = os.environ.get('OPENAI_API_KEY', 'your-openai-api-key-here')
34
+ self.use_memory_db = os.environ.get('USE_MEMORY_DB', 'false').lower() == 'true'
35
+ self.qdrant_url = os.environ.get('QDRANT_URL')
36
+ self.qdrant_api_key = os.environ.get('QDRANT_API_KEY')
37
+ self.collection_name = os.environ.get('QDRANT_COLLECTION_NAME', 'documents')
38
+
39
+ # Initialize clients
40
+ self._init_openai()
41
+ self._init_qdrant()
42
+ self._init_embedding_model()
43
+
44
+ # Ensure collection exists
45
+ self.ensure_collection()
46
+
47
+ def _init_openai(self):
48
+ """Initialize OpenAI client"""
49
+ try:
50
+ if not self.openai_api_key or self.openai_api_key == 'your-openai-api-key-here':
51
+ print("❌ OpenAI API key not provided. Please set OPENAI_API_KEY environment variable.")
52
+ self.openai_client = None
53
+ return
54
+
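+ # The module-level key assignment keeps any legacy openai.* call sites working;
+ # the client object created below is what this module actually uses.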
55
+ openai.api_key = self.openai_api_key
56
+ self.openai_client = openai.OpenAI(api_key=self.openai_api_key)
57
+ print("✅ OpenAI client initialized")
58
+ except Exception as e:
59
+ print(f"⚠️ OpenAI initialization error: {e}")
60
+ self.openai_client = None
61
+
62
+ def _init_qdrant(self):
63
+ """Initialize Qdrant client with cloud priority"""
64
+ try:
65
+ # Configure client timeouts and transport
66
+ qdrant_timeout = float(os.environ.get('QDRANT_TIMEOUT', '60'))
67
+ prefer_grpc = os.environ.get('QDRANT_PREFER_GRPC', 'true').lower() == 'true'
68
+ if self.qdrant_url and self.qdrant_api_key:
69
+ print(f"🌐 Using Qdrant Cloud: {self.qdrant_url}")
70
+ self.qdrant_client = QdrantClient(
71
+ url=self.qdrant_url,
72
+ api_key=self.qdrant_api_key,
73
+ timeout=qdrant_timeout,
74
+ prefer_grpc=prefer_grpc,
75
+ )
76
+ elif self.use_memory_db:
77
+ print("💾 Using in-memory Qdrant (development only)")
78
+ self.qdrant_client = QdrantClient(":memory:", timeout=qdrant_timeout)
79
+ else:
80
+ # Fallback to local file storage
81
+ storage_path = os.environ.get('QDRANT_STORAGE_PATH', './qdrant_storage')
82
+ print(f"🗄️ Using file-based Qdrant storage: {storage_path}")
83
+ self.qdrant_client = QdrantClient(path=storage_path, timeout=qdrant_timeout)
84
+
85
+ print(f"✅ Qdrant client initialized (timeout={qdrant_timeout}s, gRPC preferred={prefer_grpc})")
86
+ except Exception as e:
87
+ print(f"❌ Qdrant initialization error: {e}")
88
+ raise
89
+
90
+ def _init_embedding_model(self):
91
+ """Initialize OpenAI embedding model settings"""
92
+ try:
93
+ print("🔄 Configuring OpenAI embeddings...")
94
+ self.embedding_model_name = 'text-embedding-3-small'
95
+ self.embedding_size = 1536 # OpenAI text-embedding-3-small dimension
96
+ # Chat model can be overridden via env; default per user request
97
+ self.chat_model_name = os.environ.get('OPENAI_COMPLETIONS_MODEL', 'gpt-5-mini')
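+ # Assumption: the 'gpt-5-mini' default is available to the API key in use;
+ # set OPENAI_COMPLETIONS_MODEL to swap in another chat model.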
98
+ print("✅ OpenAI embeddings configured")
99
+ except Exception as e:
100
+ print(f"❌ Embedding configuration error: {e}")
101
+ raise
102
+
103
+ def ensure_collection(self):
104
+ """Create Qdrant collection if it doesn't exist"""
105
+ try:
106
+ collections = self.qdrant_client.get_collections().collections
107
+ collection_names = [c.name for c in collections]
108
+
109
+ if self.collection_name not in collection_names:
110
+ print(f"🔄 Creating Qdrant collection: {self.collection_name}")
111
+ self.qdrant_client.create_collection(
112
+ collection_name=self.collection_name,
113
+ vectors_config=VectorParams(size=self.embedding_size, distance=Distance.COSINE)
114
+ )
115
+ print("✅ Collection created")
116
+ else:
117
+ print(f"✅ Collection {self.collection_name} already exists")
118
+
119
+ # Create payload index for doc_id to enable filtering
120
+ try:
121
+ self.qdrant_client.create_payload_index(
122
+ collection_name=self.collection_name,
123
+ field_name="doc_id",
124
+ field_schema=PayloadSchemaType.KEYWORD
125
+ )
126
+ print("✅ Created index for doc_id field")
127
+ except Exception as e:
128
+ # Index might already exist, which is fine
129
+ if "already exists" not in str(e):
130
+ print(f"⚠️ Note: Could not create index for doc_id: {e}")
131
+
132
+ except Exception as e:
133
+ print(f"⚠️ Error with collection: {e}")
134
+
135
+ def create_embeddings(self, texts: List[str]) -> List[List[float]]:
136
+ """Create embeddings for texts using OpenAI API with batch processing"""
137
+ # Handle empty texts
138
+ texts = [text if text.strip() else "empty" for text in texts]
139
+
140
+ # Process in batches to avoid timeout
141
+ batch_size = 20 # OpenAI recommends smaller batches
142
+ all_embeddings = []
143
+
144
+ for i in range(0, len(texts), batch_size):
145
+ batch = texts[i:i + batch_size]
146
+ retries = 3
147
+
148
+ while retries > 0:
149
+ try:
150
+ # Create embeddings for this batch
151
+ response = self.openai_client.embeddings.create(
152
+ model=self.embedding_model_name,
153
+ input=batch
154
+ )
155
+
156
+ # Extract embedding vectors
157
+ batch_embeddings = [data.embedding for data in response.data]
158
+ all_embeddings.extend(batch_embeddings)
159
+
160
+ # Show progress
161
+ progress = min(i + batch_size, len(texts))
162
+ print(f" ✅ Processed {progress}/{len(texts)} texts")
163
+ break
164
+
165
+ except Exception as e:
166
+ retries -= 1
167
+ if retries > 0:
168
+ print(f" ⚠️ Retry {3-retries}/3 for batch {i//batch_size + 1}: {str(e)}")
169
+ time.sleep(2) # Wait before retry
170
+ else:
171
+ print(f" ❌ Failed batch {i//batch_size + 1}: {str(e)}")
172
+ # Return zero vectors for failed batch
173
+ all_embeddings.extend([[0.0] * self.embedding_size for _ in batch])
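+ # Zero vectors keep chunk positions aligned, but under cosine similarity they
+ # should never rank highly, so chunks from failed batches are effectively unsearchable.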
174
+
175
+ return all_embeddings
176
+
177
+ def store_document(self, doc_id: str, chunks: List[Dict[str, Any]]):
178
+ """Store document chunks in Qdrant with embeddings"""
179
+ print(f"🔄 Processing {len(chunks)} chunks...")
180
+
181
+ # Check if chunks already exist for this document
182
+ try:
183
+ existing = self.qdrant_client.scroll(
184
+ collection_name=self.collection_name,
185
+ scroll_filter=Filter(
186
+ must=[FieldCondition(key="doc_id", match=MatchValue(value=doc_id))]
187
+ ),
188
+ limit=1
189
+ )
190
+ if existing[0]:
191
+ print(f"⚠️ Document {doc_id} already exists. Skipping...")
192
+ return
193
+ except Exception:
194
+ pass # Collection might be empty
195
+
196
+ print(f"🧠 Creating embeddings for {len(chunks)} chunks...")
197
+ texts = [chunk['text'] for chunk in chunks]
198
+ embeddings = self.create_embeddings(texts)
199
+
200
+ print(f"📦 Preparing vectors for storage...")
201
+ points = []
202
+ upload_time = datetime.now(timezone.utc).isoformat()
203
+
204
+ for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
205
+ # Generate a proper UUID for each point
206
+ point_id = str(uuid.uuid4())
207
+ point = PointStruct(
208
+ id=point_id,
209
+ vector=embedding,
210
+ payload={
211
+ "doc_id": doc_id,
212
+ "chunk_id": i,
213
+ "text": chunk['text'],
214
+ "page": chunk['page'],
215
+ "section": chunk.get('section', 'Unknown'),
216
+ "upload_time": upload_time
217
+ }
218
+ )
219
+ points.append(point)
220
+
221
+ # Store in batches with retry and adaptive downsizing on timeout
222
+ default_batch_size = int(os.environ.get('QDRANT_UPSERT_BATCH', '32'))
223
+ i = 0
224
+ batch_index = 0
225
+ while i < len(points):
226
+ batch_size = min(default_batch_size, len(points) - i)
227
+ batch = points[i:i + batch_size]
228
+ attempts = 0
229
+ while attempts < 3:
230
+ try:
231
+ self.qdrant_client.upsert(
232
+ collection_name=self.collection_name,
233
+ points=batch,
234
+ )
235
+ batch_index += 1
236
+ print(f"📦 Stored batch {batch_index}/{(len(points)+default_batch_size-1)//default_batch_size} ({len(batch)} points)")
237
+ i += batch_size
238
+ break
239
+ except Exception as e:
240
+ attempts += 1
241
+ if 'Timeout' in str(e) or 'timed out' in str(e):
242
+ # Halve the batch size and retry
243
+ new_size = max(5, batch_size // 2)
244
+ print(f"⚠️ Upsert timeout. Reducing batch from {batch_size} to {new_size} and retrying ({attempts}/3)...")
245
+ batch_size = new_size
246
+ batch = points[i:i + batch_size]
247
+ time.sleep(1.0)
248
+ continue
249
+ else:
250
+ print(f"❌ Upsert error on batch starting at {i}: {e}")
251
+ raise
252
+
253
+ print(f"✅ Stored {len(chunks)} chunks in Qdrant")
254
+
255
+ def get_all_documents(self) -> List[Dict[str, Any]]:
256
+ """Retrieve all unique documents from Qdrant with metadata"""
257
+ try:
258
+ print("🔄 Fetching all documents from Qdrant...")
259
+
260
+ # Use scroll to get all points
261
+ all_points = []
262
+ offset = None
263
+ limit = 100
264
+
265
+ while True:
266
+ records, next_offset = self.qdrant_client.scroll(
267
+ collection_name=self.collection_name,
268
+ limit=limit,
269
+ offset=offset,
270
+ with_payload=True,
271
+ with_vectors=False
272
+ )
273
+
274
+ all_points.extend(records)
275
+
276
+ if next_offset is None:
277
+ break
278
+
279
+ offset = next_offset
280
+
281
+ # Group by doc_id to get unique documents
282
+ documents = {}
283
+ for point in all_points:
284
+ doc_id = point.payload.get('doc_id')
285
+ if doc_id and doc_id not in documents:
286
+ # Initialize document info
287
+ documents[doc_id] = {
288
+ 'doc_id': doc_id,
289
+ 'title': doc_id.replace('_', ' ').replace('.pdf', ''),
290
+ 'chunks': 0,
291
+ 'pages': set(),
292
+ 'upload_time': point.payload.get('upload_time', 'Unknown')
293
+ }
294
+
295
+ if doc_id:
296
+ # Update chunk count and pages
297
+ documents[doc_id]['chunks'] += 1
298
+ page = point.payload.get('page', 0)
299
+ if page:
300
+ documents[doc_id]['pages'].add(page)
301
+
302
+ # Convert to list and finalize
303
+ result = []
304
+ for doc_id, doc_info in documents.items():
305
+ doc_info['pages'] = len(doc_info['pages']) # Convert set to count
306
+ result.append(doc_info)
307
+
308
+ # Sort by upload time (newest first)
309
+ result.sort(key=lambda x: x.get('upload_time', ''), reverse=True)
310
+
311
+ print(f"✅ Found {len(result)} documents in Qdrant")
312
+ return result
313
+
314
+ except Exception as e:
315
+ print(f"❌ Error retrieving documents: {e}")
316
+ return []
317
+
318
+ def delete_document(self, doc_id: str) -> bool:
319
+ """Delete all chunks for a specific document"""
320
+ try:
321
+ print(f"🗑️ Deleting document {doc_id}...")
322
+
323
+ self.qdrant_client.delete(
324
+ collection_name=self.collection_name,
325
+ points_selector=Filter(
326
+ must=[FieldCondition(key="doc_id", match=MatchValue(value=doc_id))]
327
+ )
328
+ )
329
+
330
+ print(f"✅ Deleted document {doc_id}")
331
+ return True
332
+
333
+ except Exception as e:
334
+ print(f"❌ Error deleting document: {e}")
335
+ return False
336
+
337
+ def search(self, query: str, doc_id: str, top_k: int = 10) -> List[Dict[str, Any]]:
338
+ """Search for relevant chunks using vector similarity with improved retrieval"""
339
+ print(f"🔍 Searching for: '{query}'")
340
+
341
+ # Expand query for better medical term matching
342
+ expanded_query = self.expand_query(query)
343
+ print(f"🔍 Expanded query: '{expanded_query}'")
344
+
345
+ # Primary search with expanded query
346
+ results = self._perform_search(expanded_query, doc_id, top_k)
347
+
348
+ # If no good results, try fallback searches
349
+ if not results or len([r for r in results if r['score'] > 0.15]) == 0:
350
+ print("🔍 Trying fallback search with key terms...")
351
+ # Extract key medical terms for fallback search
352
+ key_terms = self._extract_key_terms(query)
353
+ for term in key_terms:
354
+ fallback_results = self._perform_search(term, doc_id, top_k//2)
355
+ results.extend(fallback_results)
356
+
357
+ # Remove duplicates and sort by score
358
+ seen_chunks = set()
359
+ unique_results = []
360
+ for result in results:
361
+ chunk_key = f"{result['chunk_id']}_{result['page']}"
362
+ if chunk_key not in seen_chunks:
363
+ seen_chunks.add(chunk_key)
364
+ unique_results.append(result)
365
+
366
+ # Sort by score descending
367
+ unique_results.sort(key=lambda x: x['score'], reverse=True)
368
+
369
+ # Filter results with minimum relevance score - very lenient threshold
370
+ filtered_results = [r for r in unique_results if r['score'] > 0.10]
371
+ print(f"📊 Found {len(filtered_results)} relevant chunks (score > 0.10)")
372
+
373
+ # If still no results, return top 5 results anyway for fallback
374
+ if not filtered_results and unique_results:
375
+ filtered_results = unique_results[:5]
376
+ print(f"📊 No high-relevance chunks found, using top {len(filtered_results)} results as fallback")
377
+
378
+ return filtered_results[:top_k]
379
+
380
+ def _perform_search(self, query: str, doc_id: str, limit: int) -> List[Dict[str, Any]]:
381
+ """Perform a single search operation"""
382
+ query_embedding = self.create_embeddings([query])[0]
383
+
384
+ # If doc_id is 'any' or we want to search all documents, don't filter
385
+ if doc_id == 'any':
386
+ search_results = self.qdrant_client.query_points(
387
+ collection_name=self.collection_name,
388
+ query=query_embedding,
389
+ limit=limit,
390
+ with_payload=True
391
+ )
392
+ else:
393
+ # Filter strictly by the provided doc_id; fallback to no filter on error
394
+ try:
395
+ search_results = self.qdrant_client.query_points(
396
+ collection_name=self.collection_name,
397
+ query=query_embedding,
398
+ query_filter=Filter(
399
+ must=[FieldCondition(key="doc_id", match=MatchValue(value=doc_id))]
400
+ ),
401
+ limit=limit,
402
+ with_payload=True
403
+ )
404
+ except Exception:
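+ # This fallback drops the doc_id filter, so hits may come from other documents;
+ # callers can inspect the 'doc_id' field on each result to detect this.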
405
+ search_results = self.qdrant_client.query_points(
406
+ collection_name=self.collection_name,
407
+ query=query_embedding,
408
+ limit=limit,
409
+ with_payload=True
410
+ )
411
+
412
+ results = []
413
+ for result in search_results.points:
414
+ results.append({
415
+ "text": result.payload["text"],
416
+ "page": result.payload["page"],
417
+ "section": result.payload["section"],
418
+ "score": float(result.score),
419
+ "chunk_id": result.payload["chunk_id"],
420
+ "doc_id": result.payload.get("doc_id", "unknown")
421
+ })
422
+ return results
423
+
424
+ def _extract_key_terms(self, query: str) -> List[str]:
425
+ """Extract key medical terms from query for fallback search"""
426
+ # Extract important terms
427
+ terms = []
428
+
429
+ # Medical abbreviations and key terms
430
+ medical_terms = ["acidosis", "RTA", "anion gap", "metabolic", "urine pH", "differential", "MUDPILES", "GOLDMARK"]
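+ # This whitelist is deliberately small; queries outside these topics simply
+ # yield no fallback terms, leaving the primary vector search results as-is.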
431
+
432
+ query_lower = query.lower()
433
+ for term in medical_terms:
434
+ if term.lower() in query_lower:
435
+ terms.append(term)
436
+
437
+ return terms[:3] # Return top 3 terms
438
+
439
+ def expand_query(self, query: str) -> str:
440
+ """Expand query with synonyms and related terms for better search"""
441
+ # Common medical and general expansions
442
+ expansions = {
443
+ "fuo": "fever unknown origin fever of unknown origin pyrexia unexplained fever",
444
+ "classic": "classical traditional standard typical",
445
+ "nosocomial": "hospital acquired healthcare associated hospital-acquired",
446
+ "neutropenic": "neutropenia immunocompromised low neutrophil count",
447
+ "hiv": "human immunodeficiency virus AIDS HIV-associated",
448
+ "diagnostic": "diagnosis workup evaluation investigation",
449
+ "pet/ct": "PET-CT positron emission tomography computed tomography PET scan",
450
+ "pet": "positron emission tomography PET scan PET-CT",
451
+ "workup": "work up evaluation investigation diagnostic approach",
452
+ "first-line": "initial primary first line baseline",
453
+ "imaging": "radiologic radiology scan imaging studies",
454
+ "labs": "laboratory tests blood work investigations",
455
+ "categories": "types classifications groups subtypes",
456
+ "major": "main primary principal important key"
457
+ }
458
+
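+ # NB: plain substring replacement is order-sensitive (e.g. the "pet" key also
+ # matches text injected by the "pet/ct" expansion); acceptable for recall-oriented expansion.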
459
+ expanded = query.lower()
460
+ for term, expansion in expansions.items():
461
+ if term.lower() in expanded:
462
+ expanded = expanded.replace(term.lower(), f"{term.lower()} {expansion}")
463
+
464
+ return expanded
465
+
466
+ def generate_answer(self, query: str, context_chunks: List[Dict[str, Any]]) -> str:
467
+ """Generate answer using OpenAI GPT-4o-mini with improved context"""
468
+ print(f"🧠 generate_answer called with {len(context_chunks)} chunks")
469
+
470
+ if not self.openai_client:
471
+ print("❌ OpenAI client not initialized")
472
+ return "OpenAI client not initialized. Please check your API key."
473
+
474
+ if not context_chunks:
475
+ print("❌ No context chunks provided")
476
+ return "I couldn't find any relevant information in the document to answer your question."
477
+
478
+ # Use fewer but more relevant chunks with size limit
479
+ relevant_chunks = [chunk for chunk in context_chunks if chunk['score'] > 0.3][:5]
480
+ if not relevant_chunks:
481
+ relevant_chunks = context_chunks[:3] # Fallback to top 3
482
+
483
+ context_parts = []
484
+ total_length = 0
485
+ max_context_length = 8000 # Limit context to 8K characters
486
+
487
+ # Derive source names from doc_id (strip trailing timestamp if present)
488
+ source_names = []
489
+ seen_sources = set()
490
+ for chunk in relevant_chunks:
491
+ doc_id = chunk.get('doc_id', 'unknown')
492
+ base = doc_id.rsplit('_', 1)[0] if '_' in doc_id else doc_id
493
+ if base and base not in seen_sources:
494
+ seen_sources.add(base)
495
+ source_names.append(base)
496
+
497
+ for chunk in relevant_chunks:
498
+ chunk_text = f"[Page {chunk['page']}, Score: {chunk['score']:.3f}] {chunk['text'][:1000]}..."
499
+ if total_length + len(chunk_text) > max_context_length:
500
+ break
501
+ context_parts.append(chunk_text)
502
+ total_length += len(chunk_text)
503
+
504
+ # Prepend sources and page summary to aid citations
505
+ sources_header = "; ".join(source_names) if source_names else "Unknown source"
506
+ page_list = sorted({c['page'] for c in relevant_chunks if 'page' in c})
507
+ pages_summary = ", ".join(str(p) for p in page_list)
508
+ context = (
509
+ f"Sources: {sources_header}\n"
510
+ f"Pages in retrieved context: {pages_summary}\n\n"
511
+ + "\n\n".join(context_parts)
512
+ )
513
+
514
+ print(f"📄 Context length: {len(context)} characters")
515
+ print(f"🔍 First chunk preview: {context_chunks[0]['text'][:100]}...")
516
+
517
+ # New system prompt (per user specification) + user content with context
518
+ system_prompt = (
519
+ "# Role and Objective\n"
520
+ "You are a senior medical tutor specializing in preparing students for Indian medical entrance exams (NEET-PG, INI-CET, FMGE).\n"
521
+ "# Instructions\n"
522
+ "- Always answer strictly based on information from standard textbooks (e.g., Harrison, Robbins, Bailey & Love, DC Dutta, Shaw, Park, Ganong, Guyton).\n"
523
+ "- If there is insufficient data available in these textbooks, respond: “Insufficient evidence from standard textbooks.”\n"
524
+ "- Do not fabricate or introduce non-standard material into your answers.\n"
525
+ "- Begin with a concise checklist (3-5 bullets) outlining the conceptual steps you will use to construct your answer (e.g., identify relevant information, reference textbooks, analyze options, format answer, cite sources).\n"
526
+ "## Output Format\n"
527
+ "- **Explanation:**\n"
528
+ "- Start with why the correct answer fits, using textbook references to support your explanation.\n"
529
+ "- **Why other options are wrong:**\n"
530
+ "- Briefly rule out each incorrect choice with textbook-based reasoning.\n"
531
+ "- **Clinical Pearl:**\n"
532
+ "- Highlight clinical pearls (e.g., “physiologic leucorrhea never causes pruritus,” “most common site of endometriosis = ovary”) as appropriate.\n"
533
+ "- **References:**\n"
534
+ "- Cite the textbook name, edition, and page number (if available). Place this section at the end of the answer, after all explanations and pearls.\n"
535
+ "- Keep explanations exam-friendly, high-yield, and structured (use short paragraphs or bullet points).\n"
536
+ "- If an image is provided, integrate it naturally into the reasoning but do not describe the image explicitly—only use it as a supportive clue.\n"
537
+ "- Keep answers concise but concept-rich, resembling a mini textbook explanation rather than a long essay.\n"
538
+ "## Reasoning Effort & Validation\n"
539
+ "- Set reasoning_effort=medium to ensure thorough but efficient explanations appropriate for exam-level concepts.\n"
540
+ "- After drafting the response, quickly validate whether all parts are completed as per the Output Format; if any part is missing or insufficiently referenced, self-correct before finalizing the answer."
541
+ )
542
+
543
+ user_content = (
544
+ f"Document Context (textbook excerpts):\n{context}\n\n"
545
+ f"Question: {query}\n\n"
546
+ "Use only the provided excerpts. When citing, include textbook name and exact page from the pages listed above."
547
+ )
548
+
549
+ try:
550
+ print("🔄 Making OpenAI API call...")
551
+ params = {
552
+ "model": self.chat_model_name,
553
+ "messages": [
554
+ {"role": "system", "content": system_prompt},
555
+ {"role": "user", "content": user_content},
556
+ ],
557
+ }
558
+ # gpt-5 models expect 'max_completion_tokens'; older models use 'max_tokens'
559
+ if str(self.chat_model_name).startswith("gpt-5"):
560
+ params["max_completion_tokens"] = 1500
561
+ else:
562
+ params["max_tokens"] = 1500
563
+ params["temperature"] = 0.0
564
+
565
+ response = self.openai_client.chat.completions.create(**params)
566
+
567
+ # Try to extract text safely
568
+ text = ""
569
+ try:
570
+ text = (response.choices[0].message.content or "").strip()
571
+ except Exception:
572
+ text = ""
573
+
574
+ # Fallback to Responses API when empty
575
+ if not text:
576
+ try:
577
+ combined_input = system_prompt + "\n\n" + user_content
578
+ resp2 = self.openai_client.responses.create(
579
+ model=self.chat_model_name,
580
+ input=combined_input,
581
+ max_output_tokens=1500 if str(self.chat_model_name).startswith("gpt-5") else None,
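+ # Note: this assumes the SDK treats an explicit None as "omit this argument";
+ # if it serializes a null instead, drop the parameter for non-gpt-5 models.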
582
+ )
583
+ if hasattr(resp2, "output_text") and resp2.output_text:
584
+ text = resp2.output_text.strip()
585
+ elif hasattr(resp2, "choices") and resp2.choices:
586
+ m = getattr(resp2.choices[0], "message", None)
587
+ if m and getattr(m, "content", None):
588
+ text = m.content.strip()
589
+ except Exception as e2:
590
+ print(f"⚠️ Responses API fallback error: {e2}")
591
+
592
+ if not text:
593
+ raise RuntimeError("Empty response content from model")
594
+
595
+ print(f"✅ OpenAI response received: {len(text)} characters")
596
+ print(f"📝 Answer preview: {text[:100]}...")
597
+ return text
598
+
599
+ except Exception as e:
600
+ print(f"❌ OpenAI API error: {e}")
601
+ error_message = f"I found relevant information but couldn't generate a proper response due to an API error: {str(e)}"
602
+ if context_chunks:
603
+ error_message += f"\n\nHere's what I found: {context_chunks[0]['text'][:300]}... [Page {context_chunks[0]['page']}]"
604
+ return error_message
605
+
606
+
607
+ def extract_pdf_pages(pdf_path: str) -> List[str]:
608
+ """Extract text from PDF pages"""
609
+ try:
610
+ import pypdf
611
+ reader = pypdf.PdfReader(pdf_path)
612
+ pages = []
613
+ for page in reader.pages:
614
+ try:
615
+ text = page.extract_text() or ""
616
+ text = text.strip()
617
+ if text:
618
+ pages.append(text)
619
+ except Exception:
620
+ continue
621
+ return pages
622
+ except Exception as e:
623
+ print(f"PDF extraction error: {e}")
624
+ return []
625
+
626
+
627
+ def create_chunks(pages: List[str], chunk_size: int = 3000, overlap: int = 500) -> List[Dict[str, Any]]:
628
+ """Create overlapping chunks from pages with optimized sizing"""
629
+ chunks = []
630
+
631
+ print(f"📄 Processing {len(pages)} pages into chunks...")
632
+
633
+ for page_num, page_text in enumerate(pages, 1):
634
+ if len(page_text) < 100: # Skip very short pages
635
+ continue
636
+
637
+ # For very long pages, split into smaller sections
638
+ if len(page_text) > chunk_size * 2:
639
+ # Split by paragraphs (double newline)
640
+ paragraphs = page_text.split('\n\n')
641
+ current_chunk = ""
642
+
643
+ for para in paragraphs:
644
+ para = para.strip()
645
+ if not para:
646
+ continue
647
+
648
+ # If adding this paragraph exceeds chunk size, save current chunk
649
+ if len(current_chunk) + len(para) > chunk_size and current_chunk:
650
+ chunk_text = current_chunk.strip()
651
+ if len(chunk_text) > 200: # Only save substantial chunks
652
+ chunks.append({
653
+ "text": chunk_text,
654
+ "page": page_num,
655
+ "section": f"Page {page_num}"
656
+ })
657
+
658
+ # Carry the last ~100 words forward for context (the 'overlap' parameter is not applied directly here)
659
+ words = current_chunk.split()
660
+ if len(words) > 100:
661
+ overlap_text = ' '.join(words[-100:])
662
+ current_chunk = overlap_text + "\n\n" + para
663
+ else:
664
+ current_chunk = para
665
+ else:
666
+ current_chunk += "\n\n" + para if current_chunk else para
667
+
668
+ # Add remaining content
669
+ if current_chunk.strip() and len(current_chunk.strip()) > 200:
670
+ chunks.append({
671
+ "text": current_chunk.strip(),
672
+ "page": page_num,
673
+ "section": f"Page {page_num}"
674
+ })
675
+ else:
676
+ # For shorter pages, add the whole page as one chunk
677
+ if len(page_text.strip()) > 200:
678
+ chunks.append({
679
+ "text": page_text.strip(),
680
+ "page": page_num,
681
+ "section": f"Page {page_num}"
682
+ })
683
+
684
+ print(f"✅ Created {len(chunks)} chunks from {len(pages)} pages")
685
+ return chunks
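+
+ # Illustrative usage sketch (not executed by the app; file and doc names are hypothetical):
+ # rag = DynamicRAG()
+ # pages = extract_pdf_pages("uploads/harrison_fuo.pdf")
+ # chunks = create_chunks(pages)
+ # rag.store_document("harrison_fuo", chunks)
+ # hits = rag.search("major categories of classic FUO", doc_id="harrison_fuo")
+ # print(rag.generate_answer("major categories of classic FUO", hits))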
requirements.txt ADDED
@@ -0,0 +1,7 @@
1
+ streamlit>=1.28.0
2
+ pypdf>=4.2.0
3
+ qdrant-client>=1.7.0
4
+ openai>=1.0.0
5
+ python-dotenv>=1.0.0
6
+ PyMuPDF>=1.23.0
7
+ Pillow>=10.0.0
scripts/push_to_hf.sh ADDED
@@ -0,0 +1,65 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+
4
+ # Simple deploy helper to push the current repo to a Hugging Face Space.
5
+ # Usage:
6
+ # HF_USER=athulnambiar HF_SPACE=pyqsprag HF_TOKEN=hf_xxx ./scripts/push_to_hf.sh
7
+ # Optional:
8
+ # BRANCH=main QDRYRUN=1
9
+
10
+ HF_USER=${HF_USER:-}
11
+ HF_SPACE=${HF_SPACE:-}
12
+ HF_TOKEN=${HF_TOKEN:-}
13
+ BRANCH=${BRANCH:-main}
14
+
15
+ if [[ -z "${HF_USER}" || -z "${HF_SPACE}" || -z "${HF_TOKEN}" ]]; then
16
+ echo "ERROR: Please set HF_USER, HF_SPACE, and HF_TOKEN environment variables." >&2
17
+ echo "Example: HF_USER=athulnambiar HF_SPACE=pyqsprag HF_TOKEN=hf_*** ./scripts/push_to_hf.sh" >&2
18
+ exit 1
19
+ fi
20
+
21
+ repo_url="https://${HF_USER}:${HF_TOKEN}@huggingface.co/spaces/${HF_USER}/${HF_SPACE}"
22
+
23
+ workdir=$(mktemp -d)
24
+ echo "Cloning Space into: ${workdir}" >&2
25
+ git clone "${repo_url}" "${workdir}/space"
26
+
27
+ # Ensure branch exists and is checked out
28
+ pushd "${workdir}/space" >/dev/null
29
+ git checkout -B "${BRANCH}"
30
+ popd >/dev/null
31
+
32
+ # Sync files from current repo into the Space, excluding local/dev artifacts
33
+ rsync -av --delete \
34
+ --exclude ".git" \
35
+ --exclude ".venv" \
36
+ --exclude "__pycache__/" \
37
+ --exclude "qdrant_storage/" \
38
+ --exclude "uploads/" \
39
+ --exclude ".env" \
40
+ --exclude ".claude/" \
41
+ --exclude ".streamlit/secrets.toml" \
42
+ ./ "${workdir}/space/"
43
+
44
+ pushd "${workdir}/space" >/dev/null
45
+
46
+ # Make sure Streamlit apps deploy on Spaces
47
+ if [[ ! -f requirements.txt ]]; then
48
+ echo "streamlit>=1.28.0" > requirements.txt
49
+ fi
50
+
51
+ # Commit and push
52
+ git add -A
53
+ if git diff --cached --quiet; then
54
+ echo "No changes to push."
55
+ else
56
+ git commit -m "Deploy from QUADRANT_RAG"
57
+ if [[ "${QDRYRUN:-}" == "1" ]]; then
58
+ echo "Dry run enabled; skipping push." >&2
59
+ else
60
+ git push -u origin "${BRANCH}"
61
+ fi
62
+ fi
63
+
64
+ echo "Deploy complete. Space: https://huggingface.co/spaces/${HF_USER}/${HF_SPACE}" >&2
65
+
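+ # Example dry run (hypothetical token value):
+ # HF_USER=athulnambiar HF_SPACE=pyqsprag HF_TOKEN=hf_xxx QDRYRUN=1 ./scripts/push_to_hf.sh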