imurra committed
Commit 1d7f6cb · verified · 1 parent: f33d22c

Made changes to give complete exemplars; removed AI answer-explanation creation, as that is done in the GPT bot after vector-database retrieval.
app.py: Remove the Claude/OpenAI client code; just return structured exemplar data (I'll code this separately; see the sketch below)
knowledge.json: Add explanation_generation section (above)
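Since app.py no longer calls an LLM itself, the per-choice explanation step now happens in the GPT bot after retrieval. Below is a minimal sketch of that downstream step, assuming the structured exemplar shape this commit introduces; the model name, prompt wording, and helper name are illustrative, not part of this repo.

# Hypothetical downstream step (lives in the GPT bot, not in app.py).
# Assumes one exemplar dict as returned by /search_medqa in the diff below.
from openai import OpenAI  # pip install openai

client = OpenAI()  # reads OPENAI_API_KEY from the environment

def explain_choices(exemplar: dict) -> str:
    """Generate per-choice explanations for one structured exemplar."""
    choices_text = "\n".join(f"{k}. {v}" for k, v in exemplar["choices"].items())
    prompt = (
        "For this USMLE-style question, explain in 1-2 sentences why each "
        "answer choice is correct or incorrect.\n\n"
        f"QUESTION:\n{exemplar['question']}\n\n"
        f"ANSWER CHOICES:\n{choices_text}\n\n"
        f"CORRECT ANSWER: {exemplar['correct_answer']}"
    )
    completion = client.chat.completions.create(
        model="gpt-4o",  # illustrative model choice
        messages=[{"role": "user", "content": prompt}],
    )
    return completion.choices[0].message.content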

Files changed (1)
app.py (+150 -189)
app.py CHANGED
@@ -8,8 +8,6 @@ import gradio as gr
 from fastapi import FastAPI
 from pydantic import BaseModel
 import re
-import anthropic  # You'll need: pip install anthropic
-# OR if using OpenAI: import openai
 
 # Extract and load database
 DB_PATH = "./medqa_db"
@@ -28,17 +26,15 @@ print("🧠 Loading MedCPT model...")
 model = SentenceTransformer('ncbi/MedCPT-Query-Encoder')
 print("✅ Model ready")
 
-# Initialize AI client (choose one)
-# Option 1: Claude
-claude_client = anthropic.Anthropic(api_key=os.environ.get("ANTHROPIC_API_KEY"))
-
-# Option 2: OpenAI (uncomment if using)
-# openai.api_key = os.environ.get("OPENAI_API_KEY")
-
 # ============================================================================
-# Deduplication function (same as before)
+# Deduplication function
 # ============================================================================
 def deduplicate_results(results, target_count):
+    """
+    Remove duplicate questions based on:
+    1. High text similarity (>0.92) - catches near-exact duplicates
+    2. Same answer + moderate similarity (>0.85) - catches conceptual duplicates
+    """
     if not results['documents'][0]:
         return results
 
@@ -78,7 +74,7 @@ def deduplicate_results(results, target_count):
     }
 
 # ============================================================================
-# Search function (same as before)
+# Search function with deduplication
 # ============================================================================
 def search(query, num_results=3, source_filter=None):
     emb = model.encode(query).tolist()
@@ -98,10 +94,10 @@ def search(query, num_results=3, source_filter=None):
     return deduplicate_results(results, num_results)
 
 # ============================================================================
-# NEW: Parser to extract question structure
+# Parser to extract question structure
 # ============================================================================
 def parse_question_document(doc_text, metadata):
-    """Extract question and choices from document text."""
+    """Extract question and choices from document text - NO TRUNCATION."""
 
     lines = doc_text.split('\n')
     question_lines = []
@@ -113,6 +109,7 @@ def parse_question_document(doc_text, metadata):
         if not line:
             continue
 
+        # Check if this is an option line (A., B., C., etc.)
         option_match = re.match(r'^([A-E])[\.\)]\s*(.+)$', line)
 
         if option_match:
@@ -123,160 +120,149 @@
         elif not options_started:
             question_lines.append(line)
 
+    # Reconstruct FULL question text - no truncation
     question_text = ' '.join(question_lines).strip()
 
     answer_idx = metadata.get('answer_idx', 'N/A')
+    answer_text = metadata.get('answer', 'N/A')
+
+    # If answer_text is just the letter, map it to the actual option text
+    if answer_text in options:
+        answer_text = options[answer_text]
 
     return {
        'question': question_text,
        'choices': options,
-       'correct_answer': answer_idx
+       'correct_answer_letter': answer_idx,
+       'correct_answer_text': answer_text
     }
 
 # ============================================================================
-# NEW: AI generation functions
+# Enhanced Gradio UI
 # ============================================================================
-def generate_choice_explanations(question, choices, correct_answer):
-    """Generate explanations for why each choice is correct/wrong."""
-
-    choices_text = '\n'.join([f"{k}. {v}" for k, v in choices.items()])
-
-    prompt = f"""You are a medical educator. For this USMLE-style question, explain why EACH answer choice is correct or incorrect.
-
-QUESTION:
-{question}
-
-ANSWER CHOICES:
-{choices_text}
-
-CORRECT ANSWER: {correct_answer}
-
-Provide a 1-2 sentence explanation for EACH choice (A through E) explaining why it is correct or incorrect. Format as:
-
-A. [Choice text] - [Explanation]
-B. [Choice text] - [Explanation]
-C. [Choice text] - [Explanation]
-D. [Choice text] - [Explanation]
-E. [Choice text] - [Explanation]"""
-
-    # Using Claude
-    message = claude_client.messages.create(
-        model="claude-sonnet-4-20250514",
-        max_tokens=1000,
-        messages=[{"role": "user", "content": prompt}]
-    )
-
-    return message.content[0].text
-
-    # OR using OpenAI (uncomment if using):
-    # response = openai.ChatCompletion.create(
-    #     model="gpt-4",
-    #     messages=[{"role": "user", "content": prompt}],
-    #     max_tokens=1000
-    # )
-    # return response.choices[0].message.content
-
-def generate_similar_question(original_question, choices, correct_answer):
-    """Generate a new question based on the exemplar."""
-
-    choices_text = '\n'.join([f"{k}. {v}" for k, v in choices.items()])
-
-    prompt = f"""You are a medical educator. Based on this USMLE-style question, create a NEW similar question that tests the SAME medical concept but with a different clinical scenario.
-
-ORIGINAL QUESTION:
-{question}
-
-ANSWER CHOICES:
-{choices_text}
-
-CORRECT ANSWER: {correct_answer}
-
-Create a NEW question that:
-1. Tests the same medical concept
-2. Uses a different patient scenario
-3. Has 5 answer choices (A-E)
-4. Includes explanations for why each choice is correct/incorrect
-
-Format your response EXACTLY as:
-
-NEW QUESTION:
-[Your new question text]
-
-ANSWER CHOICES:
-A. [Choice A]
-B. [Choice B]
-C. [Choice C]
-D. [Choice D]
-E. [Choice E]
-
-CORRECT ANSWER: [Letter]
-
-EXPLANATIONS:
-A. [Choice A text] - [Explanation]
-B. [Choice B text] - [Explanation]
-C. [Choice C text] - [Explanation]
-D. [Choice D text] - [Explanation]
-E. [Choice E text] - [Explanation]"""
-
-    # Using Claude
-    message = claude_client.messages.create(
-        model="claude-sonnet-4-20250514",
-        max_tokens=2000,
-        messages=[{"role": "user", "content": prompt}]
-    )
-
-    return message.content[0].text
-
-    # OR using OpenAI:
-    # response = openai.ChatCompletion.create(
-    #     model="gpt-4",
-    #     messages=[{"role": "user", "content": prompt}],
-    #     max_tokens=2000
-    # )
-    # return response.choices[0].message.content
-
-# ============================================================================
-# NEW: Format complete output
-# ============================================================================
-def format_complete_output(exemplar_num, parsed, original_explanation, choice_explanations, new_question_text):
-    """Format everything into readable plain text."""
-
-    choices_text = '\n'.join([f"{k}. {v}" for k, v in parsed['choices'].items()])
-
-    output = f"""{'='*80}
-EXEMPLAR {exemplar_num}
-{'='*80}
-
-ORIGINAL QUESTION:
-{parsed['question']}
-
-ANSWER CHOICES:
-{choices_text}
-
-CORRECT ANSWER: {parsed['correct_answer']}
-
-EXPLANATION FOR EACH CHOICE:
-{choice_explanations}
-"""
-
-    if original_explanation:
-        output += f"\nORIGINAL EXPLANATION FROM DATABASE:\n{original_explanation}\n"
-
-    output += f"""
-{'-'*80}
-AI-GENERATED SIMILAR QUESTION:
-{'-'*80}
-
-{new_question_text}
-
-{'='*80}
-
-"""
-
-    return output
+def ui_search(query, num_results=3, source_filter="all"):
+    if not query.strip():
+        return "💡 Enter a medical query to search"
+
+    try:
+        r = search(query, num_results, source_filter if source_filter != "all" else None)
+
+        if not r['documents'][0]:
+            return "❌ No results found"
+
+        out = f"🔍 Found {len(r['documents'][0])} unique results\n\n"
+
+        for i in range(len(r['documents'][0])):
+            source = r['metadatas'][0][i].get('source', 'unknown')
+            distance = r['distances'][0][i]
+            similarity = 1 - distance
+
+            # Source emoji
+            if source == 'medgemini':
+                source_icon = "🔬"
+                source_name = "Med-Gemini"
+            elif source.startswith('medqa_'):
+                source_icon = "📚"
+                split = source.replace('medqa_', '').upper()
+                source_name = f"MedQA {split}"
+            else:
+                source_icon = "📄"
+                source_name = source.upper()
+
+            out += f"\n{'='*70}\n"
+            out += f"{source_icon} Result {i+1} | {source_name} | Similarity: {similarity:.3f}\n"
+            out += f"{'='*70}\n\n"
+            out += r['documents'][0][i]
+
+            answer = r['metadatas'][0][i].get('answer', 'N/A')
+            out += f"\n\n✅ CORRECT ANSWER: {answer}\n"
+
+            explanation = r['metadatas'][0][i].get('explanation', '')
+            if explanation and explanation.strip():
+                out += f"\n💡 EXPLANATION:\n{explanation}\n"
+
+            out += "\n"
+
+        return out
+
+    except Exception as e:
+        return f"❌ Error: {e}"
+
+# Create Gradio interface
+with gr.Blocks(theme=gr.themes.Soft(), title="MedQA Search") as demo:
+    gr.Markdown("""
+    # 🏥 MedQA Semantic Search
+
+    Search across **Med-Gemini** (expert explanations) and **MedQA** (USMLE questions) databases.
+    Uses medical-specific embeddings (MedCPT) for accurate retrieval.
+
+    **Features**: Automatic deduplication, structured output for AI integration
+    """)
+
+    with gr.Row():
+        with gr.Column(scale=3):
+            query_input = gr.Textbox(
+                label="Medical Query",
+                placeholder="e.g., hyponatremia, myocardial infarction, diabetes management...",
+                lines=2
+            )
+        with gr.Column(scale=1):
+            num_results = gr.Slider(
+                minimum=1,
+                maximum=10,
+                value=3,
+                step=1,
+                label="Number of Results"
+            )
+
+    with gr.Row():
+        source_filter = gr.Radio(
+            choices=["all", "medgemini", "medqa_train", "medqa_dev", "medqa_test"],
+            value="all",
+            label="Filter by Source"
+        )
+
+    search_btn = gr.Button("🔍 Search", variant="primary", size="lg")
+
+    output = gr.Textbox(
+        label="Search Results",
+        lines=25,
+        max_lines=50
+    )
+
+    search_btn.click(
+        fn=ui_search,
+        inputs=[query_input, num_results, source_filter],
+        outputs=output
+    )
+
+    query_input.submit(
+        fn=ui_search,
+        inputs=[query_input, num_results, source_filter],
+        outputs=output
+    )
+
+    gr.Markdown("""
+    ### 📊 Database Info
+
+    **Med-Gemini**: Expert-relabeled questions with detailed explanations
+    **MedQA**: USMLE-style questions (Train/Dev/Test splits)
+
+    **Total Questions**: ~10,000+ USMLE-style questions
+    """)
+
+    gr.Examples(
+        examples=[
+            ["hyponatremia", 3, "all"],
+            ["myocardial infarction treatment", 2, "medgemini"],
+            ["diabetes complications", 3, "all"],
+            ["antibiotics for pneumonia", 2, "medqa_train"]
+        ],
+        inputs=[query_input, num_results, source_filter]
+    )
 
 # ============================================================================
-# MODIFIED: API endpoint with full generation
+# FastAPI with structured JSON output (for OpenAI integration)
 # ============================================================================
 app = FastAPI()
 
@@ -284,72 +270,47 @@ class SearchRequest(BaseModel):
     query: str
     num_results: int = 3
     source_filter: str = None
-    generate_ai: bool = True  # Option to skip AI generation for faster response
 
 @app.post("/search_medqa")
 def api_search(req: SearchRequest):
-    """Search and return complete formatted exemplars with AI-generated content."""
+    """
+    Search MedQA and return structured exemplars.
+    Returns COMPLETE question text with no truncation.
+    """
 
-    print(f"🔍 Searching for: {req.query}")
     r = search(req.query, req.num_results, req.source_filter)
 
     if not r['documents'][0]:
-        return {"output": "No results found."}
+        return {"results": []}
 
-    complete_output = f"SEARCH QUERY: {req.query}\n"
-    complete_output += f"FOUND {len(r['documents'][0])} EXEMPLARS\n\n"
-
+    results = []
     for i in range(len(r['documents'][0])):
-        print(f"Processing exemplar {i+1}...")
-
         doc_text = r['documents'][0][i]
         metadata = r['metadatas'][0][i]
 
-        # Parse the exemplar
+        # Parse the document into structured format
         parsed = parse_question_document(doc_text, metadata)
-        original_explanation = metadata.get('explanation', '')
 
-        if req.generate_ai:
-            # Generate AI content
-            print(f"  Generating choice explanations...")
-            choice_explanations = generate_choice_explanations(
-                parsed['question'],
-                parsed['choices'],
-                parsed['correct_answer']
-            )
-
-            print(f"  Generating similar question...")
-            new_question = generate_similar_question(
-                parsed['question'],
-                parsed['choices'],
-                parsed['correct_answer']
-            )
-        else:
-            choice_explanations = "(AI generation skipped)"
-            new_question = "(AI generation skipped)"
-
-        # Format complete output
-        formatted = format_complete_output(
-            i + 1,
-            parsed,
-            original_explanation,
-            choice_explanations,
-            new_question
-        )
-
-        complete_output += formatted
+        # Build complete result object
+        result = {
+            "result_number": i + 1,
+            "question": parsed['question'],  # FULL question text
+            "choices": parsed['choices'],
+            "correct_answer": parsed['correct_answer_letter'],
+            "correct_answer_text": parsed['correct_answer_text'],
+            "explanation": metadata.get('explanation', ''),
+            "has_explanation": bool(metadata.get('explanation', '').strip()),
+            "source": metadata.get('source', 'unknown'),
+            "exam_type": metadata.get('exam_type', 'unknown'),
+            "split": metadata.get('split', 'unknown'),
+            "similarity": round(1 - r['distances'][0][i], 3),
+            "metamap_phrases": metadata.get('metamap_phrases', '')
+        }
 
-    return {
-        "output": complete_output,
-        "content_type": "text/plain"
-    }
+        results.append(result)
 
-# Gradio UI (simplified - just shows we have it)
-with gr.Blocks(theme=gr.themes.Soft(), title="MedQA Search") as demo:
-    gr.Markdown("# 🏥 MedQA Search with AI Generation")
-    query_input = gr.Textbox(label="Query")
-    output = gr.Textbox(label="Results", lines=50)
-
+    return {"results": results}
 
 app = gr.mount_gradio_app(app, demo, path="/")
 
 if __name__ == "__main__":
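With this change, /search_medqa returns structured JSON instead of preformatted text. A quick smoke test against a local run; the base URL is an assumption, while the request and response fields come from the diff above.

import requests

resp = requests.post(
    "http://localhost:7860/search_medqa",  # assumed local URL for the Space
    json={"query": "hyponatremia", "num_results": 2, "source_filter": "medgemini"},
    timeout=60,
)
resp.raise_for_status()
for item in resp.json()["results"]:
    # Field names match the result dict built in api_search above
    print(item["result_number"], item["source"], item["similarity"])
    print(item["question"][:120], "...")
    print("Answer:", item["correct_answer"], "-", item["correct_answer_text"])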