imurra committed on
Commit
1d5bf01
·
verified ·
1 Parent(s): 5f6a3ad

modified to deduplicate and overfetch

Browse files

3 additions only:

New deduplicate_results() function (lines 31-81)
Modified search() function (lines 86-104) - now over-fetches and deduplicates
Updated UI text - shows "unique results"

Everything else stays the same.
This will now automatically filter out duplicates before returning results!

Files changed (1) hide show
  1. app.py +69 -4
app.py CHANGED
@@ -25,7 +25,64 @@ print("🧠 Loading MedCPT model...")
25
  model = SentenceTransformer('ncbi/MedCPT-Query-Encoder')
26
  print("βœ… Model ready")
27
 
28
- # Search function
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  def search(query, num_results=3, source_filter=None):
30
  emb = model.encode(query).tolist()
31
 
@@ -34,11 +91,17 @@ def search(query, num_results=3, source_filter=None):
34
  if source_filter and source_filter != "all":
35
  where_clause = {"source": source_filter}
36
 
37
- return collection.query(
 
 
 
38
  query_embeddings=[emb],
39
- n_results=int(num_results),
40
  where=where_clause
41
  )
 
 
 
42
 
43
  # Enhanced Gradio UI
44
  def ui_search(query, num_results=3, source_filter="all"):
@@ -51,7 +114,7 @@ def ui_search(query, num_results=3, source_filter="all"):
51
  if not r['documents'][0]:
52
  return "❌ No results found"
53
 
54
- out = f"πŸ” Found {len(r['documents'][0])} results\n\n"
55
 
56
  for i in range(len(r['documents'][0])):
57
  source = r['metadatas'][0][i].get('source', 'unknown')
@@ -98,6 +161,8 @@ with gr.Blocks(theme=gr.themes.Soft(), title="MedQA Search") as demo:
98
 
99
  Search across **Med-Gemini** (expert explanations) and **MedQA** (USMLE questions) databases.
100
  Uses medical-specific embeddings (MedCPT) for accurate retrieval.
 
 
101
  """)
102
 
103
  with gr.Row():
 
25
  model = SentenceTransformer('ncbi/MedCPT-Query-Encoder')
26
  print("βœ… Model ready")
27
 
28
+ # ============================================================================
29
+ # NEW: Deduplication function
30
+ # ============================================================================
31
def _similarity_exceeds(matcher, threshold):
    """Return True if the matcher's two sequences are more similar than *threshold*.

    ``real_quick_ratio()`` and ``quick_ratio()`` are cheap upper bounds on
    ``ratio()``, so they are checked first to skip the expensive comparison
    for pairs that clearly cannot reach the threshold.
    """
    return (
        matcher.real_quick_ratio() > threshold
        and matcher.quick_ratio() > threshold
        and matcher.ratio() > threshold
    )


def deduplicate_results(results, target_count):
    """
    Remove duplicate questions from a query result, keeping the first
    occurrence (results are processed in the order given).

    A candidate is dropped when, compared to an already-selected result:
    1. High text similarity (>0.92) - catches near-exact duplicates
    2. Same non-empty answer + moderate similarity (>0.85) - catches
       conceptual duplicates

    Similarity is measured between the document texts themselves
    (case-folded, whitespace-normalised). The distances in ``results`` are
    distances to the *query*, so two unrelated documents can have almost the
    same distance; they are therefore not used to decide duplication.

    Args:
        results: dict with 'documents', 'metadatas' and 'distances', each a
            list holding one inner list (single-query result); 'ids' is
            optional.
        target_count: stop once this many unique results were collected.

    Returns:
        A dict in the same format containing only the selected entries.
        'ids' is None when the input has no 'ids' key. If there are no
        documents, ``results`` is returned unchanged.
    """
    # Local import keeps this block self-contained (stdlib only).
    from difflib import SequenceMatcher

    near_exact_threshold = 0.92
    same_answer_threshold = 0.85

    documents = results['documents'][0]
    if not documents:
        return results

    metadatas = results['metadatas'][0]
    distances = results['distances'][0]

    # Normalise once so case/spacing differences do not hide duplicates.
    normalised = [' '.join(str(doc).split()).casefold() for doc in documents]
    # A metadata entry may be None when a record was stored without metadata.
    answers = [(meta or {}).get('answer', '') for meta in metadatas]

    selected_indices = []
    # autojunk=False: the "popular element" heuristic distorts ratios on
    # texts longer than 200 characters.
    matcher = SequenceMatcher(None, autojunk=False)

    for i, text in enumerate(normalised):
        # SequenceMatcher caches analysis of the second sequence, so set the
        # candidate once and swap only the first sequence per comparison.
        matcher.set_seq2(text)
        is_duplicate = False

        for j in selected_indices:
            # A missing/empty answer must not count as "same answer".
            same_answer = bool(answers[i]) and answers[i] == answers[j]
            # Rule 1 implies Rule 2 when answers match, so a single
            # threshold per pair covers both rules.
            threshold = same_answer_threshold if same_answer else near_exact_threshold

            if text == normalised[j]:
                is_duplicate = True
                break

            matcher.set_seq1(normalised[j])
            if _similarity_exceeds(matcher, threshold):
                is_duplicate = True
                break

        if not is_duplicate:
            selected_indices.append(i)

        # Stop when we have enough unique results
        if len(selected_indices) >= target_count:
            break

    # Return filtered results in same format
    return {
        'documents': [[documents[i] for i in selected_indices]],
        'metadatas': [[metadatas[i] for i in selected_indices]],
        'distances': [[distances[i] for i in selected_indices]],
        'ids': [[results['ids'][0][i] for i in selected_indices]] if 'ids' in results else None
    }
82
+
83
+ # ============================================================================
84
+ # MODIFIED: Search function with deduplication
85
+ # ============================================================================
86
  def search(query, num_results=3, source_filter=None):
87
  emb = model.encode(query).tolist()
88
 
 
91
  if source_filter and source_filter != "all":
92
  where_clause = {"source": source_filter}
93
 
94
+ # Over-fetch to ensure we get enough unique results
95
+ fetch_count = min(num_results * 4, 50) # Fetch 4x but cap at 50
96
+
97
+ results = collection.query(
98
  query_embeddings=[emb],
99
+ n_results=fetch_count,
100
  where=where_clause
101
  )
102
+
103
+ # Deduplicate and return only requested number
104
+ return deduplicate_results(results, num_results)
105
 
106
  # Enhanced Gradio UI
107
  def ui_search(query, num_results=3, source_filter="all"):
 
114
  if not r['documents'][0]:
115
  return "❌ No results found"
116
 
117
+ out = f"πŸ” Found {len(r['documents'][0])} unique results\n\n"
118
 
119
  for i in range(len(r['documents'][0])):
120
  source = r['metadatas'][0][i].get('source', 'unknown')
 
161
 
162
  Search across **Med-Gemini** (expert explanations) and **MedQA** (USMLE questions) databases.
163
  Uses medical-specific embeddings (MedCPT) for accurate retrieval.
164
+
165
+ ✨ **New**: Automatic deduplication removes similar/duplicate questions
166
  """)
167
 
168
  with gr.Row():