VibecoderMcSwaggins committed on
Commit 59afc84 · 1 Parent(s): 3bacbf8

fix(phase6): critical fixes from senior audit

- Fix Dockerfile cache ownership (run model download as appuser)
- Store full citation metadata in vector DB (title, date, authors)
- Remove fragile locals() hack in search_agent.py
- Add try/except in deduplicate loop for resilience
- Document threshold semantics (cosine distance math)
- Clean up redundant list() calls

Dockerfile CHANGED
@@ -21,16 +21,19 @@ COPY README.md .
 # Install dependencies
 RUN uv sync --frozen --no-dev --all-extras
 
-# Set cache directory for HuggingFace models
+# Create non-root user BEFORE downloading models
+RUN useradd --create-home --shell /bin/bash appuser
+
+# Set cache directory for HuggingFace models (must be writable by appuser)
 ENV HF_HOME=/app/.cache
 ENV TRANSFORMERS_CACHE=/app/.cache
 
-# Pre-download the embedding model during build to speed up cold starts
-RUN uv run python -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('all-MiniLM-L6-v2')"
+# Create cache dir with correct ownership
+RUN mkdir -p /app/.cache && chown -R appuser:appuser /app/.cache
 
-# Create non-root user
-RUN useradd --create-home --shell /bin/bash appuser
+# Pre-download the embedding model during build (as appuser to set correct ownership)
 USER appuser
+RUN uv run python -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('all-MiniLM-L6-v2')"
 
 # Expose port
 EXPOSE 7860
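
The reordering matters because image layers created before USER appuser are owned by root, so a model cache pre-downloaded as root can end up unwritable once the container drops privileges. Below is a minimal, hypothetical startup check (not part of this repo) that would surface that failure mode early, assuming the HF_HOME=/app/.cache convention from the Dockerfile above:

# check_cache.py - hypothetical sketch; verifies the pre-downloaded HF cache is usable
import os
import sys


def check_cache_writable(cache_dir: str | None = None) -> bool:
    """Return True if the HuggingFace cache exists and is writable by this user."""
    cache_dir = cache_dir or os.environ.get("HF_HOME", "/app/.cache")
    if not os.path.isdir(cache_dir):
        sys.stderr.write(f"cache dir missing: {cache_dir}\n")
        return False
    if not os.access(cache_dir, os.R_OK | os.W_OK):
        sys.stderr.write(f"cache dir not writable by uid {os.getuid()}: {cache_dir}\n")
        return False
    return True


if __name__ == "__main__":
    sys.exit(0 if check_cache_writable() else 1)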
src/agents/search_agent.py CHANGED
@@ -66,6 +66,10 @@ class SearchAgent(BaseAgent):  # type: ignore[misc]
         # Execute search
         result: SearchResult = await self._handler.execute(query, max_results_per_tool=10)
 
+        # Track what to show in response (initialized to search results as default)
+        evidence_to_show: list[Evidence] = result.evidence
+        total_new = 0
+
         # Update shared evidence store
         if self._embeddings:
             # Deduplicate by semantic similarity (async-safe)
@@ -75,41 +79,32 @@ class SearchAgent(BaseAgent):  # type: ignore[misc]
             related = await self._embeddings.search_similar(query, n_results=5)
 
             # Merge related evidence not already in results
-            # We need to reconstruct Evidence objects from stored data
             existing_urls = {e.citation.url for e in unique_evidence}
 
-            # Also check what's already in the global store to avoid re-adding
-            # logic here is a bit complex: deduplicate returned unique from *new search*
-            # but we also want related from *previous searches*
-
-            related_evidence = []
+            # Reconstruct Evidence objects from stored vector DB data
+            related_evidence: list[Evidence] = []
             for item in related:
                 if item["id"] not in existing_urls:
-                    # Create Evidence from stored metadata
-                    # Check if metadata has required fields
                     meta = item.get("metadata", {})
-                    # Fallback if date missing
-                    date = meta.get("date") or "n.d."
-                    authors = meta.get("authors")  # Might be list or string depending on how stored
-                    if isinstance(authors, str):
-                        authors = [authors]
-                    if not authors:
-                        authors = ["Unknown"]
+                    # Parse authors (stored as comma-separated string)
+                    authors_str = meta.get("authors", "")
+                    authors = [a.strip() for a in authors_str.split(",") if a.strip()]
 
                     ev = Evidence(
                         content=item["content"],
                         citation=Citation(
-                            title=meta.get("title", "Untitled"),
+                            title=meta.get("title", "Related Evidence"),
                             url=item["id"],
                             source=meta.get("source", "vector_db"),
-                            date=date,
+                            date=meta.get("date", "n.d."),
                             authors=authors,
                         ),
-                        relevance=item.get("distance", 0.0),  # Use distance/similarity as proxy
+                        # Convert distance to relevance (lower distance = higher relevance)
+                        relevance=max(0.0, 1.0 - item.get("distance", 0.5)),
                     )
                     related_evidence.append(ev)
 
-            # Combine
+            # Combine unique from search + related from vector DB
             final_new_evidence = unique_evidence + related_evidence
 
             # Add to global store (deduping against global store)
@@ -117,25 +112,16 @@ class SearchAgent(BaseAgent):  # type: ignore[misc]
             really_new = [e for e in final_new_evidence if e.citation.url not in global_urls]
             self._evidence_store["current"].extend(really_new)
 
-            # Update result for reporting
             total_new = len(really_new)
+            evidence_to_show = unique_evidence + related_evidence
 
         else:
-            # Fallback to URL-based deduplication
+            # Fallback to URL-based deduplication (no embeddings)
            existing_urls = {e.citation.url for e in self._evidence_store["current"]}
             new_unique = [e for e in result.evidence if e.citation.url not in existing_urls]
             self._evidence_store["current"].extend(new_unique)
             total_new = len(new_unique)
-
-        # Format response
-        # Get latest N items from store or just the new ones
-        # Let's show what was found in this run + related
-
-        evidence_to_show = (
-            (unique_evidence + related_evidence)
-            if self._embeddings and "unique_evidence" in locals()
-            else result.evidence
-        )
+            evidence_to_show = result.evidence
 
         evidence_text = "\n".join(
             [
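
Together with the embeddings.py change below, this closes the metadata round trip: deduplicate() stores authors as a comma-joined string, and the search agent splits it back and converts ChromaDB cosine distance into a relevance score. A standalone sketch of those conversions (helper names are illustrative, not from the codebase):

# Illustrative helpers mirroring the conversions in the diff above; not part of the repo.
def pack_authors(authors: list[str] | None) -> str:
    # How deduplicate() serializes authors into vector DB metadata.
    return ",".join(authors or [])


def unpack_authors(authors_str: str) -> list[str]:
    # How the search agent reconstructs the author list from stored metadata.
    return [a.strip() for a in authors_str.split(",") if a.strip()]


def distance_to_relevance(distance: float) -> float:
    # Lower cosine distance = more similar; clamp so relevance never goes negative.
    return max(0.0, 1.0 - distance)


print(unpack_authors(pack_authors(["Ada Lovelace", "Alan Turing"])))  # ['Ada Lovelace', 'Alan Turing']
print(distance_to_relevance(0.2))  # 0.8
print(distance_to_relevance(1.7))  # 0.0 (clamped)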
src/services/embeddings.py CHANGED
@@ -33,11 +33,13 @@ class EmbeddingService:
 
     def _sync_embed(self, text: str) -> list[float]:
         """Synchronous embedding - DO NOT call directly from async code."""
-        return list(self._model.encode(text).tolist())
+        result: list[float] = self._model.encode(text).tolist()
+        return result
 
     def _sync_batch_embed(self, texts: list[str]) -> list[list[float]]:
         """Batch embedding for efficiency - DO NOT call directly from async code."""
-        return [list(e.tolist()) for e in self._model.encode(texts)]
+        embeddings = self._model.encode(texts)
+        return [e.tolist() for e in embeddings]
 
     # ─────────────────────────────────────────────────────────────────
     # Async public methods (safe for event loop)
@@ -107,17 +109,50 @@ class EmbeddingService:
     async def deduplicate(
         self, new_evidence: list[Evidence], threshold: float = 0.9
     ) -> list[Evidence]:
-        """Remove semantically duplicate evidence (async-safe)."""
+        """Remove semantically duplicate evidence (async-safe).
+
+        Args:
+            new_evidence: List of evidence items to deduplicate
+            threshold: Similarity threshold (0.9 = 90% similar is duplicate).
+                ChromaDB cosine distance: 0=identical, 2=opposite.
+                We consider duplicate if distance < (1 - threshold).
+
+        Returns:
+            List of unique evidence items (not already in vector store).
+        """
         unique = []
         for evidence in new_evidence:
-            similar = await self.search_similar(evidence.content, n_results=1)
-            if not similar or similar[0]["distance"] > (1 - threshold):
-                unique.append(evidence)
-                await self.add_evidence(
-                    evidence_id=evidence.citation.url,
-                    content=evidence.content,
-                    metadata={"source": evidence.citation.source},
+            try:
+                similar = await self.search_similar(evidence.content, n_results=1)
+                # ChromaDB cosine distance: 0 = identical, 2 = opposite
+                # threshold=0.9 means distance < 0.1 is considered duplicate
+                is_duplicate = similar and similar[0]["distance"] < (1 - threshold)
+
+                if not is_duplicate:
+                    unique.append(evidence)
+                    # Store FULL citation metadata for reconstruction later
+                    await self.add_evidence(
+                        evidence_id=evidence.citation.url,
+                        content=evidence.content,
+                        metadata={
+                            "source": evidence.citation.source,
+                            "title": evidence.citation.title,
+                            "date": evidence.citation.date,
+                            "authors": ",".join(evidence.citation.authors or []),
+                        },
+                    )
+            except Exception as e:
+                # Log but don't fail entire deduplication for one bad item
+                import structlog
+
+                structlog.get_logger().warning(
+                    "Failed to process evidence in deduplicate",
+                    url=evidence.citation.url,
+                    error=str(e),
                 )
+                # Still add to unique list - better to have duplicates than lose data
+                unique.append(evidence)
+
         return unique
 
 
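
A worked example of the documented threshold semantics: ChromaDB cosine distance runs from 0 (identical) to 2 (opposite), so with the default threshold of 0.9 a new item counts as a duplicate when its nearest stored neighbour sits at a distance below 1 - 0.9 = 0.1. A self-contained sketch of just that check (no ChromaDB, function name illustrative):

def is_duplicate(nearest_distance: float | None, threshold: float = 0.9) -> bool:
    """Mirror of the comparison in deduplicate(); None means the vector store is empty."""
    if nearest_distance is None:
        return False
    return nearest_distance < (1.0 - threshold)


print(is_duplicate(0.05))  # True  -> dropped as a near-duplicate of stored evidence
print(is_duplicate(0.30))  # False -> kept, and stored with full citation metadata
print(is_duplicate(None))  # False -> empty store, always keep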