Commit 59afc84
Parent(s): 3bacbf8
fix(phase6): critical fixes from senior audit
- Fix Dockerfile cache ownership (run model download as appuser)
- Store full citation metadata in vector DB (title, date, authors)
- Remove fragile locals() hack in search_agent.py
- Add try/except in deduplicate loop for resilience
- Document threshold semantics (cosine distance math)
- Clean up redundant list() calls
- Dockerfile +8 -5
- src/agents/search_agent.py +17 -31
- src/services/embeddings.py +45 -10
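
One note on the locals() fix listed above, before the file diffs: the cure is simply to give evidence_to_show a default value before the branch, so no code path has to probe locals() afterwards. A minimal sketch of that pattern, with plain strings standing in for the project's Evidence objects:

# Pattern behind the locals() fix - plain strings stand in for Evidence objects.
def pick_evidence(search_results: list[str], use_embeddings: bool) -> list[str]:
    # Assign the default up front so the name always exists...
    evidence_to_show: list[str] = search_results
    if use_embeddings:
        unique = list(dict.fromkeys(search_results))  # toy order-preserving dedup
        related: list[str] = []                       # stand-in for vector-DB lookups
        evidence_to_show = unique + related           # ...and each branch overwrites it explicitly
    else:
        evidence_to_show = search_results
    return evidence_to_show


print(pick_evidence(["a", "b", "a"], use_embeddings=True))   # ['a', 'b']
print(pick_evidence(["a", "b", "a"], use_embeddings=False))  # ['a', 'b', 'a']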
Dockerfile
CHANGED
@@ -21,16 +21,19 @@ COPY README.md .
 # Install dependencies
 RUN uv sync --frozen --no-dev --all-extras

+# Create non-root user BEFORE downloading models
+RUN useradd --create-home --shell /bin/bash appuser
+
+# Set cache directory for HuggingFace models (must be writable by appuser)
 ENV HF_HOME=/app/.cache
 ENV TRANSFORMERS_CACHE=/app/.cache

+# Create cache dir with correct ownership
+RUN mkdir -p /app/.cache && chown -R appuser:appuser /app/.cache

-RUN useradd --create-home --shell /bin/bash appuser
+# Pre-download the embedding model during build (as appuser to set correct ownership)
 USER appuser
+RUN uv run python -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('all-MiniLM-L6-v2')"

 # Expose port
 EXPOSE 7860
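
A quick way to confirm what this hunk is after, once the image is built: with HF_HOME pointing at /app/.cache and the download done as appuser, the model should load from the baked-in cache without re-downloading. The script below is an illustrative sketch, not part of the commit; HF_HUB_OFFLINE is the standard huggingface_hub switch that makes a cache miss fail loudly instead of silently fetching.

# check_cache.py - illustrative sketch, not part of this commit.
import os

# Force offline mode before importing the library so a cache miss raises
# instead of quietly re-downloading the model at container start.
os.environ.setdefault("HF_HUB_OFFLINE", "1")

from sentence_transformers import SentenceTransformer  # noqa: E402

model = SentenceTransformer("all-MiniLM-L6-v2")  # same model the Dockerfile pre-downloads
print("HF_HOME:", os.environ.get("HF_HOME"))
print("embedding dim:", model.get_sentence_embedding_dimension())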
src/agents/search_agent.py
CHANGED
@@ -66,6 +66,10 @@ class SearchAgent(BaseAgent): # type: ignore[misc]
         # Execute search
         result: SearchResult = await self._handler.execute(query, max_results_per_tool=10)

+        # Track what to show in response (initialized to search results as default)
+        evidence_to_show: list[Evidence] = result.evidence
+        total_new = 0
+
         # Update shared evidence store
         if self._embeddings:
             # Deduplicate by semantic similarity (async-safe)

@@ -75,41 +79,32 @@
             related = await self._embeddings.search_similar(query, n_results=5)

             # Merge related evidence not already in results
-            # We need to reconstruct Evidence objects from stored data
             existing_urls = {e.citation.url for e in unique_evidence}

-            # but we also want related from *previous searches*
-            related_evidence = []
+            # Reconstruct Evidence objects from stored vector DB data
+            related_evidence: list[Evidence] = []
             for item in related:
                 if item["id"] not in existing_urls:
-                    # Create Evidence from stored metadata
-                    # Check if metadata has required fields
                     meta = item.get("metadata", {})
-                    authors =
-                    if isinstance(authors, str):
-                        authors = [authors]
-                    if not authors:
-                        authors = ["Unknown"]
+                    # Parse authors (stored as comma-separated string)
+                    authors_str = meta.get("authors", "")
+                    authors = [a.strip() for a in authors_str.split(",") if a.strip()]

                     ev = Evidence(
                         content=item["content"],
                         citation=Citation(
-                            title=meta.get("title", "
+                            title=meta.get("title", "Related Evidence"),
                             url=item["id"],
                             source=meta.get("source", "vector_db"),
-                            date=date,
+                            date=meta.get("date", "n.d."),
                             authors=authors,
                         ),
+                        # Convert distance to relevance (lower distance = higher relevance)
+                        relevance=max(0.0, 1.0 - item.get("distance", 0.5)),
                     )
                     related_evidence.append(ev)

-            # Combine
+            # Combine unique from search + related from vector DB
             final_new_evidence = unique_evidence + related_evidence

             # Add to global store (deduping against global store)

@@ -117,25 +112,16 @@
             really_new = [e for e in final_new_evidence if e.citation.url not in global_urls]
             self._evidence_store["current"].extend(really_new)

-            # Update result for reporting
             total_new = len(really_new)
+            evidence_to_show = unique_evidence + related_evidence

         else:
-            # Fallback to URL-based deduplication
+            # Fallback to URL-based deduplication (no embeddings)
             existing_urls = {e.citation.url for e in self._evidence_store["current"]}
             new_unique = [e for e in result.evidence if e.citation.url not in existing_urls]
             self._evidence_store["current"].extend(new_unique)
             total_new = len(new_unique)
+            evidence_to_show = result.evidence

-        # Format response
-        # Get latest N items from store or just the new ones
-        # Let's show what was found in this run + related
-        evidence_to_show = (
-            (unique_evidence + related_evidence)
-            if self._embeddings and "unique_evidence" in locals()
-            else result.evidence
-        )

         evidence_text = "\n".join(
             [
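
The metadata round-trip in this hunk is easy to check in isolation: authors are flattened to a comma-separated string when stored (see embeddings.py below) and split back apart on read, and ChromaDB's cosine distance is folded into a 0-to-1 relevance score. A self-contained sketch of those two conversions, using plain functions rather than the project's actual classes:

# Standalone sketch of the conversions used above (not the project's real code).

def pack_authors(authors: list[str] | None) -> str:
    """Flatten authors to the comma-separated string stored in the vector DB."""
    return ",".join(authors or [])


def unpack_authors(authors_str: str) -> list[str]:
    """Reverse of pack_authors, as done when reconstructing Evidence."""
    return [a.strip() for a in authors_str.split(",") if a.strip()]


def distance_to_relevance(distance: float) -> float:
    """Map ChromaDB cosine distance (0 = identical) to a relevance score in [0, 1]."""
    return max(0.0, 1.0 - distance)


assert unpack_authors(pack_authors(["Ada Lovelace", "Alan Turing"])) == ["Ada Lovelace", "Alan Turing"]
assert unpack_authors("") == []              # missing authors metadata -> empty list
assert distance_to_relevance(0.5) == 0.5     # moderately similar -> mid relevance
assert distance_to_relevance(1.75) == 0.0    # far apart -> clamped to zero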
src/services/embeddings.py
CHANGED
@@ -33,11 +33,13 @@ class EmbeddingService:

     def _sync_embed(self, text: str) -> list[float]:
         """Synchronous embedding - DO NOT call directly from async code."""
+        result: list[float] = self._model.encode(text).tolist()
+        return result

     def _sync_batch_embed(self, texts: list[str]) -> list[list[float]]:
         """Batch embedding for efficiency - DO NOT call directly from async code."""
+        embeddings = self._model.encode(texts)
+        return [e.tolist() for e in embeddings]

     # ─────────────────────────────────────────────────────────────────
     # Async public methods (safe for event loop)

@@ -107,17 +109,50 @@
     async def deduplicate(
         self, new_evidence: list[Evidence], threshold: float = 0.9
     ) -> list[Evidence]:
-        """Remove semantically duplicate evidence (async-safe).
+        """Remove semantically duplicate evidence (async-safe).
+
+        Args:
+            new_evidence: List of evidence items to deduplicate
+            threshold: Similarity threshold (0.9 = 90% similar is duplicate).
+                ChromaDB cosine distance: 0=identical, 2=opposite.
+                We consider duplicate if distance < (1 - threshold).
+
+        Returns:
+            List of unique evidence items (not already in vector store).
+        """
         unique = []
         for evidence in new_evidence:
+            try:
+                similar = await self.search_similar(evidence.content, n_results=1)
+                # ChromaDB cosine distance: 0 = identical, 2 = opposite
+                # threshold=0.9 means distance < 0.1 is considered duplicate
+                is_duplicate = similar and similar[0]["distance"] < (1 - threshold)
+
+                if not is_duplicate:
+                    unique.append(evidence)
+                    # Store FULL citation metadata for reconstruction later
+                    await self.add_evidence(
+                        evidence_id=evidence.citation.url,
+                        content=evidence.content,
+                        metadata={
+                            "source": evidence.citation.source,
+                            "title": evidence.citation.title,
+                            "date": evidence.citation.date,
+                            "authors": ",".join(evidence.citation.authors or []),
+                        },
+                    )
+            except Exception as e:
+                # Log but don't fail entire deduplication for one bad item
+                import structlog
+
+                structlog.get_logger().warning(
+                    "Failed to process evidence in deduplicate",
+                    url=evidence.citation.url,
+                    error=str(e),
                 )
+                # Still add to unique list - better to have duplicates than lose data
+                unique.append(evidence)

         return unique
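
For the threshold semantics documented above: ChromaDB's cosine distance is 1 minus cosine similarity, so it runs from 0 (identical) to 2 (opposite), and with threshold=0.9 a candidate only counts as a duplicate when its nearest neighbour sits at distance below 1 - 0.9 = 0.1. A small worked example of just that predicate, in plain Python with hand-picked distances (no ChromaDB involved):

# Worked example of the duplicate predicate documented in deduplicate() above.
def is_duplicate(nearest_distance: float, threshold: float = 0.9) -> bool:
    """ChromaDB cosine distance: 0 = identical, 2 = opposite.

    threshold=0.9 means anything with distance < 0.1 is treated as a duplicate.
    """
    return nearest_distance < (1.0 - threshold)


print(is_duplicate(0.05))        # True  - near-identical text, gets dropped
print(is_duplicate(0.25))        # False - related but distinct, gets kept
print(is_duplicate(0.05, 0.99))  # False - stricter threshold (cutoff 0.01) keeps it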