Commit 59afc84
Parent(s): 3bacbf8
fix(phase6): critical fixes from senior audit
- Fix Dockerfile cache ownership (run model download as appuser)
- Store full citation metadata in vector DB (title, date, authors)
- Remove fragile locals() hack in search_agent.py
- Add try/except in deduplicate loop for resilience
- Document threshold semantics (cosine distance math)
- Clean up redundant list() calls
- Dockerfile +8 -5
- src/agents/search_agent.py +17 -31
- src/services/embeddings.py +45 -10
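
One note on the locals() fix listed above, before the file diffs: the cure is simply to give evidence_to_show a default value before the branch, so no code path has to probe locals() afterwards. A minimal sketch of that pattern, with plain strings standing in for the project's Evidence objects:

# Pattern behind the locals() fix - plain strings stand in for Evidence objects.
def pick_evidence(search_results: list[str], use_embeddings: bool) -> list[str]:
    # Assign the default up front so the name always exists...
    evidence_to_show: list[str] = search_results
    if use_embeddings:
        unique = list(dict.fromkeys(search_results))  # toy order-preserving dedup
        related: list[str] = []                       # stand-in for vector-DB lookups
        evidence_to_show = unique + related           # ...and each branch overwrites it explicitly
    else:
        evidence_to_show = search_results
    return evidence_to_show


print(pick_evidence(["a", "b", "a"], use_embeddings=True))   # ['a', 'b']
print(pick_evidence(["a", "b", "a"], use_embeddings=False))  # ['a', 'b', 'a']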
Dockerfile
CHANGED
@@ -21,16 +21,19 @@ COPY README.md .
 # Install dependencies
 RUN uv sync --frozen --no-dev --all-extras

+# Create non-root user BEFORE downloading models
+RUN useradd --create-home --shell /bin/bash appuser
+
+# Set cache directory for HuggingFace models (must be writable by appuser)
 ENV HF_HOME=/app/.cache
 ENV TRANSFORMERS_CACHE=/app/.cache

+# Create cache dir with correct ownership
+RUN mkdir -p /app/.cache && chown -R appuser:appuser /app/.cache

-RUN useradd --create-home --shell /bin/bash appuser
+# Pre-download the embedding model during build (as appuser to set correct ownership)
 USER appuser
+RUN uv run python -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('all-MiniLM-L6-v2')"

 # Expose port
 EXPOSE 7860
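
A quick way to confirm what this hunk is after, once the image is built: with HF_HOME pointing at /app/.cache and the download done as appuser, the model should load from the baked-in cache without re-downloading. The script below is an illustrative sketch, not part of the commit; HF_HUB_OFFLINE is the standard huggingface_hub switch that makes a cache miss fail loudly instead of silently fetching.

# check_cache.py - illustrative sketch, not part of this commit.
import os

# Force offline mode before importing the library so a cache miss raises
# instead of quietly re-downloading the model at container start.
os.environ.setdefault("HF_HUB_OFFLINE", "1")

from sentence_transformers import SentenceTransformer  # noqa: E402

model = SentenceTransformer("all-MiniLM-L6-v2")  # same model the Dockerfile pre-downloads
print("HF_HOME:", os.environ.get("HF_HOME"))
print("embedding dim:", model.get_sentence_embedding_dimension())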
src/agents/search_agent.py
CHANGED
@@ -66,6 +66,10 @@ class SearchAgent(BaseAgent): # type: ignore[misc]
         # Execute search
         result: SearchResult = await self._handler.execute(query, max_results_per_tool=10)

+        # Track what to show in response (initialized to search results as default)
+        evidence_to_show: list[Evidence] = result.evidence
+        total_new = 0
+
         # Update shared evidence store
         if self._embeddings:
             # Deduplicate by semantic similarity (async-safe)

@@ -75,41 +79,32 @@
             related = await self._embeddings.search_similar(query, n_results=5)

             # Merge related evidence not already in results
-            # We need to reconstruct Evidence objects from stored data
             existing_urls = {e.citation.url for e in unique_evidence}

-            # but we also want related from *previous searches*
-            related_evidence = []
+            # Reconstruct Evidence objects from stored vector DB data
+            related_evidence: list[Evidence] = []
             for item in related:
                 if item["id"] not in existing_urls:
-                    # Create Evidence from stored metadata
-                    # Check if metadata has required fields
                     meta = item.get("metadata", {})
-                    authors =
-                    if isinstance(authors, str):
-                        authors = [authors]
-                    if not authors:
-                        authors = ["Unknown"]
+                    # Parse authors (stored as comma-separated string)
+                    authors_str = meta.get("authors", "")
+                    authors = [a.strip() for a in authors_str.split(",") if a.strip()]

                     ev = Evidence(
                         content=item["content"],
                         citation=Citation(
-                            title=meta.get("title", "
+                            title=meta.get("title", "Related Evidence"),
                             url=item["id"],
                             source=meta.get("source", "vector_db"),
-                            date=date,
+                            date=meta.get("date", "n.d."),
                             authors=authors,
                         ),
+                        # Convert distance to relevance (lower distance = higher relevance)
+                        relevance=max(0.0, 1.0 - item.get("distance", 0.5)),
                     )
                     related_evidence.append(ev)

-            # Combine
+            # Combine unique from search + related from vector DB
             final_new_evidence = unique_evidence + related_evidence

             # Add to global store (deduping against global store)

@@ -117,25 +112,16 @@
             really_new = [e for e in final_new_evidence if e.citation.url not in global_urls]
             self._evidence_store["current"].extend(really_new)

-            # Update result for reporting
             total_new = len(really_new)
+            evidence_to_show = unique_evidence + related_evidence

         else:
-            # Fallback to URL-based deduplication
+            # Fallback to URL-based deduplication (no embeddings)
             existing_urls = {e.citation.url for e in self._evidence_store["current"]}
             new_unique = [e for e in result.evidence if e.citation.url not in existing_urls]
             self._evidence_store["current"].extend(new_unique)
             total_new = len(new_unique)
+            evidence_to_show = result.evidence

-        # Format response
-        # Get latest N items from store or just the new ones
-        # Let's show what was found in this run + related
-        evidence_to_show = (
-            (unique_evidence + related_evidence)
-            if self._embeddings and "unique_evidence" in locals()
-            else result.evidence
-        )

         evidence_text = "\n".join(
             [
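
The metadata round-trip in this hunk is easy to check in isolation: authors are flattened to a comma-separated string when stored (see embeddings.py below) and split back apart on read, and ChromaDB's cosine distance is folded into a 0-to-1 relevance score. A self-contained sketch of those two conversions, using plain functions rather than the project's actual classes:

# Standalone sketch of the conversions used above (not the project's real code).

def pack_authors(authors: list[str] | None) -> str:
    """Flatten authors to the comma-separated string stored in the vector DB."""
    return ",".join(authors or [])


def unpack_authors(authors_str: str) -> list[str]:
    """Reverse of pack_authors, as done when reconstructing Evidence."""
    return [a.strip() for a in authors_str.split(",") if a.strip()]


def distance_to_relevance(distance: float) -> float:
    """Map ChromaDB cosine distance (0 = identical) to a relevance score in [0, 1]."""
    return max(0.0, 1.0 - distance)


assert unpack_authors(pack_authors(["Ada Lovelace", "Alan Turing"])) == ["Ada Lovelace", "Alan Turing"]
assert unpack_authors("") == []              # missing authors metadata -> empty list
assert distance_to_relevance(0.5) == 0.5     # moderately similar -> mid relevance
assert distance_to_relevance(1.75) == 0.0    # far apart -> clamped to zero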
src/services/embeddings.py
CHANGED
@@ -33,11 +33,13 @@ class EmbeddingService:

     def _sync_embed(self, text: str) -> list[float]:
         """Synchronous embedding - DO NOT call directly from async code."""
+        result: list[float] = self._model.encode(text).tolist()
+        return result

     def _sync_batch_embed(self, texts: list[str]) -> list[list[float]]:
         """Batch embedding for efficiency - DO NOT call directly from async code."""
+        embeddings = self._model.encode(texts)
+        return [e.tolist() for e in embeddings]

     # ─────────────────────────────────────────────────────────────────
     # Async public methods (safe for event loop)

@@ -107,17 +109,50 @@
     async def deduplicate(
         self, new_evidence: list[Evidence], threshold: float = 0.9
     ) -> list[Evidence]:
-        """Remove semantically duplicate evidence (async-safe).
+        """Remove semantically duplicate evidence (async-safe).
+
+        Args:
+            new_evidence: List of evidence items to deduplicate
+            threshold: Similarity threshold (0.9 = 90% similar is duplicate).
+                ChromaDB cosine distance: 0=identical, 2=opposite.
+                We consider duplicate if distance < (1 - threshold).
+
+        Returns:
+            List of unique evidence items (not already in vector store).
+        """
         unique = []
         for evidence in new_evidence:
+            try:
+                similar = await self.search_similar(evidence.content, n_results=1)
+                # ChromaDB cosine distance: 0 = identical, 2 = opposite
+                # threshold=0.9 means distance < 0.1 is considered duplicate
+                is_duplicate = similar and similar[0]["distance"] < (1 - threshold)
+
+                if not is_duplicate:
+                    unique.append(evidence)
+                    # Store FULL citation metadata for reconstruction later
+                    await self.add_evidence(
+                        evidence_id=evidence.citation.url,
+                        content=evidence.content,
+                        metadata={
+                            "source": evidence.citation.source,
+                            "title": evidence.citation.title,
+                            "date": evidence.citation.date,
+                            "authors": ",".join(evidence.citation.authors or []),
+                        },
+                    )
+            except Exception as e:
+                # Log but don't fail entire deduplication for one bad item
+                import structlog
+
+                structlog.get_logger().warning(
+                    "Failed to process evidence in deduplicate",
+                    url=evidence.citation.url,
+                    error=str(e),
                 )
+                # Still add to unique list - better to have duplicates than lose data
+                unique.append(evidence)

         return unique
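
For the threshold semantics documented above: ChromaDB's cosine distance is 1 minus cosine similarity, so it runs from 0 (identical) to 2 (opposite), and with threshold=0.9 a candidate only counts as a duplicate when its nearest neighbour sits at distance below 1 - 0.9 = 0.1. A small worked example of just that predicate, in plain Python with hand-picked distances (no ChromaDB involved):

# Worked example of the duplicate predicate documented in deduplicate() above.
def is_duplicate(nearest_distance: float, threshold: float = 0.9) -> bool:
    """ChromaDB cosine distance: 0 = identical, 2 = opposite.

    threshold=0.9 means anything with distance < 0.1 is treated as a duplicate.
    """
    return nearest_distance < (1.0 - threshold)


print(is_duplicate(0.05))        # True  - near-identical text, gets dropped
print(is_duplicate(0.25))        # False - related but distinct, gets kept
print(is_duplicate(0.05, 0.99))  # False - stricter threshold (cutoff 0.01) keeps it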