feat(document-matching): implement two-stage document matching with LLM reranking
- Add hybrid keyword scoring + LLM semantic reranking for document matching
- Implement Stage 1 fast keyword scoring with weighted phrase and word-level matching
- Implement Stage 2 LLM-based semantic reranking for top candidates (up to 8 docs)
- Update README with detailed explanation of two-stage matching system
- Refactor resolve_relevant_document_hashes to use new scoring algorithm
- Add _llm_verify_document_hashes method for LLM-based document ranking
- Remove deployment configs (fly.toml, render.yaml) no longer in use
- Update test documents with new sample file
- Improve retrieval accuracy by balancing speed (keyword filtering) with semantic understanding (LLM reranking)
- README.md +17 -8
- app/services/document_service.py +100 -36
- fly.toml +0 -22
- render.yaml +0 -32
README.md
CHANGED
@@ -27,9 +27,10 @@ Uploaded PDFs are parsed page by page and split into chunks.
 Each chunk is stored with metadata (document, page number, chunk index) and embedded into `pgvector`.

 At question time:
-1.
-2.
-3.
+1. Document matching uses keyword scoring + LLM semantic reranking
+2. Relevant chunks are retrieved from matched documents via vector search
+3. The agent answers from those chunks when possible
+4. If evidence is weak, the agent uses web search and cites external URLs

 ## Chunking Strategy

@@ -62,7 +63,7 @@ Each turn stores/returns source metadata separately from the answer body.
 - Vector source cards include:
   - document name
   - page number
-  -
+  - snippet (short snippet from retrieved chunk)
 - Web source cards include:
   - title
   - URL
@@ -83,6 +84,13 @@ Why I chose this:
 - avoids duplicate indexing,
 - keeps retrieval secure per user.

+I also implemented a two-stage document matching system:
+
+- Stage 1: Fast keyword scoring checks exact phrase matches and word-level matches across filename, summary, and preview text with weighted scoring (filename matches score higher than preview matches).
+- Stage 2: LLM semantic reranking takes the top scored candidates (up to 8) and reranks them based on semantic similarity to the query.
+
+This hybrid approach balances speed and accuracy - keyword filtering is fast and catches obvious matches, while the LLM handles nuanced semantic understanding without processing every document.
+
 ## Challenges I Ran Into

 1. Heavy embedding dependencies made deployment images too large.
@@ -94,10 +102,11 @@ Why I chose this:

 ## If I Had More Time

-- Add
-- Add
-- Add
-- Add
+- Add conversation history UI to display past chat sessions
+- Add reranking (cross-encoder) for better precision on long multi-doc queries
+- Add automated citation-faithfulness checks
+- Add Alembic migrations for cleaner schema evolution
+- Add stronger eval/observability for routing and retrieval quality

 ## Local Setup

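To make the Stage 1 weighting concrete, here is a minimal sketch of the scoring rule described in the README addition above. The field weights mirror the `app/services/document_service.py` change further down; the `keyword_score` helper and the sample query/document values are hypothetical, made up for illustration.

```python
# Minimal sketch of the Stage 1 keyword scoring described above. The weights
# mirror the document_service.py change below; keyword_score and the sample
# query/document values are hypothetical, for illustration only.
def keyword_score(query: str, filename: str, summary: str, preview: str) -> float:
    q = query.lower()
    score = 0.0
    # Exact phrase matches, weighted by field (filename > summary > preview)
    for text, weight in ((filename, 10.0), (summary, 5.0), (preview, 2.0)):
        if q in (text or "").lower():
            score += weight
    # Word-level matches for words longer than 2 characters
    for word in q.split():
        if len(word) > 2:
            if word in (filename or "").lower():
                score += 3.0
            if word in (summary or "").lower():
                score += 1.5
            if word in (preview or "").lower():
                score += 0.5
    return score

# "refund" and "policy" each hit the filename (+3.0 each) and "refund" also
# hits the summary (+1.5), so this document scores 7.5 and outranks documents
# that only mention the words in their preview text.
print(keyword_score("refund policy", "refund_policy_2024.pdf",
                    "Covers refunds and returns.", ""))
```

With these weights a filename hit counts several times more than a preview hit, so short, specific queries tend to stay pinned to the right file even when the same words appear incidentally in many previews.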
app/services/document_service.py
CHANGED
@@ -1,4 +1,6 @@
 import hashlib
+import json
+import re

 from fastapi import UploadFile
 from langchain_groq import ChatGroq
@@ -18,6 +20,7 @@ class DocumentService:
         self.storage = StorageService()
         self.vector_store = VectorStoreService()
         self.summarizer = None
+        self.matcher_llm = None

     async def save_upload(self, upload: UploadFile) -> tuple[bytes, str]:
         content = await upload.read()
@@ -129,45 +132,106 @@ class DocumentService:
         }

     def resolve_relevant_document_hashes(self, db: Session, *, user: User, query: str, limit: int = 5) -> list[str]:
-        stopwords = {
-            "the",
-            "and",
-            "for",
-            "with",
-            "from",
-            "that",
-            "this",
-            "what",
-            "who",
-            "how",
-            "are",
-            "was",
-            "were",
-            "is",
-            "of",
-            "about",
-            "tell",
-            "more",
-            "please",
-            "can",
-            "you",
-            "your",
-        }
-        terms = [term.strip() for term in query.lower().split() if len(term.strip()) > 2 and term.strip() not in stopwords]
         docs = self.list_user_documents(db, user)
+        if not docs:
+            return []
+
+        query_lower = query.lower()
+        scored: list[tuple[float, str, Document]] = []
+
         for doc in docs:
+            score = 0.0
+
+            # Exact phrase matching (highest priority)
+            if query_lower in (doc.filename or "").lower():
+                score += 10.0
+            if query_lower in (doc.summary or "").lower():
+                score += 5.0
+            if query_lower in (doc.extracted_preview or "").lower():
+                score += 2.0
+
+            # Word-level matching
+            query_words = query_lower.split()
+            filename_lower = (doc.filename or "").lower()
+            summary_lower = (doc.summary or "").lower()
+            preview_lower = (doc.extracted_preview or "").lower()
+
+            for word in query_words:
+                if len(word) > 2:  # Skip very short words
+                    if word in filename_lower:
+                        score += 3.0
+                    if word in summary_lower:
+                        score += 1.5
+                    if word in preview_lower:
+                        score += 0.5
+
             if score > 0:
-                scored.append((score, doc.file_hash))
+                scored.append((score, doc.file_hash, doc))
+
+        # Sort by score
+        scored.sort(reverse=True, key=lambda x: x[0])
+
+        # Take top candidates for LLM (up to 8)
+        candidates_count = min(max(limit * 2, 8), len(scored)) if scored else min(limit, len(docs))
+
+        if scored:
+            ranked_docs = [doc for _, _, doc in scored[:candidates_count]]
+            ranked_hashes = [file_hash for _, file_hash, _ in scored[:candidates_count]]
+        else:
+            # No keyword matches, use all docs up to limit
+            ranked_docs = docs[:candidates_count]
+            ranked_hashes = [doc.file_hash for doc in ranked_docs]
+
+        # LLM reranking
+        llm_ranked_hashes = self._llm_verify_document_hashes(query=query, candidates=ranked_docs, limit=limit)
+
+        # Merge: LLM results first, then keyword fallback
+        merged = llm_ranked_hashes + [h for h in ranked_hashes if h not in llm_ranked_hashes]
+        return merged[:limit]
+
+    def _llm_verify_document_hashes(self, *, query: str, candidates: list[Document], limit: int) -> list[str]:
+        if not self.settings.groq_api_key or not candidates:
+            return []
+        if self.matcher_llm is None:
+            self.matcher_llm = ChatGroq(api_key=self.settings.groq_api_key, model=self.settings.model_name, temperature=0)
+
+        payload = []
+        for doc in candidates[:8]:
+            payload.append(
+                {
+                    "file_hash": doc.file_hash,
+                    "filename": doc.filename,
+                    "summary": (doc.summary or "")[:1000],
+                    "preview": (doc.extracted_preview or "")[:1200],
+                }
+            )
+
+        prompt = (
+            "Rank the most relevant documents for the user query based on semantic similarity.\n"
+            "Return ONLY valid JSON with this exact schema:\n"
+            '{"file_hashes": ["<hash1>", "<hash2>"]}\n'
+            f"Return at most {limit} hashes ordered by relevance.\n\n"
+            f"User query:\n{query}\n\n"
+            f"Candidates:\n{json.dumps(payload, ensure_ascii=True)}"
+        )
+        try:
+            response = self.matcher_llm.invoke(prompt)
+            content = response.content if isinstance(response.content, str) else str(response.content)
+
+            # Handle markdown code blocks
+            if "```json" in content:
+                content = content.split("```json")[1].split("```")[0].strip()
+            elif "```" in content:
+                content = content.split("```")[1].split("```")[0].strip()
+
+            data = json.loads(content)
+            hashes = data.get("file_hashes", [])
+            valid = {item.get("file_hash", "") for item in payload}
+            return [value for value in hashes if isinstance(value, str) and value in valid][:limit]
+        except Exception:
+            return []

     def ensure_page_metadata_for_user(self, *, db: Session, user: User) -> None:
         docs = self.list_user_documents(db, user)
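As a usage sketch (not part of the diff), this is roughly how a question-answering path could chain the new matcher with the existing `pgvector` retrieval. Only `resolve_relevant_document_hashes` and the `vector_store` attribute come from the change above; the `retrieve_context` function, the `similarity_search` signature, and the metadata filter shape are assumptions for illustration.

```python
# Hypothetical usage sketch: chaining the new two-stage matcher with vector
# retrieval. Only resolve_relevant_document_hashes and the vector_store
# attribute come from the diff; the similarity_search signature and the
# metadata filter shape are assumptions for illustration.
def retrieve_context(document_service, db, user, query: str, limit: int = 5):
    # Stage 1 (keyword scoring) + Stage 2 (LLM reranking) -> ordered file hashes
    hashes = document_service.resolve_relevant_document_hashes(
        db, user=user, query=query, limit=limit
    )
    if not hashes:
        return []
    # Restrict the pgvector chunk search to the matched documents
    return document_service.vector_store.similarity_search(
        query, k=8, filter={"file_hash": {"$in": hashes}}
    )
```

Because `_llm_verify_document_hashes` returns an empty list on any API or parsing failure, the merge step in the diff falls back to the keyword ranking, so retrieval degrades gracefully instead of failing outright.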
fly.toml
DELETED
@@ -1,22 +0,0 @@
-app = "docsbot-kbaba7"
-primary_region = "bom"
-
-[build]
-  dockerfile = "Dockerfile"
-
-[env]
-  STORAGE_BACKEND = "supabase"
-  WEB_SEARCH_PROVIDER = "tavily"
-
-[http_service]
-  internal_port = 8080
-  force_https = true
-  auto_stop_machines = "stop"
-  auto_start_machines = true
-  min_machines_running = 0
-  processes = ["app"]
-
-[[vm]]
-  cpu_kind = "shared"
-  cpus = 1
-  memory_mb = 1024
render.yaml
DELETED
@@ -1,32 +0,0 @@
-services:
-  - type: web
-    name: docsbot
-    runtime: python
-    plan: free
-    autoDeploy: true
-    buildCommand: pip install -e .
-    startCommand: uvicorn app.main:app --host 0.0.0.0 --port $PORT
-    healthCheckPath: /
-    envVars:
-      - key: PYTHON_VERSION
-        value: 3.12.2
-      - key: SECRET_KEY
-        sync: false
-      - key: DATABASE_URL
-        sync: false
-      - key: GROQ_API_KEY
-        sync: false
-      - key: STORAGE_BACKEND
-        value: supabase
-      - key: SUPABASE_URL
-        sync: false
-      - key: SUPABASE_SERVICE_ROLE_KEY
-        sync: false
-      - key: SUPABASE_STORAGE_BUCKET
-        value: documents
-      - key: SUPABASE_STORAGE_PREFIX
-        value: docsqa
-      - key: WEB_SEARCH_PROVIDER
-        value: duckduckgo
-      - key: TAVILY_API_KEY
-        sync: false