# code-compass / src/hybrid_search.py
# technophyle's picture
# Sync from GitHub via hub-sync
# 35c1d2c verified
import re
from collections import defaultdict
from typing import List
from rank_bm25 import BM25Okapi
from sentence_transformers import CrossEncoder
# A token starts with a letter or underscore and may continue with the
# characters that commonly glue code identifiers and paths together
# (dots, slashes, colons, dashes), e.g. "os.path.join" or "src/app.py".
TOKEN_RE = re.compile(r"[A-Za-z_][A-Za-z0-9_./:-]*")

# Separator characters that TOKEN_RE allows inside a token but that carry no
# meaning at its end (sentence punctuation, "key:" labels, trailing slashes).
_TRAILING_SEPARATORS = "./:-"


def tokenize(text: str) -> List[str]:
    """Split ``text`` into lowercase identifier-like tokens.

    Tokens are the matches of ``TOKEN_RE``; anything that does not start with
    an ASCII letter or underscore (pure numbers, punctuation) is skipped.
    Trailing separator characters are stripped so that ``"tokenize."`` and
    ``"foo:"`` produce the same tokens as ``"tokenize"`` and ``"foo"``.

    Args:
        text: Arbitrary source code or natural-language text.

    Returns:
        The tokens in order of appearance, lowercased. Empty if nothing matches.
    """
    # Every match starts with a letter or underscore, so stripping trailing
    # separators can never leave an empty token.
    return [
        match.lower().rstrip(_TRAILING_SEPARATORS)
        for match in TOKEN_RE.findall(text)
    ]
class HybridSearchEngine:
    """Lexical BM25 search, reciprocal rank fusion and cross-encoder reranking.

    No per-repository index is stored: BM25 statistics are rebuilt from the
    chunks passed to :meth:`bm25_search` on every call, and
    :meth:`build_for_repository` / :meth:`remove_repository` are no-ops.
    """

    def __init__(self, reranker_model: str = "cross-encoder/ms-marco-MiniLM-L-6-v2") -> None:
        """Create the engine and load the cross-encoder used by :meth:`rerank`.

        Args:
            reranker_model: Model identifier handed to ``CrossEncoder``.
        """
        self.reranker = CrossEncoder(reranker_model)

    def build_for_repository(self, repo_id: int, chunks: List[dict]) -> None:
        """Do nothing; the arguments are ignored and ``None`` is returned."""
        return None

    def remove_repository(self, repo_id: int) -> None:
        """Do nothing; the argument is ignored and ``None`` is returned."""
        return None

    def bm25_search(self, chunks: List[dict], query: str, top_k: int = 12) -> List[dict]:
        """Rank ``chunks`` against ``query`` with BM25 over ``searchable_text``.

        Args:
            chunks: Candidate chunks; each must have a ``"searchable_text"`` entry.
            query: Free-text query.
            top_k: Maximum number of results to return.

        Returns:
            Shallow copies of the best-scoring chunks, best first, each extended
            with ``"bm25_score"`` (float) and ``"bm25_rank"`` (1-based). Empty
            when there are no chunks, the query yields no tokens, or none of the
            chunks yields a token.

        Raises:
            KeyError: If a chunk has no ``"searchable_text"`` entry.
        """
        if not chunks:
            return []

        query_tokens = tokenize(query)
        if not query_tokens:
            return []

        corpus_tokens = [tokenize(chunk["searchable_text"]) for chunk in chunks]
        # rank_bm25's BM25Okapi averages IDF over the vocabulary, so a corpus
        # in which no chunk produced a token would divide by zero.
        if not any(corpus_tokens):
            return []

        scores = BM25Okapi(corpus_tokens).get_scores(query_tokens)
        ranked = sorted(
            zip(chunks, scores),
            key=lambda pair: pair[1],
            reverse=True,
        )[:top_k]

        # Copy each chunk so the caller's dicts are never mutated.
        return [
            {**chunk, "bm25_score": float(score), "bm25_rank": rank}
            for rank, (chunk, score) in enumerate(ranked, start=1)
        ]

    def reciprocal_rank_fusion(
        self,
        lexical_results: List[dict],
        semantic_results: List[dict],
        top_k: int = 10,
        k: int = 60,
    ) -> List[dict]:
        """Merge two ranked lists with reciprocal rank fusion (RRF).

        Every item contributes ``1 / (k + rank)`` (rank is 1-based) to the
        entry sharing its ``"id"``. Fields of the items are merged into that
        entry; on conflicts the semantic result wins because it is applied last.

        Args:
            lexical_results: Ranked items, best first; each must have an ``"id"``.
            semantic_results: Ranked items, best first; each must have an ``"id"``.
            top_k: Maximum number of fused items to return.
            k: RRF smoothing constant added to every rank.

        Returns:
            New dicts sorted by descending ``"rrf_score"``, at most ``top_k`` long.

        Raises:
            KeyError: If an item has no ``"id"`` entry.
        """
        fused = defaultdict(lambda: {"rrf_score": 0.0})

        for results in (lexical_results, semantic_results):
            for rank, item in enumerate(results, start=1):
                entry = fused[item["id"]]
                entry["rrf_score"] += 1.0 / (k + rank)
                # Never let a stale "rrf_score" carried by an input item
                # overwrite the score being accumulated here.
                entry.update(
                    (field, value)
                    for field, value in item.items()
                    if field != "rrf_score"
                )

        merged = sorted(fused.values(), key=lambda entry: entry["rrf_score"], reverse=True)
        return merged[:top_k]

    def rerank(self, query: str, candidates: List[dict], top_k: int = 6) -> List[dict]:
        """Re-score ``candidates`` against ``query`` with the cross-encoder.

        Each candidate is presented to the model as its file path, optional
        signature and content joined by newlines.

        Callers should pass a small final ``top_k`` (4-6) rather than the
        retrieval search depth: reranking a large pool and forwarding all of
        it to the LLM context hurt answer faithfulness.

        Args:
            query: The user query.
            candidates: Items with ``"file_path"`` and ``"content"`` entries and
                an optional ``"signature"``.
            top_k: Maximum number of reranked items to return.

        Returns:
            Shallow copies of the candidates with ``"rerank_score"`` (float)
            added, sorted by descending score, at most ``top_k`` long.

        Raises:
            KeyError: If a candidate lacks ``"file_path"`` or ``"content"``.
        """
        if not candidates:
            return []

        pairs = [
            [query, f'{item["file_path"]}\n{item.get("signature") or ""}\n{item["content"]}']
            for item in candidates
        ]
        scores = self.reranker.predict(pairs)

        reranked = [
            {**item, "rerank_score": float(score)}
            for item, score in zip(candidates, scores)
        ]
        reranked.sort(key=lambda item: item["rerank_score"], reverse=True)
        return reranked[:top_k]

    @staticmethod
    def normalize_semantic_results(results: List[dict]) -> List[dict]:
        """Annotate semantic results with their rank and a float score.

        Args:
            results: Semantic search hits, best first.

        Returns:
            Shallow copies of ``results`` with ``"semantic_rank"`` (1-based
            position) and ``"semantic_score"`` (float; 0.0 when the score is
            missing or ``None``) set.
        """
        normalized = []
        for rank, item in enumerate(results, start=1):
            raw_score = item.get("semantic_score")
            enriched = dict(item)
            enriched["semantic_rank"] = rank
            # A present-but-None score would make float() raise TypeError.
            enriched["semantic_score"] = 0.0 if raw_score is None else float(raw_score)
            normalized.append(enriched)
        return normalized