Spaces:

technophyle
/

code-compass

Runtime error

App Files Files Community

code-compass / src /hybrid_search.py

technophyle

Sync from GitHub via hub-sync

35c1d2c verified about 15 hours ago

raw

history blame contribute delete

3.5 kB

	import re
	from collections import defaultdict
	from typing import List

	from rank_bm25 import BM25Okapi
	from sentence_transformers import CrossEncoder

	# Identifier-like tokens: start with a letter or underscore and may continue
	# with word characters plus the path/namespace punctuation ". / : -", so
	# that "src/utils.py" or "pkg::name" survive as a single token.
	TOKEN_RE = re.compile(r"[A-Za-z_][A-Za-z0-9_./:-]*")


	def tokenize(text: str) -> List[str]:
	    """Split *text* into lowercase identifier-like tokens for lexical search.

	    Tokens are the matches of ``TOKEN_RE``. Because that pattern also accepts
	    ``. / : -`` as continuation characters, a raw match can end in
	    punctuation that is not part of the identifier (``"x:"`` in
	    ``def f(x: int)``, ``"utils.py."`` at the end of a sentence). Such
	    trailing punctuation is stripped so the token still matches a query for
	    the bare identifier.

	    Args:
	        text: Arbitrary source code or natural-language text.

	    Returns:
	        The tokens in order of appearance, lowercased. Text with no
	        identifier-like run (for example only digits) yields ``[]``.
	    """
	    # Every match starts with a letter or underscore, so rstrip can never
	    # reduce a token to the empty string.
	    return [token.rstrip("./:-").lower() for token in TOKEN_RE.findall(text)]


	class HybridSearchEngine:
	    """Lexical (BM25) search, rank fusion and cross-encoder reranking helpers.

	    Chunks and results are plain dicts. The keys each method reads are listed
	    in that method's docstring; all other keys are passed through untouched.
	    """

	    def __init__(self, reranker_model: str = "cross-encoder/ms-marco-MiniLM-L-6-v2"):
	        """Create the engine and instantiate the cross-encoder reranker.

	        Args:
	            reranker_model: Model identifier handed to ``CrossEncoder``.
	        """
	        self.reranker = CrossEncoder(reranker_model)

	    def build_for_repository(self, repo_id: int, chunks: List[dict]) -> None:
	        """No-op: does nothing and returns ``None``.

	        ``bm25_search`` builds its BM25 index from the chunks passed on each
	        call, so there is nothing to prepare here.
	        NOTE(review): presumably kept so existing callers keep working -
	        confirm before removing.
	        """
	        return None

	    def remove_repository(self, repo_id: int) -> None:
	        """No-op counterpart of ``build_for_repository``; returns ``None``."""
	        return None

	    def bm25_search(self, chunks: List[dict], query: str, top_k: int = 12) -> List[dict]:
	        """Rank *chunks* against *query* with BM25 and return the best ones.

	        Args:
	            chunks: Candidate chunks; each must have a ``"searchable_text"``
	                key (a missing key raises ``KeyError``).
	            query: Free-text query, tokenized with ``tokenize``.
	            top_k: Maximum number of results to return.

	        Returns:
	            Shallow copies of the highest-scoring chunks, best first, each
	            with ``"bm25_score"`` (float) and ``"bm25_rank"`` (1-based)
	            added. Returns ``[]`` when there are no chunks, ``top_k`` is not
	            positive, the query has no tokens, or no chunk has any tokens.
	        """
	        if not chunks or top_k <= 0:
	            return []
	        query_tokens = tokenize(query)
	        if not query_tokens:
	            return []

	        corpus_tokens = [tokenize(chunk["searchable_text"]) for chunk in chunks]
	        # If every document is empty the average document length is zero and
	        # BM25's length normalisation is ill-defined; nothing can match anyway.
	        if not any(corpus_tokens):
	            return []

	        scores = BM25Okapi(corpus_tokens).get_scores(query_tokens)
	        ranked = sorted(
	            zip(chunks, scores),
	            key=lambda pair: pair[1],
	            reverse=True,
	        )[:top_k]

	        results = []
	        for rank, (chunk, score) in enumerate(ranked, start=1):
	            # Copy so the caller's chunk dicts are never mutated.
	            enriched = dict(chunk)
	            enriched["bm25_score"] = float(score)
	            enriched["bm25_rank"] = rank
	            results.append(enriched)
	        return results

	    def reciprocal_rank_fusion(
	        self,
	        lexical_results: List[dict],
	        semantic_results: List[dict],
	        top_k: int = 10,
	        k: int = 60,
	    ) -> List[dict]:
	        """Merge two ranked lists with reciprocal rank fusion.

	        Each occurrence of an item at 1-based position ``rank`` in either
	        list contributes ``1 / (k + rank)`` to that item's fused score.
	        Items are matched on their ``"id"`` key.

	        Args:
	            lexical_results: Ranked results, best first; each needs ``"id"``.
	            semantic_results: Ranked results, best first; each needs ``"id"``.
	            top_k: Maximum number of fused results to return.
	            k: Smoothing constant added to every rank.

	        Returns:
	            New dicts, best fused score first. Each holds the union of the
	            fields seen for that id (semantic values win on conflicts, since
	            that list is merged second) plus ``"rrf_score"``.
	        """
	        scores = defaultdict(float)
	        merged_fields = {}

	        for results in (lexical_results, semantic_results):
	            for rank, item in enumerate(results, start=1):
	                item_id = item["id"]
	                scores[item_id] += 1.0 / (k + rank)
	                merged_fields.setdefault(item_id, {}).update(item)

	        fused = []
	        for item_id, fields in merged_fields.items():
	            # Write the score last so a stale "rrf_score" carried on an
	            # input item can never overwrite the freshly computed one.
	            fields["rrf_score"] = scores[item_id]
	            fused.append(fields)

	        fused.sort(key=lambda item: item["rrf_score"], reverse=True)
	        return fused[:max(top_k, 0)]

	    def rerank(self, query: str, candidates: List[dict], top_k: int = 6) -> List[dict]:
	        """Score *candidates* against *query* with the cross-encoder.

	        Callers should pass a small final ``top_k`` (about 4-6), NOT the
	        retrieval search depth (which was up to 120). Reranking 120 items
	        and then putting all of them into the LLM context was the main
	        cause of poor answer faithfulness.

	        Args:
	            query: The user query.
	            candidates: Chunks to score; each must have ``"file_path"`` and
	                ``"content"``, and may have ``"signature"``.
	            top_k: Maximum number of results to return.

	        Returns:
	            Shallow copies of the best candidates, highest score first, each
	            with ``"rerank_score"`` (float) added. Returns ``[]`` without
	            running the model when there are no candidates or ``top_k`` is
	            not positive.
	        """
	        if not candidates or top_k <= 0:
	            return []

	        pairs = [
	            [query, f'{item["file_path"]}\n{item.get("signature") or ""}\n{item["content"]}']
	            for item in candidates
	        ]
	        scores = self.reranker.predict(pairs)

	        reranked = []
	        for item, score in zip(candidates, scores):
	            enriched = dict(item)
	            enriched["rerank_score"] = float(score)
	            reranked.append(enriched)

	        reranked.sort(key=lambda item: item["rerank_score"], reverse=True)
	        return reranked[:top_k]

	    @staticmethod
	    def normalize_semantic_results(results: List[dict]) -> List[dict]:
	        """Attach a 1-based rank and a float score to semantic results.

	        Args:
	            results: Semantic search hits in ranked order, best first.

	        Returns:
	            Shallow copies of the hits with ``"semantic_rank"`` set to the
	            position in *results* and ``"semantic_score"`` coerced to float.
	            A missing or ``None`` score becomes ``0.0``.
	        """
	        normalized = []
	        for rank, item in enumerate(results, start=1):
	            enriched = dict(item)
	            enriched["semantic_rank"] = rank
	            raw_score = item.get("semantic_score")
	            # float(None) raises TypeError, so treat an explicit None the
	            # same way as an absent key.
	            enriched["semantic_score"] = float(raw_score) if raw_score is not None else 0.0
	            normalized.append(enriched)
	        return normalized