# NOTE(review): the lines "Spaces:" / "Runtime error" below were paste
# artifacts (build-log residue), not Python; kept as comments so the
# module parses.
# Spaces:
# Runtime error
# Runtime error
import re
from collections import defaultdict
from typing import List

from rank_bm25 import BM25Okapi
from sentence_transformers import CrossEncoder
# Identifier-like tokens: must start with a letter or "_", then may continue
# with word chars plus ".", "/", ":", "-" so dotted module paths, file paths,
# and qualified names ("pkg.mod", "src/file.py", "ns::name") stay whole.
TOKEN_RE = re.compile(r"[A-Za-z_][A-Za-z0-9_./:-]*")


def tokenize(text: str) -> List[str]:
    """Return every TOKEN_RE match in *text*, lowercased, in order of appearance."""
    matches = TOKEN_RE.findall(text)
    return [match.lower() for match in matches]
class HybridSearchEngine:
    """Hybrid retrieval: BM25 lexical ranking + reciprocal-rank fusion with
    externally supplied semantic results, followed by cross-encoder reranking.

    Chunk dicts are expected to carry at least "id", "searchable_text",
    "file_path", and "content" keys (assumed from usage below — TODO confirm
    against callers). All ranking methods return *copies* of the input dicts,
    annotated with score/rank fields; inputs are never mutated.
    """

    def __init__(self, reranker_model: str = "cross-encoder/ms-marco-MiniLM-L-6-v2"):
        # Cross-encoder scores (query, passage) pairs jointly; only rerank() uses it.
        self.reranker = CrossEncoder(reranker_model)

    def build_for_repository(self, repo_id: int, chunks: List[dict]):
        # Intentional no-op: the BM25 index is rebuilt per query in
        # bm25_search, so there is no persistent per-repository state.
        return None

    def remove_repository(self, repo_id: int):
        # Intentional no-op counterpart of build_for_repository.
        return None

    def bm25_search(self, chunks: List[dict], query: str, top_k: int = 12) -> List[dict]:
        """Rank *chunks* against *query* with BM25-Okapi.

        Returns up to *top_k* chunk copies, each annotated with
        "bm25_score" (float) and "bm25_rank" (1-based). Returns [] when the
        corpus is empty, the query yields no tokens, or no chunk yields any
        tokens at all.
        """
        if not chunks:
            return []
        tokens = tokenize(query)
        if not tokens:
            return []
        corpus_tokens = [tokenize(chunk["searchable_text"]) for chunk in chunks]
        # FIX: BM25Okapi's scoring divides by the average document length.
        # If every chunk tokenizes to nothing, avgdl is 0 and get_scores
        # raises ZeroDivisionError — guard explicitly. (The previous
        # `BM25Okapi(corpus_tokens) if corpus_tokens else None` check was
        # dead code: chunks is non-empty here, so corpus_tokens is a
        # non-empty list even when its inner lists are all empty.)
        if not any(corpus_tokens):
            return []
        bm25 = BM25Okapi(corpus_tokens)
        scores = bm25.get_scores(tokens)
        ranked = sorted(
            zip(chunks, scores),
            key=lambda item: item[1],
            reverse=True,
        )[:top_k]
        results = []
        for rank, (chunk, score) in enumerate(ranked, start=1):
            chunk = dict(chunk)  # copy so the caller's dicts are not mutated
            chunk["bm25_score"] = float(score)
            chunk["bm25_rank"] = rank
            results.append(chunk)
        return results

    def reciprocal_rank_fusion(
        self,
        lexical_results: List[dict],
        semantic_results: List[dict],
        top_k: int = 10,
        k: int = 60,
    ) -> List[dict]:
        """Merge two ranked lists with Reciprocal Rank Fusion.

        Each occurrence of an item (keyed by item["id"]) contributes
        1 / (k + rank) to its fused score; the item dicts themselves are
        merged, with semantic fields overwriting lexical ones on key
        collision. Returns the *top_k* items by descending "rrf_score".
        k=60 is the standard RRF damping constant from Cormack et al.
        """
        fused = defaultdict(lambda: {"rrf_score": 0.0})
        for rank, item in enumerate(lexical_results, start=1):
            fused[item["id"]]["rrf_score"] += 1.0 / (k + rank)
            fused[item["id"]].update(item)
        for rank, item in enumerate(semantic_results, start=1):
            fused[item["id"]]["rrf_score"] += 1.0 / (k + rank)
            # NOTE: input items must not themselves carry an "rrf_score"
            # key, or this update would clobber the accumulated score —
            # holds for bm25_search / normalize_semantic_results outputs.
            fused[item["id"]].update(item)
        merged = sorted(fused.values(), key=lambda item: item["rrf_score"], reverse=True)
        return merged[:top_k]

    def rerank(self, query: str, candidates: List[dict], top_k: int = 6) -> List[dict]:
        """Rerank *candidates* with the cross-encoder; return the *top_k* best.

        Each returned dict is a copy annotated with "rerank_score" (float).

        FIX: top_k now defaults to 6 and callers should pass a small final number (4-6),
        NOT search_depth (which was up to 120). Reranking 120 items then dumping them
        all into the LLM context was the main faithfulness killer.
        """
        if not candidates:
            return []
        # Passage text = path + optional signature + body, so the cross-encoder
        # sees both where the code lives and what it declares.
        pairs = [
            [query, f'{item["file_path"]}\n{item.get("signature") or ""}\n{item["content"]}']
            for item in candidates
        ]
        scores = self.reranker.predict(pairs)
        reranked = []
        for item, score in zip(candidates, scores):
            enriched = dict(item)
            enriched["rerank_score"] = float(score)
            reranked.append(enriched)
        reranked.sort(key=lambda item: item["rerank_score"], reverse=True)
        return reranked[:top_k]
def normalize_semantic_results(results: List[dict]) -> List[dict]:
    """Return copies of *results*, each stamped with a 1-based "semantic_rank"
    and a float "semantic_score" (defaulting to 0.0 when the key is absent).
    The input dicts are left untouched.
    """
    return [
        {
            **item,
            "semantic_rank": position,
            "semantic_score": float(item.get("semantic_score", 0.0)),
        }
        for position, item in enumerate(results, start=1)
    ]