technophyle commited on
Commit
60b97da
·
verified ·
1 Parent(s): 9d09b0a

Sync from GitHub via hub-sync

Browse files
.dockerignore ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __pycache__/
2
+ *.pyc
3
+ *.pyo
4
+ *.pyd
5
+ .venv/
6
+ venv/
7
+ .env
8
+ .git/
9
+ .gitignore
10
+ *.db
11
+ faiss/
12
+ uploads/
13
+ temp_uploads/
14
+ data/
15
+ rag_system.db
16
+
Dockerfile ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.11-slim
2
+
3
+ RUN apt-get update && apt-get install -y --no-install-recommends \
4
+ build-essential \
5
+ && rm -rf /var/lib/apt/lists/*
6
+
7
+ WORKDIR /app
8
+
9
+ COPY requirements.txt /app/requirements.txt
10
+ RUN pip install --no-cache-dir -r /app/requirements.txt
11
+
12
+ COPY . /app
13
+
14
+ ENV PYTHONUNBUFFERED=1
15
+ EXPOSE 7860
16
+
17
+ CMD ["uvicorn", "server_app:app", "--host", "0.0.0.0", "--port", "7860"]
README.md CHANGED
@@ -1,10 +1,21 @@
1
  ---
2
- title: Code Compass
3
- emoji: 📈
4
  colorFrom: blue
5
- colorTo: gray
6
  sdk: docker
7
- pinned: false
8
  ---
9
 
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: Code Compass API
3
+ emoji: 🚀
4
  colorFrom: blue
5
+ colorTo: indigo
6
  sdk: docker
7
+ app_port: 7860
8
  ---
9
 
10
+ # Code Compass Backend
11
+
12
+ FastAPI backend for a session-oriented GitHub repo QA tool.
13
+
14
+ Behavior:
15
+
16
+ - Clones a public GitHub repo
17
+ - Chunks it with tree-sitter
18
+ - Builds retrieval state with a Qdrant adapter
19
+ - Answers questions with Groq-hosted Llama or Vertex AI Gemini, depending on environment configuration
20
+ - Deletes the cloned repo after indexing
21
+ - Keeps only lightweight repo metadata in SQLite
evals/run_eval.py ADDED
@@ -0,0 +1,688 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import sys
4
+ import asyncio
5
+ import re
6
+ from pathlib import Path
7
+ from collections import Counter, defaultdict
8
+ from statistics import mean
9
+
10
+ import requests
11
+ from dotenv import load_dotenv
12
+
13
+ SERVER_ROOT = Path(__file__).resolve().parents[1]
14
+ if str(SERVER_ROOT) not in sys.path:
15
+ sys.path.insert(0, str(SERVER_ROOT))
16
+
17
+ load_dotenv(SERVER_ROOT / ".env")
18
+
19
+ from src.embeddings import EmbeddingGenerator
20
+
21
+
22
+ API_URL = os.getenv("CODEBASE_RAG_API_URL", "http://localhost:8000")
23
+ REPO_ID = int(os.getenv("CODEBASE_RAG_REPO_ID", "1"))
24
+ SESSION_ID = os.getenv("CODEBASE_RAG_SESSION_ID", "eval-session")
25
+ TOP_K = int(os.getenv("CODEBASE_RAG_TOP_K", "8"))
26
+ QUERY_TIMEOUT_SECONDS = int(os.getenv("CODEBASE_RAG_QUERY_TIMEOUT_SECONDS", "180"))
27
+ ENABLE_RAGAS = os.getenv("CODEBASE_RAG_ENABLE_RAGAS", "1").lower() not in {"0", "false", "no"}
28
+ RAGAS_ASYNC = os.getenv("CODEBASE_RAG_RAGAS_ASYNC", "0").lower() in {"1", "true", "yes"}
29
+ RAGAS_RAISE_EXCEPTIONS = os.getenv("CODEBASE_RAG_RAGAS_RAISE_EXCEPTIONS", "0").lower() in {
30
+ "1",
31
+ "true",
32
+ "yes",
33
+ }
34
+ EVAL_SET_PATH = Path(
35
+ os.getenv(
36
+ "CODEBASE_RAG_EVAL_SET",
37
+ Path(__file__).with_name("sample_eval_set.json"),
38
+ )
39
+ )
40
+
41
+
42
def log(message: str):
    """Write one progress line to stderr, tagged with the eval prefix."""
    sys.stderr.write(f"[eval] {message}\n")
    sys.stderr.flush()
44
+
45
+
46
def load_eval_rows():
    """Parse the eval set file (a JSON list of case dicts) and return it."""
    with EVAL_SET_PATH.open() as handle:
        return json.load(handle)
48
+
49
+
50
def post_query(row):
    """Send one eval case to the /api/query endpoint and return its JSON body.

    Raises RuntimeError (with the server-provided detail when available)
    on any non-2xx response.
    """
    body = {
        "repo_id": REPO_ID,
        "question": row["question"],
        "top_k": TOP_K,
        "history": row.get("turns", []),
    }
    response = requests.post(
        f"{API_URL}/api/query",
        json=body,
        headers={"X-Session-Id": SESSION_ID},
        timeout=QUERY_TIMEOUT_SECONDS,
    )
    if response.ok:
        return response.json()

    # Prefer the structured "detail" field when the server returned JSON;
    # fall back to the raw response text otherwise.
    detail = response.text
    try:
        parsed = response.json()
        detail = parsed.get("detail") or parsed
    except Exception:
        pass
    raise RuntimeError(
        f"Query failed for eval case {row.get('id', row['question'])!r} "
        f"with status {response.status_code}: {detail}"
    )
75
+
76
+
77
def normalize_path(path: str) -> str:
    """Normalize a file path for comparison.

    Trims surrounding whitespace, removes any leading "./" segments and
    leading slashes, and lowercases the result.

    Bug fix: the previous ``lstrip("./")`` stripped the *character set*
    {'.', '/'}, so dotfiles lost their dot (".env" became "env" and
    "..hidden" became "hidden"), breaking expected-source matching for
    such paths. Only the literal "./" prefix and leading slashes are
    removed now.
    """
    cleaned = path.strip()
    # Drop repeated "./" prefixes without eating dots that belong to names.
    while cleaned.startswith("./"):
        cleaned = cleaned[2:]
    return cleaned.lstrip("/").lower()
79
+
80
+
81
# Common English function words ignored when comparing answer text
# against reference text (see lexical_overlap_ratio).
STOPWORDS = set(
    "a an and are as at be by for from how in into is it its of on or "
    "that the their this to what when where which with".split()
)
112
+
113
+
114
def tokenize_text(text: str):
    """Lowercase *text* and return identifier-ish tokens.

    Tokens are runs of letters, digits, and the path/punctuation
    characters ``_ . / + -``, so file paths survive as single tokens.
    """
    lowered = text.lower() if text else ""
    return re.findall(r"[a-z0-9_./+-]+", lowered)
116
+
117
+
118
def compute_retrieval_metrics(expected_sources, actual_sources):
    """Compute deterministic retrieval metrics for one eval case.

    Args:
        expected_sources: paths the case expects to be cited.
        actual_sources: paths actually cited, in rank order (may repeat).

    An actual path matches an expected entry when they are equal after
    normalization, or when the expected entry contains no "/" (treated as
    a directory name) and the actual path lives under that directory.

    Returns a dict with retrieval_hit, source_recall, mrr, top1_hit,
    unique_source_precision, and duplicate_source_rate.

    Refactor note: the directory-prefix matching rule was previously
    duplicated verbatim inside the recall loop; it is now a single helper
    so the two code paths cannot drift apart.
    """
    expected = {normalize_path(path) for path in expected_sources}
    actual = [normalize_path(path) for path in actual_sources]
    unique_actual = list(dict.fromkeys(actual))  # order-preserving dedupe

    def pair_matches(actual_path: str, expected_path: str) -> bool:
        # Exact match, or "bare name" expected entries act as directory prefixes.
        if actual_path == expected_path:
            return True
        return "/" not in expected_path and actual_path.startswith(
            expected_path.rstrip("/") + "/"
        )

    def matches_expected(actual_path: str) -> bool:
        return any(pair_matches(actual_path, exp) for exp in expected)

    hit = 1 if any(matches_expected(path) for path in actual) else 0

    recall = 0.0
    if expected:
        matched_expected = {
            exp
            for exp in expected
            if any(pair_matches(path, exp) for path in actual)
        }
        recall = len(matched_expected) / len(expected)

    # Reciprocal rank of the first matching citation (0 when none match).
    mrr = 0.0
    for rank, path in enumerate(actual, start=1):
        if matches_expected(path):
            mrr = 1.0 / rank
            break

    matched_unique = sum(1 for path in unique_actual if matches_expected(path))
    return {
        "retrieval_hit": hit,
        "source_recall": recall,
        "mrr": mrr,
        "top1_hit": 1 if actual and matches_expected(actual[0]) else 0,
        "unique_source_precision": (
            matched_unique / len(unique_actual) if unique_actual else 0.0
        ),
        "duplicate_source_rate": (
            (len(actual) - len(unique_actual)) / len(actual) if actual else 0.0
        ),
    }
166
+
167
+
168
def keyword_match_ratio(row, answer: str):
    """Return the fraction of the case's ``must_include_any`` keywords that
    appear in *answer* (case-insensitive substring match).

    Returns None when the case defines no non-blank keywords, so callers
    can distinguish "check not applicable" from "no keywords matched".

    Fix: keywords are coerced with ``str()`` before filtering/lowering,
    matching sibling ``keyword_pass`` and avoiding an AttributeError on
    non-string entries in the eval set.
    """
    keywords = [
        str(keyword).lower()
        for keyword in row.get("must_include_any", [])
        if str(keyword).strip()
    ]
    if not keywords:
        return None
    lowered = answer.lower()
    matched = sum(1 for keyword in keywords if keyword in lowered)
    return matched / len(keywords)
175
+
176
+
177
+ def keyword_pass(row, answer: str, coverage: float | None):
178
+ if coverage is None:
179
+ return None
180
+ minimum = int(row.get("min_keyword_matches", 1))
181
+ keywords = [keyword for keyword in row.get("must_include_any", []) if str(keyword).strip()]
182
+ if not keywords:
183
+ return None
184
+ matched = round(coverage * len(keywords))
185
+ return 1 if matched >= minimum else 0
186
+
187
+
188
def answer_length_metrics(answer: str):
    """Token-count stats for an answer; "substantive" means >= 40 tokens."""
    word_count = len(tokenize_text(answer))
    return {
        "answer_word_count": word_count,
        "has_substantive_answer": int(word_count >= 40),
    }
194
+
195
+
196
def lexical_overlap_ratio(reference: str, candidate: str):
    """Fraction of *reference* content terms also present in *candidate*.

    Content terms are tokens longer than two characters that are not
    stopwords. Returns None when the reference yields no content terms.
    """
    reference_terms = {
        token
        for token in tokenize_text(reference)
        if len(token) > 2 and token not in STOPWORDS
    }
    if not reference_terms:
        return None
    candidate_terms = set(tokenize_text(candidate))
    return len(reference_terms & candidate_terms) / len(reference_terms)
206
+
207
+
208
def validate_eval_rows(rows):
    """Audit the eval set and return a summary dict.

    Collects hard errors (missing question/ground_truth, malformed
    expected_sources/must_include_any, duplicate ids) and soft warnings
    about dataset scope (size, category breadth, multi-turn coverage,
    dominant id prefix). The returned dict includes counts, averages,
    the error/warning lists, and ``is_valid`` (True when no errors).
    """
    errors = []
    warnings = []
    category_counts = Counter()
    id_counts = Counter()
    id_prefix_counts = Counter()
    expected_source_counts = []
    keyword_counts = []
    conversation_cases = 0

    for index, row in enumerate(rows, start=1):
        # Fall back to a positional id so error messages always name a case.
        row_id = row.get("id") or f"row-{index}"
        id_counts[row_id] += 1
        # The id prefix (text before the first "-") is used below to detect
        # a benchmark focused on a single target project.
        prefix = row_id.split("-", 1)[0].lower()
        if prefix:
            id_prefix_counts[prefix] += 1
        category_counts[row.get("category", "general")] += 1

        question = str(row.get("question", "")).strip()
        ground_truth = str(row.get("ground_truth", "")).strip()
        expected_sources = row.get("expected_sources", [])
        must_include_any = row.get("must_include_any", [])

        if not question:
            errors.append(f"{row_id}: missing question")
        if not ground_truth:
            errors.append(f"{row_id}: missing ground_truth")
        if not isinstance(expected_sources, list) or not expected_sources:
            errors.append(f"{row_id}: expected_sources must be a non-empty list")
        if must_include_any and not isinstance(must_include_any, list):
            errors.append(f"{row_id}: must_include_any must be a list when present")
        if row.get("turns"):
            conversation_cases += 1
        expected_source_counts.append(len(expected_sources) if isinstance(expected_sources, list) else 0)
        keyword_counts.append(len(must_include_any) if isinstance(must_include_any, list) else 0)

    duplicate_ids = sorted(row_id for row_id, count in id_counts.items() if count > 1)
    if duplicate_ids:
        errors.append(f"duplicate ids found: {', '.join(duplicate_ids)}")

    # Soft warnings: the set is still usable, but its scope is limited.
    if len(rows) < 25:
        warnings.append(
            "Eval set has fewer than 25 cases. Good for iteration, but light for resume-grade benchmarking."
        )
    if len(category_counts) < 4:
        warnings.append("Eval set covers fewer than 4 categories, so breadth is limited.")
    if conversation_cases < 2:
        warnings.append("Eval set has very little multi-turn coverage.")
    if category_counts and min(category_counts.values()) < 2:
        sparse = sorted(category for category, count in category_counts.items() if count < 2)
        warnings.append(f"Some categories are underrepresented: {', '.join(sparse)}.")

    if id_prefix_counts:
        # When >= 80% of ids share one prefix, the benchmark likely targets
        # a single project rather than diverse repositories.
        dominant_prefix, dominant_count = id_prefix_counts.most_common(1)[0]
        if dominant_count / len(rows) >= 0.8:
            warnings.append(
                f"Most cases share the same id prefix ({dominant_prefix}), which suggests a benchmark focused on one target project."
            )

    return {
        "case_count": len(rows),
        "category_counts": dict(sorted(category_counts.items())),
        "conversation_case_count": conversation_cases,
        "average_expected_sources": round(mean(expected_source_counts), 2) if expected_source_counts else 0.0,
        "average_keywords_per_case": round(mean(keyword_counts), 2) if keyword_counts else 0.0,
        "errors": errors,
        "warnings": warnings,
        "is_valid": not errors,
    }
277
+
278
+
279
def summarize_custom_metrics(details):
    """Aggregate per-case metrics (from run()) into dataset-level averages.

    Metrics whose per-case value can be None (keyword coverage/pass,
    lexical overlap) are averaged only over the cases where the check
    applied, and reported as None when it applied to no case.
    """
    keyword_coverages = [item["keyword_coverage"] for item in details if item["keyword_coverage"] is not None]
    keyword_passes = [item["keyword_pass"] for item in details if item["keyword_pass"] is not None]
    # "Grounded" = retrieval hit + substantive answer + keyword gate not failed
    # (a missing keyword check counts as not failed).
    grounded_answer_passes = [
        1
        for item in details
        if item["retrieval_hit"] == 1
        and item["has_substantive_answer"] == 1
        and (item["keyword_pass"] in {None, 1})
    ]
    exact_source_recall_cases = [1 for item in details if item["source_recall"] == 1.0]
    return {
        "retrieval_hit_rate": round(mean(item["retrieval_hit"] for item in details), 4),
        "top1_hit_rate": round(mean(item["top1_hit"] for item in details), 4),
        "source_recall": round(mean(item["source_recall"] for item in details), 4),
        "mrr": round(mean(item["mrr"] for item in details), 4),
        "unique_source_precision": round(mean(item["unique_source_precision"] for item in details), 4),
        "duplicate_source_rate": round(mean(item["duplicate_source_rate"] for item in details), 4),
        "keyword_coverage": round(mean(keyword_coverages), 4) if keyword_coverages else None,
        "keyword_pass_rate": round(mean(keyword_passes), 4) if keyword_passes else None,
        "ground_truth_lexical_overlap": round(
            mean(item["ground_truth_lexical_overlap"] for item in details if item["ground_truth_lexical_overlap"] is not None),
            4,
        )
        if any(item["ground_truth_lexical_overlap"] is not None for item in details)
        else None,
        "substantive_answer_rate": round(mean(item["has_substantive_answer"] for item in details), 4),
        # These two are rates over ALL cases, hence sum/len rather than mean
        # over a filtered subset.
        "grounded_answer_rate": round(sum(grounded_answer_passes) / len(details), 4) if details else 0.0,
        "exact_source_recall_rate": round(sum(exact_source_recall_cases) / len(details), 4) if details else 0.0,
    }
309
+
310
+
311
def summarize_by_category(details):
    """Aggregate per-case metrics into per-category averages.

    Returns a dict keyed by category (sorted), each value holding the
    case count, retrieval averages, keyword pass rate (None when no case
    in the category had a keyword check), and grounded answer rate.
    """
    grouped = defaultdict(list)
    for item in details:
        grouped[item["category"]].append(item)

    def grounded(item) -> int:
        # Retrieval hit + substantive answer + keyword gate not failed.
        passed = (
            item["retrieval_hit"] == 1
            and item["has_substantive_answer"] == 1
            and item["keyword_pass"] in {None, 1}
        )
        return 1 if passed else 0

    summary = {}
    for category in sorted(grouped):
        items = grouped[category]
        gate_scores = [it["keyword_pass"] for it in items if it["keyword_pass"] is not None]
        summary[category] = {
            "case_count": len(items),
            "retrieval_hit_rate": round(mean(it["retrieval_hit"] for it in items), 4),
            "top1_hit_rate": round(mean(it["top1_hit"] for it in items), 4),
            "source_recall": round(mean(it["source_recall"] for it in items), 4),
            "mrr": round(mean(it["mrr"] for it in items), 4),
            "keyword_pass_rate": round(mean(gate_scores), 4) if gate_scores else None,
            "grounded_answer_rate": round(mean(grounded(it) for it in items), 4),
        }
    return summary
337
+
338
+
339
def build_headline_metrics(custom_metrics, audit):
    """Select the headline numbers shown at the top of the eval report."""
    headline = {
        "sample_size": audit["case_count"],
        "category_count": len(audit["category_counts"]),
    }
    # Copy the deterministic metrics through in report order.
    for key in (
        "retrieval_hit_rate",
        "top1_hit_rate",
        "mrr",
        "source_recall",
        "grounded_answer_rate",
        "keyword_pass_rate",
    ):
        headline[key] = custom_metrics[key]
    return headline
350
+
351
+
352
def build_resume_summary(custom_metrics, audit, ragas_report, ragas_error):
    """Build a one-paragraph English summary of the eval run.

    Leads with dataset size and the deterministic retrieval/answer
    metrics; appends LLM-judge numbers only when RAGAS ran cleanly, and
    surfaces up to two dataset warnings as a closing caveat.
    """
    lines = [
        (
            f"Evaluated on {audit['case_count']} repo-QA cases across "
            f"{len(audit['category_counts'])} categories."
        ),
        (
            f"Deterministic retrieval metrics: hit@{TOP_K} {custom_metrics['retrieval_hit_rate']:.1%}, "
            f"top-1 hit {custom_metrics['top1_hit_rate']:.1%}, MRR {custom_metrics['mrr']:.3f}, "
            f"source recall {custom_metrics['source_recall']:.1%}."
        ),
        (
            f"Answer quality checks: grounded answer rate {custom_metrics['grounded_answer_rate']:.1%}"
            # Keyword pass rate is only mentioned when the check applied.
            + (
                f", keyword/checklist pass rate {custom_metrics['keyword_pass_rate']:.1%}."
                if custom_metrics["keyword_pass_rate"] is not None
                else "."
            )
        ),
    ]

    if ragas_report and not ragas_error:
        lines.append(
            "LLM-judge metrics (supporting signal, not primary headline): "
            f"faithfulness {ragas_report.get('faithfulness', 0.0):.3f}, "
            f"answer relevancy {ragas_report.get('answer_relevancy', 0.0):.3f}, "
            f"context precision {ragas_report.get('context_precision', 0.0):.3f}."
        )
    else:
        lines.append("LLM-judge metrics were skipped or unstable, so headline metrics rely on deterministic checks.")

    if audit["warnings"]:
        # Cap at two warnings to keep the summary to one paragraph.
        lines.append(
            "Benchmark caveat: "
            + " ".join(audit["warnings"][:2])
        )

    return " ".join(lines)
390
+
391
+
392
def benchmark_readiness(audit, ragas_error):
    """Classify the run as presentation-ready or an internal/demo benchmark.

    Any dataset-scope shortfall or an unexpected RAGAS failure downgrades
    the status; the reasons list explains each downgrade.
    """
    reasons = []
    if audit["case_count"] < 25:
        reasons.append("small_sample")
    if len(audit["category_counts"]) < 4:
        reasons.append("limited_category_coverage")
    if audit["conversation_case_count"] < 2:
        reasons.append("limited_multi_turn_coverage")
    if audit["warnings"]:
        reasons.append("dataset_scope_warnings")
    # "disabled" is a deliberate skip, not instability.
    if ragas_error is not None and ragas_error != "disabled":
        reasons.append("ragas_instability")

    status = "internal_or_demo_benchmark" if reasons else "presentation_ready"
    return {"status": status, "reasons": reasons}
414
+
415
+
416
def maybe_write_report(report):
    """Persist the report as JSON when CODEBASE_RAG_EVAL_OUTPUT is set.

    Creates parent directories as needed. Returns the written path as a
    string, or None when no output path is configured.
    """
    configured = os.getenv("CODEBASE_RAG_EVAL_OUTPUT")
    if not configured:
        return None
    target = Path(configured)
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text(json.dumps(report, indent=2))
    return str(target)
424
+
425
+
426
def build_vertex_ragas_llm(run_config):
    """Build a RAGAS-compatible LLM adapter backed by Vertex AI Gemini.

    Imports the third-party SDKs lazily so the module loads even when the
    RAGAS/Vertex extras are not installed. Reads GOOGLE_CLOUD_PROJECT,
    GOOGLE_CLOUD_LOCATION, and EVAL_MODEL/VERTEX_LLM_MODEL from the
    environment; raises RuntimeError when the project is unset.
    """
    from google import genai
    from langchain_core.outputs import Generation, LLMResult
    from ragas.llms.base import BaseRagasLLM

    class VertexRagasLLM(BaseRagasLLM):
        # Adapter: translates RAGAS prompt objects into google-genai
        # generate_content calls and wraps responses as LLMResult.
        def __init__(self, model: str, project: str, location: str, run_config):
            self.client = genai.Client(
                vertexai=True,
                project=project,
                location=location,
            )
            self.model = model
            self.set_run_config(run_config)

        def _prompt_to_text(self, prompt):
            # Prepend a strict-JSON instruction: RAGAS parses the judge's
            # output, so markdown fences or prose would break scoring.
            prefix = (
                "Return only valid JSON or the exact structured output requested by the prompt. "
                "Do not add markdown fences, explanations, or extra prose.\n\n"
            )
            if hasattr(prompt, "to_string"):
                return prefix + prompt.to_string()
            return prefix + str(prompt)

        def _generate_once(self, prompt, n=1, temperature=1e-8, stop=None, callbacks=None):
            # Single synchronous generation; the incoming `temperature` is
            # intentionally ignored in favor of a deterministic 0.0 judge.
            prompt_text = self._prompt_to_text(prompt)
            config = {
                "temperature": 0.0,
                "candidate_count": max(1, n),
                "max_output_tokens": int(os.getenv("EVAL_MAX_OUTPUT_TOKENS", "2048")),
                "response_mime_type": "application/json",
            }
            if stop:
                config["stop_sequences"] = stop

            response = self.client.models.generate_content(
                model=self.model,
                contents=prompt_text,
                config=config,
            )

            # Extract text per candidate; fall back to joining content parts
            # when the candidate has no direct .text attribute.
            candidates = getattr(response, "candidates", None) or []
            generations = []
            if candidates:
                for candidate in candidates[: max(1, n)]:
                    text = getattr(candidate, "text", None)
                    if text is None and hasattr(candidate, "content"):
                        parts = getattr(candidate.content, "parts", None) or []
                        text = "".join(getattr(part, "text", "") for part in parts if getattr(part, "text", ""))
                    generations.append(Generation(text=(text or "").strip()))
            elif getattr(response, "text", None):
                generations.append(Generation(text=response.text.strip()))

            if not generations:
                raise RuntimeError("Vertex AI judge returned an empty response.")

            return LLMResult(generations=[generations])

        def generate_text(self, prompt, n=1, temperature=1e-8, stop=None, callbacks=None):
            return self._generate_once(
                prompt=prompt,
                n=n,
                temperature=temperature,
                stop=stop,
                callbacks=callbacks,
            )

        async def agenerate_text(self, prompt, n=1, temperature=1e-8, stop=None, callbacks=None):
            # The genai client call is blocking; run it off the event loop.
            return await asyncio.to_thread(
                self._generate_once,
                prompt,
                n,
                temperature,
                stop,
                callbacks,
            )

    project = os.getenv("GOOGLE_CLOUD_PROJECT")
    location = os.getenv("GOOGLE_CLOUD_LOCATION", "us-central1")
    model = os.getenv("EVAL_MODEL", os.getenv("VERTEX_LLM_MODEL", "gemini-2.5-pro"))
    if not project:
        raise RuntimeError("GOOGLE_CLOUD_PROJECT must be set for Vertex AI RAGAS evaluation.")
    return VertexRagasLLM(model=model, project=project, location=location, run_config=run_config)
509
+
510
+
511
def build_ragas_embeddings(run_config):
    """Wrap the app's EmbeddingGenerator as a RAGAS embeddings provider.

    Reuses the same embedding model the application indexes with, so
    RAGAS context metrics are computed in the same vector space.
    """
    from ragas.embeddings.base import BaseRagasEmbeddings

    class AppEmbeddingWrapper(BaseRagasEmbeddings):
        def __init__(self, generator, run_config):
            self.generator = generator
            self.set_run_config(run_config)

        def embed_query(self, text):
            # .tolist(): RAGAS expects plain Python lists, not arrays.
            return self.generator.embed_text(text).tolist()

        def embed_documents(self, texts):
            vectors = self.generator.embed_batch(list(texts))
            return vectors.tolist()

        async def aembed_query(self, text):
            # Embedding is CPU/blocking work; keep it off the event loop.
            return await asyncio.to_thread(self.embed_query, text)

        async def aembed_documents(self, texts):
            return await asyncio.to_thread(self.embed_documents, texts)

    return AppEmbeddingWrapper(EmbeddingGenerator(), run_config=run_config)
533
+
534
+
535
def run_ragas(rows, outputs):
    """Run RAGAS LLM-judge metrics over the collected query outputs.

    Returns a (metrics, error) pair: ({metric: float}, None) on success,
    (None, "disabled") when turned off via env, (None, "import_error: …")
    when the optional dependencies are missing, or (None, str(exc)) when
    evaluation itself fails — callers treat any error as "judge metrics
    unavailable" rather than aborting the eval.
    """
    if not ENABLE_RAGAS:
        log("RAGAS disabled via CODEBASE_RAG_ENABLE_RAGAS=0. Reporting custom metrics only.")
        return None, "disabled"

    # Lazy imports: ragas/datasets are heavy optional dependencies.
    try:
        from datasets import Dataset
        from ragas import evaluate
        from ragas.metrics import answer_relevancy, context_precision, faithfulness
        from ragas.run_config import RunConfig
    except Exception as exc:
        log(f"Skipping RAGAS because the evaluation dependencies could not be loaded: {exc}")
        return None, f"import_error: {exc}"

    def build_ragas_dataset():
        # One sample per eval case: question, generated answer, the cited
        # snippets as contexts, and the reference ground truth.
        samples = []
        for row, result in zip(rows, outputs):
            samples.append(
                {
                    "question": row["question"],
                    "answer": result["answer"],
                    "contexts": [source["snippet"] for source in result.get("sources", [])],
                    "ground_truth": row["ground_truth"],
                }
            )
        return Dataset.from_list(samples)

    log("Running RAGAS metrics. This can take a while.")
    try:
        # Timeouts/concurrency are all env-tunable for slow judge models.
        timeout_seconds = int(os.getenv("EVAL_TIMEOUT_SECONDS", "180"))
        thread_timeout_seconds = float(os.getenv("EVAL_THREAD_TIMEOUT_SECONDS", str(max(timeout_seconds, 240))))
        max_workers = int(os.getenv("EVAL_MAX_WORKERS", "4"))
        run_config = RunConfig(
            timeout=timeout_seconds,
            thread_timeout=thread_timeout_seconds,
            max_workers=max_workers,
            max_retries=int(os.getenv("EVAL_MAX_RETRIES", "3")),
            max_wait=int(os.getenv("EVAL_MAX_WAIT_SECONDS", "60")),
        )
        log(
            "Using Vertex AI for RAGAS judge model "
            f"({os.getenv('EVAL_MODEL', os.getenv('VERTEX_LLM_MODEL', 'gemini-2.5-pro'))})"
        )
        log(
            f"RAGAS runtime: async={RAGAS_ASYNC}, raise_exceptions={RAGAS_RAISE_EXCEPTIONS}, "
            f"timeout={timeout_seconds}s, thread_timeout={thread_timeout_seconds}s, max_workers={max_workers}"
        )
        llm = build_vertex_ragas_llm(run_config)
        embeddings = build_ragas_embeddings(run_config)
        ragas_report = evaluate(
            build_ragas_dataset(),
            metrics=[faithfulness, answer_relevancy, context_precision],
            llm=llm,
            embeddings=embeddings,
            run_config=run_config,
            is_async=RAGAS_ASYNC,
            raise_exceptions=RAGAS_RAISE_EXCEPTIONS,
        )
        # Coerce to plain floats so the report stays JSON-serializable.
        return {key: float(value) for key, value in ragas_report.items()}, None
    except Exception as exc:
        log(f"RAGAS evaluation failed: {exc}")
        return None, str(exc)
597
+
598
+
599
def run():
    """Run the full evaluation: validate the set, query every case,
    compute deterministic and (optionally) RAGAS metrics, and emit the
    JSON report to stdout (and optionally to a file).

    Raises RuntimeError when the eval set fails validation; warnings are
    only logged.
    """
    log(f"Loading eval set from {EVAL_SET_PATH}")
    rows = load_eval_rows()
    audit = validate_eval_rows(rows)
    if audit["errors"]:
        raise RuntimeError("Eval set validation failed: " + "; ".join(audit["errors"]))
    for warning in audit["warnings"]:
        log(f"Eval set warning: {warning}")
    log(
        f"Starting eval with api_url={API_URL}, repo_id={REPO_ID}, "
        f"session_id={SESSION_ID}, top_k={TOP_K}, cases={len(rows)}"
    )
    outputs = []   # raw API responses, in eval-set order (fed to RAGAS)
    details = []   # per-case metric dicts (fed to the summaries)

    for index, row in enumerate(rows, start=1):
        case_id = row.get("id", row["question"])
        log(f"[{index}/{len(rows)}] Querying case {case_id}")
        result = post_query(row)
        outputs.append(result)
        log(
            f"[{index}/{len(rows)}] Received answer for {case_id} "
            f"with {len(result.get('sources', []))} sources"
        )

        # Deterministic per-case checks: retrieval, keywords, length, overlap.
        cited_paths = [source["file_path"] for source in result.get("sources", [])]
        metrics = compute_retrieval_metrics(row.get("expected_sources", []), cited_paths)
        keyword_coverage = keyword_match_ratio(row, result.get("answer", ""))
        keyword_gate = keyword_pass(row, result.get("answer", ""), keyword_coverage)
        length_metrics = answer_length_metrics(result.get("answer", ""))
        overlap = lexical_overlap_ratio(row.get("ground_truth", ""), result.get("answer", ""))

        details.append(
            {
                "id": row.get("id", row["question"]),
                "category": row.get("category", "general"),
                "question": row["question"],
                "answer": result.get("answer", ""),
                "expected_sources": row.get("expected_sources", []),
                "retrieved_sources": cited_paths,
                "retrieval_hit": metrics["retrieval_hit"],
                "source_recall": metrics["source_recall"],
                "mrr": metrics["mrr"],
                "top1_hit": metrics["top1_hit"],
                "unique_source_precision": metrics["unique_source_precision"],
                "duplicate_source_rate": metrics["duplicate_source_rate"],
                "keyword_coverage": keyword_coverage,
                "keyword_pass": keyword_gate,
                "ground_truth_lexical_overlap": overlap,
                **length_metrics,
            }
        )

    log("Finished query loop. Computing aggregate metrics.")
    custom_metrics = summarize_custom_metrics(details)
    category_breakdown = summarize_by_category(details)
    ragas_report, ragas_error = run_ragas(rows, outputs)
    headline_metrics = build_headline_metrics(custom_metrics, audit)
    resume_summary = build_resume_summary(custom_metrics, audit, ragas_report, ragas_error)
    readiness = benchmark_readiness(audit, ragas_error)

    report = {
        "config": {
            "api_url": API_URL,
            "repo_id": REPO_ID,
            "session_id": SESSION_ID,
            "top_k": TOP_K,
            "query_timeout_seconds": QUERY_TIMEOUT_SECONDS,
            "eval_set": str(EVAL_SET_PATH),
        },
        "eval_set_audit": audit,
        "headline_metrics": headline_metrics,
        "benchmark_readiness": readiness,
        "ragas": ragas_report,
        "ragas_error": ragas_error,
        "custom_metrics": custom_metrics,
        "category_breakdown": category_breakdown,
        "resume_summary": resume_summary,
        "cases": details,
    }
    output_path = maybe_write_report(report)
    if output_path:
        log(f"Wrote JSON report to {output_path}")

    # Logs go to stderr, so stdout stays a clean machine-readable report.
    log("Eval complete. Printing JSON report.")
    print(json.dumps(report, indent=2))
685
+
686
+
687
+ if __name__ == "__main__":
688
+ run()
evals/sample_eval_set.json ADDED
@@ -0,0 +1,673 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "id": "sqlmodel-purpose",
4
+ "category": "architecture",
5
+ "question": "What is SQLModel and how is it positioned relative to Pydantic and SQLAlchemy?",
6
+ "ground_truth": "SQLModel is a thin layer designed to combine Pydantic-style data modeling with SQLAlchemy ORM and SQL expression features. The project presents itself as a library for SQL databases in Python that emphasizes simplicity, compatibility, and robustness while being built on top of Pydantic and SQLAlchemy.",
7
+ "expected_sources": [
8
+ "README.md",
9
+ "sqlmodel/__init__.py",
10
+ "sqlmodel/main.py"
11
+ ],
12
+ "must_include_any": [
13
+ "Pydantic",
14
+ "SQLAlchemy",
15
+ "thin layer"
16
+ ],
17
+ "min_keyword_matches": 2
18
+ },
19
+ {
20
+ "id": "sqlmodel-core-model-class",
21
+ "category": "architecture",
22
+ "question": "Where is the core SQLModel base class defined and what is its role?",
23
+ "ground_truth": "The SQLModel base class is defined in sqlmodel/main.py. It acts as the main model base that bridges typed field definitions, Pydantic-compatible validation behavior, and SQLAlchemy table or ORM metadata.",
24
+ "expected_sources": [
25
+ "sqlmodel/main.py"
26
+ ],
27
+ "must_include_any": [
28
+ "SQLModel",
29
+ "base class",
30
+ "Pydantic",
31
+ "SQLAlchemy"
32
+ ],
33
+ "min_keyword_matches": 2
34
+ },
35
+ {
36
+ "id": "sqlmodel-field-helper",
37
+ "category": "architecture",
38
+ "question": "How does SQLModel expose field declarations for model attributes?",
39
+ "ground_truth": "SQLModel exposes a Field helper in sqlmodel/main.py and re-exports it at the package level. Field collects model metadata such as defaults, primary key flags, indexes, foreign keys, nullability, and other column-related settings used when building SQL-backed models.",
40
+ "expected_sources": [
41
+ "sqlmodel/main.py",
42
+ "sqlmodel/__init__.py"
43
+ ],
44
+ "must_include_any": [
45
+ "Field",
46
+ "primary key",
47
+ "foreign key",
48
+ "re-export"
49
+ ],
50
+ "min_keyword_matches": 2
51
+ },
52
+ {
53
+ "id": "sqlmodel-relationship-helper",
54
+ "category": "architecture",
55
+ "question": "How are relationships modeled in SQLModel?",
56
+ "ground_truth": "Relationships are declared through the Relationship helper and associated metadata in sqlmodel/main.py. SQLModel captures relationship configuration separately from normal field definitions so relationship behavior can be translated into SQLAlchemy ORM relationship setup.",
57
+ "expected_sources": [
58
+ "sqlmodel/main.py",
59
+ "sqlmodel/__init__.py"
60
+ ],
61
+ "must_include_any": [
62
+ "Relationship",
63
+ "relationship",
64
+ "SQLAlchemy",
65
+ "metadata"
66
+ ],
67
+ "min_keyword_matches": 2
68
+ },
69
+ {
70
+ "id": "sqlmodel-field-function",
71
+ "category": "specific-function",
72
+ "question": "What does the Field() function do in SQLModel?",
73
+ "ground_truth": "Field defines metadata for a model attribute, including validation defaults and SQL column configuration such as primary_key, foreign_key, index, nullable, sa_type, or sa_column options. SQLModel uses that metadata when constructing models that can also map to tables.",
74
+ "expected_sources": [
75
+ "sqlmodel/main.py"
76
+ ],
77
+ "must_include_any": [
78
+ "Field",
79
+ "primary_key",
80
+ "nullable",
81
+ "column"
82
+ ],
83
+ "min_keyword_matches": 2
84
+ },
85
+ {
86
+ "id": "sqlmodel-relationship-function",
87
+ "category": "specific-function",
88
+ "question": "What does Relationship() do in SQLModel?",
89
+ "ground_truth": "Relationship captures relationship-specific configuration for ORM links between models, such as back_populates and SQLAlchemy relationship arguments. It provides structured metadata that SQLModel can later translate into SQLAlchemy relationship objects.",
90
+ "expected_sources": [
91
+ "sqlmodel/main.py"
92
+ ],
93
+ "must_include_any": [
94
+ "Relationship",
95
+ "back_populates",
96
+ "relationship",
97
+ "metadata"
98
+ ],
99
+ "min_keyword_matches": 2
100
+ },
101
+ {
102
+ "id": "sqlmodel-session-exec",
103
+ "category": "specific-function",
104
+ "question": "What is special about Session.exec() in SQLModel?",
105
+ "ground_truth": "SQLModel provides a Session class with an exec helper that offers a friendlier typed wrapper around SQLAlchemy execution patterns, especially for SQLModel select statements. It is intended to make common query execution more ergonomic than raw SQLAlchemy session.execute calls.",
106
+ "expected_sources": [
107
+ "sqlmodel/orm/session.py",
108
+ "sqlmodel/__init__.py"
109
+ ],
110
+ "must_include_any": [
111
+ "Session",
112
+ "exec",
113
+ "execute",
114
+ "typed"
115
+ ],
116
+ "min_keyword_matches": 2
117
+ },
118
+ {
119
+ "id": "sqlmodel-async-session-exec",
120
+ "category": "specific-function",
121
+ "question": "How does async query execution work in SQLModel?",
122
+ "ground_truth": "SQLModel provides async session support under sqlmodel.ext.asyncio.session, including an async session wrapper that supports exec-style query execution for SQLModel statements in asynchronous applications.",
123
+ "expected_sources": [
124
+ "sqlmodel/ext/asyncio/session.py"
125
+ ],
126
+ "must_include_any": [
127
+ "async",
128
+ "AsyncSession",
129
+ "exec",
130
+ "greenlet"
131
+ ],
132
+ "min_keyword_matches": 2
133
+ },
134
+ {
135
+ "id": "sqlmodel-select-export",
136
+ "category": "specific-function",
137
+ "question": "How is select exposed to users in SQLModel?",
138
+ "ground_truth": "SQLModel re-exports a select helper from its SQL expression layer so users can write typed select statements directly from the sqlmodel package instead of importing SQLAlchemy primitives manually.",
139
+ "expected_sources": [
140
+ "sqlmodel/__init__.py",
141
+ "sqlmodel/sql/expression.py"
142
+ ],
143
+ "must_include_any": [
144
+ "select",
145
+ "re-export",
146
+ "expression",
147
+ "sqlmodel"
148
+ ],
149
+ "min_keyword_matches": 2
150
+ },
151
+ {
152
+ "id": "sqlmodel-create-engine-export",
153
+ "category": "specific-function",
154
+ "question": "How does SQLModel expose create_engine to application code?",
155
+ "ground_truth": "SQLModel re-exports create_engine from SQLAlchemy at the package level so users can import it directly from sqlmodel while using SQLModel models and sessions together.",
156
+ "expected_sources": [
157
+ "sqlmodel/__init__.py"
158
+ ],
159
+ "must_include_any": [
160
+ "create_engine",
161
+ "re-export",
162
+ "SQLAlchemy",
163
+ "sqlmodel"
164
+ ],
165
+ "min_keyword_matches": 2
166
+ },
167
+ {
168
+ "id": "sqlmodel-metadata-create-all",
169
+ "category": "config-setup",
170
+ "question": "How are database tables created when using SQLModel?",
171
+ "ground_truth": "Table creation typically happens by calling SQLModel.metadata.create_all(engine). SQLModel models register table metadata in a way that allows SQLAlchemy metadata creation workflows to build the underlying database tables.",
172
+ "expected_sources": [
173
+ "README.md",
174
+ "sqlmodel/main.py",
175
+ "docs_src"
176
+ ],
177
+ "must_include_any": [
178
+ "metadata",
179
+ "create_all",
180
+ "engine",
181
+ "table"
182
+ ],
183
+ "min_keyword_matches": 2
184
+ },
185
+ {
186
+ "id": "sqlmodel-package-exports",
187
+ "category": "config-setup",
188
+ "question": "What does sqlmodel.__init__ export for end users?",
189
+ "ground_truth": "The package initializer re-exports core user-facing APIs from SQLAlchemy and SQLModel, including create_engine, Session, SQLModel, Field, Relationship, and select-related helpers so application code can import most common primitives directly from sqlmodel.",
190
+ "expected_sources": [
191
+ "sqlmodel/__init__.py"
192
+ ],
193
+ "must_include_any": [
194
+ "Session",
195
+ "SQLModel",
196
+ "Field",
197
+ "create_engine"
198
+ ],
199
+ "min_keyword_matches": 3
200
+ },
201
+ {
202
+ "id": "sqlmodel-readme-basic-flow",
203
+ "category": "config-setup",
204
+ "question": "What basic database workflow does the README show for SQLModel?",
205
+ "ground_truth": "The README demonstrates defining a SQLModel table model, creating an engine, creating tables with metadata.create_all, opening a Session, inserting rows, committing, and then selecting rows with select and session.exec.",
206
+ "expected_sources": [
207
+ "README.md"
208
+ ],
209
+ "must_include_any": [
210
+ "create_engine",
211
+ "Session",
212
+ "create_all",
213
+ "select"
214
+ ],
215
+ "min_keyword_matches": 3
216
+ },
217
+ {
218
+ "id": "sqlmodel-column-options-errors",
219
+ "category": "error-handling",
220
+ "question": "How does SQLModel guard against conflicting or invalid Field configuration?",
221
+ "ground_truth": "SQLModel performs validation around Field configuration in its core model code and raises errors when incompatible options are combined or when SQLAlchemy-specific arguments conflict with other field settings.",
222
+ "expected_sources": [
223
+ "sqlmodel/main.py"
224
+ ],
225
+ "must_include_any": [
226
+ "raise",
227
+ "Field",
228
+ "conflict",
229
+ "sa_column"
230
+ ],
231
+ "min_keyword_matches": 2
232
+ },
233
+ {
234
+ "id": "sqlmodel-relationship-errors",
235
+ "category": "error-handling",
236
+ "question": "Where would SQLModel enforce invalid relationship configuration?",
237
+ "ground_truth": "Relationship configuration is handled in the core SQLModel model layer, where relationship metadata is collected and incompatible combinations are guarded before being translated to SQLAlchemy ORM behavior.",
238
+ "expected_sources": [
239
+ "sqlmodel/main.py"
240
+ ],
241
+ "must_include_any": [
242
+ "Relationship",
243
+ "metadata",
244
+ "SQLAlchemy",
245
+ "raise"
246
+ ],
247
+ "min_keyword_matches": 2
248
+ },
249
+ {
250
+ "id": "sqlmodel-session-cross-file",
251
+ "category": "cross-file",
252
+ "question": "How do SQLModel models flow into query execution with Session.exec()?",
253
+ "ground_truth": "Models are defined in the core SQLModel layer, queries are built through the SQL expression helpers such as select, and then those statements are executed through the SQLModel Session.exec wrapper, which ties model definitions and typed query execution together.",
254
+ "expected_sources": [
255
+ "sqlmodel/main.py",
256
+ "sqlmodel/sql/expression.py",
257
+ "sqlmodel/orm/session.py"
258
+ ],
259
+ "must_include_any": [
260
+ "select",
261
+ "Session",
262
+ "exec",
263
+ "model"
264
+ ],
265
+ "min_keyword_matches": 3
266
+ },
267
+ {
268
+ "id": "sqlmodel-sync-async-cross-file",
269
+ "category": "cross-file",
270
+ "question": "How does SQLModel support both sync and async session patterns across files?",
271
+ "ground_truth": "SQLModel exposes synchronous session helpers in its ORM session module and asynchronous support in the ext.asyncio package, giving similar exec-oriented ergonomics across both sync and async query paths.",
272
+ "expected_sources": [
273
+ "sqlmodel/orm/session.py",
274
+ "sqlmodel/ext/asyncio/session.py",
275
+ "sqlmodel/__init__.py"
276
+ ],
277
+ "must_include_any": [
278
+ "sync",
279
+ "async",
280
+ "Session",
281
+ "exec"
282
+ ],
283
+ "min_keyword_matches": 3
284
+ },
285
+ {
286
+ "id": "sqlmodel-field-to-table-flow",
287
+ "category": "cross-file",
288
+ "question": "How do typed Field declarations become SQL table columns in SQLModel?",
289
+ "ground_truth": "Typed model attributes and Field metadata are collected in the SQLModel core model layer, where SQLModel builds SQLAlchemy-compatible field and table metadata so the resulting class can participate in SQLAlchemy table creation and ORM mapping.",
290
+ "expected_sources": [
291
+ "sqlmodel/main.py",
292
+ "sqlmodel/_compat.py"
293
+ ],
294
+ "must_include_any": [
295
+ "Field",
296
+ "column",
297
+ "table",
298
+ "metadata"
299
+ ],
300
+ "min_keyword_matches": 3
301
+ },
302
+ {
303
+ "id": "sqlmodel-docs-fastapi-positioning",
304
+ "category": "docs",
305
+ "question": "How does the project describe SQLModel's relationship to FastAPI in its docs or README?",
306
+ "ground_truth": "The project describes SQLModel as being designed to simplify SQL database work in FastAPI applications and emphasizes that it is created by the same author, with strong compatibility between FastAPI, Pydantic, and SQLAlchemy.",
307
+ "expected_sources": [
308
+ "README.md",
309
+ "docs"
310
+ ],
311
+ "must_include_any": [
312
+ "FastAPI",
313
+ "same author",
314
+ "compatibility"
315
+ ],
316
+ "min_keyword_matches": 2
317
+ },
318
+ {
319
+ "id": "sqlmodel-followup-show-session-code",
320
+ "category": "conversation",
321
+ "turns": [
322
+ {
323
+ "role": "user",
324
+ "content": "How does SQLModel make query execution easier than raw SQLAlchemy?"
325
+ },
326
+ {
327
+ "role": "assistant",
328
+ "content": "It provides a Session.exec helper and package-level exports to simplify common query patterns."
329
+ }
330
+ ],
331
+ "question": "show me the code path for that",
332
+ "ground_truth": "The follow-up should stay anchored to Session.exec and SQLModel query ergonomics, retrieving code from the session wrapper and related SQLModel exports instead of drifting to README-only results.",
333
+ "expected_sources": [
334
+ "sqlmodel/orm/session.py",
335
+ "sqlmodel/__init__.py",
336
+ "sqlmodel/sql/expression.py"
337
+ ],
338
+ "must_include_any": [
339
+ "Session",
340
+ "exec",
341
+ "select"
342
+ ],
343
+ "min_keyword_matches": 2
344
+ },
345
+ {
346
+ "id": "sqlmodel-select-implementation-layer",
347
+ "category": "specific-function",
348
+ "question": "Where is select implemented under the hood and how is that different from how it is exposed publicly?",
349
+ "ground_truth": "SQLModel exposes select through package-level imports such as sqlmodel.__init__ and sqlmodel.sql.expression, while the implementation details and overload-heavy generation live in lower-level SQL expression modules like _expression_select_gen.py and related select classes.",
350
+ "expected_sources": [
351
+ "sqlmodel/__init__.py",
352
+ "sqlmodel/sql/expression.py",
353
+ "sqlmodel/sql/_expression_select_gen.py",
354
+ "sqlmodel/sql/_expression_select_cls.py"
355
+ ],
356
+ "must_include_any": [
357
+ "select",
358
+ "public",
359
+ "implementation",
360
+ "re-export"
361
+ ],
362
+ "min_keyword_matches": 2
363
+ },
364
+ {
365
+ "id": "sqlmodel-async-session-delegation",
366
+ "category": "cross-file",
367
+ "question": "How does AsyncSession.exec reuse the synchronous Session.exec path?",
368
+ "ground_truth": "The async session layer delegates execution to the synchronous Session.exec logic rather than duplicating it. AsyncSession uses greenlet-based bridging so async callers can reuse the sync execution wrapper and still get SQLModel-style exec ergonomics.",
369
+ "expected_sources": [
370
+ "sqlmodel/ext/asyncio/session.py",
371
+ "sqlmodel/orm/session.py"
372
+ ],
373
+ "must_include_any": [
374
+ "AsyncSession",
375
+ "Session",
376
+ "greenlet",
377
+ "exec"
378
+ ],
379
+ "min_keyword_matches": 3
380
+ },
381
+ {
382
+ "id": "sqlmodel-select-tutorial-usage",
383
+ "category": "docs",
384
+ "question": "How do the docs teach people to use select together with Session.exec?",
385
+ "ground_truth": "The tutorials show users building a statement with select(...) and then executing it through Session.exec(...), positioning exec as the ergonomic query entry point for SQLModel statements.",
386
+ "expected_sources": [
387
+ "docs/tutorial/select.md",
388
+ "README.md"
389
+ ],
390
+ "must_include_any": [
391
+ "select",
392
+ "Session",
393
+ "exec",
394
+ "statement"
395
+ ],
396
+ "min_keyword_matches": 3
397
+ },
398
+ {
399
+ "id": "sqlmodel-fastapi-response-model-docs",
400
+ "category": "docs",
401
+ "question": "How do the FastAPI docs describe using SQLModel models as response models?",
402
+ "ground_truth": "The FastAPI-focused docs explain that SQLModel classes can participate in API request and response modeling because they build on Pydantic, letting applications reuse models or related model variants in response_model patterns.",
403
+ "expected_sources": [
404
+ "docs/tutorial/fastapi/response-model.md",
405
+ "README.md"
406
+ ],
407
+ "must_include_any": [
408
+ "FastAPI",
409
+ "response_model",
410
+ "Pydantic",
411
+ "model"
412
+ ],
413
+ "min_keyword_matches": 2
414
+ },
415
+ {
416
+ "id": "sqlmodel-independent-library-positioning",
417
+ "category": "docs",
418
+ "question": "Does the project describe SQLModel as FastAPI-only or as a standalone library too?",
419
+ "ground_truth": "The docs position SQLModel as especially strong with FastAPI, but still as an independent library that can be used outside FastAPI. It is not described as FastAPI-only.",
420
+ "expected_sources": [
421
+ "README.md",
422
+ "docs/features.md",
423
+ "docs/index.md"
424
+ ],
425
+ "must_include_any": [
426
+ "FastAPI",
427
+ "independent",
428
+ "library",
429
+ "not"
430
+ ],
431
+ "min_keyword_matches": 2
432
+ },
433
+ {
434
+ "id": "sqlmodel-sa-relationship-test-guard",
435
+ "category": "tests",
436
+ "question": "What invalid Relationship combinations are guarded by tests?",
437
+ "ground_truth": "The relationship tests cover invalid combinations where a pre-built sa_relationship is mixed with sa_relationship_args or sa_relationship_kwargs, confirming that SQLModel raises when overlapping relationship configuration styles are combined.",
438
+ "expected_sources": [
439
+ "tests/test_field_sa_relationship.py",
440
+ "sqlmodel/main.py"
441
+ ],
442
+ "must_include_any": [
443
+ "sa_relationship",
444
+ "args",
445
+ "kwargs",
446
+ "raise"
447
+ ],
448
+ "min_keyword_matches": 3
449
+ },
450
+ {
451
+ "id": "sqlmodel-ondelete-nullable-test",
452
+ "category": "tests",
453
+ "question": "What does the project test about ondelete and nullable relationship fields?",
454
+ "ground_truth": "The test suite checks that using ondelete='SET NULL' on a non-nullable relationship field is invalid. The model layer should raise because SET NULL requires the underlying foreign key column to be nullable.",
455
+ "expected_sources": [
456
+ "tests/test_ondelete_raises.py",
457
+ "sqlmodel/main.py"
458
+ ],
459
+ "must_include_any": [
460
+ "ondelete",
461
+ "SET NULL",
462
+ "nullable",
463
+ "raise"
464
+ ],
465
+ "min_keyword_matches": 3
466
+ },
467
+ {
468
+ "id": "sqlmodel-type-validation-test",
469
+ "category": "tests",
470
+ "question": "What do the tests suggest about invalid SQLAlchemy or field type combinations in SQLModel?",
471
+ "ground_truth": "The tests indicate that SQLModel raises when unsupported or ambiguous field type combinations are mapped into SQLAlchemy columns, reinforcing that not every Python type annotation can become a database column shape automatically.",
472
+ "expected_sources": [
473
+ "tests/test_sqlalchemy_type_errors.py",
474
+ "sqlmodel/main.py"
475
+ ],
476
+ "must_include_any": [
477
+ "type",
478
+ "SQLAlchemy",
479
+ "raise",
480
+ "column"
481
+ ],
482
+ "min_keyword_matches": 2
483
+ },
484
+ {
485
+ "id": "sqlmodel-readme-engine-session-imports",
486
+ "category": "config-setup",
487
+ "question": "What top-level imports does the README encourage for getting started with SQLModel?",
488
+ "ground_truth": "The README encourages importing SQLModel, Field, Session, create_engine, and select from the top-level sqlmodel package so users can define models, create tables, and run queries with a unified import style.",
489
+ "expected_sources": [
490
+ "README.md",
491
+ "sqlmodel/__init__.py"
492
+ ],
493
+ "must_include_any": [
494
+ "SQLModel",
495
+ "Field",
496
+ "Session",
497
+ "create_engine",
498
+ "select"
499
+ ],
500
+ "min_keyword_matches": 4
501
+ },
502
+ {
503
+ "id": "sqlmodel-many-to-many-link-model-docs",
504
+ "category": "docs",
505
+ "question": "How do the relationship docs explain link_model for many-to-many mappings?",
506
+ "ground_truth": "The relationship docs explain that link_model is used as an association or link table model for many-to-many relationships, letting SQLModel connect two models through an explicit intermediary model.",
507
+ "expected_sources": [
508
+ "docs/tutorial/many-to-many/create-models-with-link.md",
509
+ "sqlmodel/main.py"
510
+ ],
511
+ "must_include_any": [
512
+ "link_model",
513
+ "many-to-many",
514
+ "association",
515
+ "relationship"
516
+ ],
517
+ "min_keyword_matches": 3
518
+ },
519
+ {
520
+ "id": "sqlmodel-followup-async-code-path",
521
+ "category": "conversation",
522
+ "turns": [
523
+ {
524
+ "role": "user",
525
+ "content": "How does async query execution work in SQLModel?"
526
+ },
527
+ {
528
+ "role": "assistant",
529
+ "content": "It uses AsyncSession and bridges into the sync session execution path."
530
+ }
531
+ ],
532
+ "question": "show me where that bridge happens",
533
+ "ground_truth": "The follow-up should stay on the async execution path and retrieve the async session module together with the sync session module it delegates to, rather than drifting to docs-only summaries.",
534
+ "expected_sources": [
535
+ "sqlmodel/ext/asyncio/session.py",
536
+ "sqlmodel/orm/session.py"
537
+ ],
538
+ "must_include_any": [
539
+ "AsyncSession",
540
+ "greenlet",
541
+ "Session",
542
+ "exec"
543
+ ],
544
+ "min_keyword_matches": 3
545
+ },
546
+ {
547
+ "id": "sqlmodel-followup-field-column-path",
548
+ "category": "conversation",
549
+ "turns": [
550
+ {
551
+ "role": "user",
552
+ "content": "How do typed Field declarations become SQL table columns in SQLModel?"
553
+ },
554
+ {
555
+ "role": "assistant",
556
+ "content": "The metaclass and field helpers translate Field metadata into SQLAlchemy Column objects."
557
+ }
558
+ ],
559
+ "question": "show me the main code path for that conversion",
560
+ "ground_truth": "The follow-up should stay anchored to the field-to-column conversion path in the core model implementation instead of drifting to tutorial prose alone.",
561
+ "expected_sources": [
562
+ "sqlmodel/main.py",
563
+ "sqlmodel/_compat.py"
564
+ ],
565
+ "must_include_any": [
566
+ "Field",
567
+ "Column",
568
+ "metaclass",
569
+ "conversion"
570
+ ],
571
+ "min_keyword_matches": 3
572
+ },
573
+ {
574
+ "id": "sqlmodel-followup-select-public-path",
575
+ "category": "conversation",
576
+ "turns": [
577
+ {
578
+ "role": "user",
579
+ "content": "How is select exposed to users in SQLModel?"
580
+ },
581
+ {
582
+ "role": "assistant",
583
+ "content": "It is re-exported for public use from the SQLModel package and expression layer."
584
+ }
585
+ ],
586
+ "question": "and where is the lower-level implementation behind that?",
587
+ "ground_truth": "The follow-up should connect the public export path to the lower-level select generator and select class implementation files instead of repeating only the package-level export story.",
588
+ "expected_sources": [
589
+ "sqlmodel/sql/expression.py",
590
+ "sqlmodel/sql/_expression_select_gen.py",
591
+ "sqlmodel/sql/_expression_select_cls.py"
592
+ ],
593
+ "must_include_any": [
594
+ "select",
595
+ "implementation",
596
+ "expression",
597
+ "class"
598
+ ],
599
+ "min_keyword_matches": 2
600
+ },
601
+ {
602
+ "id": "sqlmodel-test-vs-core-evidence-balance",
603
+ "category": "cross-file",
604
+ "question": "When explaining configuration errors in SQLModel, how should core implementation and tests complement each other?",
605
+ "ground_truth": "The core implementation in sqlmodel/main.py is the canonical source for behavior, while tests such as relationship and ondelete checks provide evidence that those guards are enforced in concrete scenarios. A good answer should balance both without treating tests as the primary implementation source.",
606
+ "expected_sources": [
607
+ "sqlmodel/main.py",
608
+ "tests/test_field_sa_relationship.py",
609
+ "tests/test_ondelete_raises.py"
610
+ ],
611
+ "must_include_any": [
612
+ "main.py",
613
+ "tests",
614
+ "canonical",
615
+ "guard"
616
+ ],
617
+ "min_keyword_matches": 2
618
+ },
619
+ {
620
+ "id": "sqlmodel-docs-vs-core-select-balance",
621
+ "category": "cross-file",
622
+ "question": "For explaining select in SQLModel, which files are canonical implementation sources and which are usage-oriented docs?",
623
+ "ground_truth": "The canonical implementation path is in sqlmodel.__init__, sqlmodel.sql.expression, and the lower-level select generator or select class modules, while files like README and docs/tutorial/select.md are usage-oriented documentation rather than the implementation itself.",
624
+ "expected_sources": [
625
+ "sqlmodel/__init__.py",
626
+ "sqlmodel/sql/expression.py",
627
+ "sqlmodel/sql/_expression_select_gen.py",
628
+ "docs/tutorial/select.md",
629
+ "README.md"
630
+ ],
631
+ "must_include_any": [
632
+ "canonical",
633
+ "implementation",
634
+ "docs",
635
+ "usage"
636
+ ],
637
+ "min_keyword_matches": 2
638
+ },
639
+ {
640
+ "id": "sqlmodel-features-doc-positioning",
641
+ "category": "docs",
642
+ "question": "What themes does the features documentation emphasize about SQLModel's value proposition?",
643
+ "ground_truth": "The features docs emphasize reduced duplication, editor friendliness, compatibility across Pydantic and SQLAlchemy, and an ergonomic way to work with SQL databases using standard Python type hints and models.",
644
+ "expected_sources": [
645
+ "docs/features.md",
646
+ "README.md"
647
+ ],
648
+ "must_include_any": [
649
+ "duplication",
650
+ "editor",
651
+ "compatibility",
652
+ "Python"
653
+ ],
654
+ "min_keyword_matches": 2
655
+ },
656
+ {
657
+ "id": "sqlmodel-docs-index-overview",
658
+ "category": "docs",
659
+ "question": "What kind of project overview should a user get from the docs index for SQLModel?",
660
+ "ground_truth": "The docs index should frame SQLModel as a Python SQL library that combines data modeling and database access patterns, point users toward tutorials or feature explanations, and reinforce its relationship to Pydantic, SQLAlchemy, and FastAPI.",
661
+ "expected_sources": [
662
+ "docs/index.md",
663
+ "README.md"
664
+ ],
665
+ "must_include_any": [
666
+ "overview",
667
+ "Pydantic",
668
+ "SQLAlchemy",
669
+ "FastAPI"
670
+ ],
671
+ "min_keyword_matches": 2
672
+ }
673
+ ]
requirements.txt ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ fastapi==0.109.2
2
+ uvicorn[standard]==0.27.1
3
+ sqlalchemy==2.0.25
4
+ pydantic==2.6.1
5
+ python-dotenv==1.0.1
6
+
7
+ openai==1.12.0
8
+ google-genai==1.12.1
9
+ httpx==0.28.1
10
+ numpy==1.26.4
11
+ rank-bm25==0.2.2
12
+ qdrant-client==1.15.1
13
+ sentence-transformers==2.7.0
14
+ einops==0.8.1
15
+ tree-sitter==0.21.3
16
+ tree-sitter-languages==1.10.2
17
+
18
+ ragas==0.1.10
19
+ datasets==2.18.0
20
+ pandas==2.2.0
server_app.py ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from pathlib import Path
3
+ from typing import List, Literal, Optional
4
+
5
+ from fastapi import BackgroundTasks, Depends, FastAPI, Header, HTTPException, Query
6
+ from fastapi.middleware.cors import CORSMiddleware
7
+ from pydantic import BaseModel, Field, HttpUrl
8
+ from dotenv import load_dotenv
9
+
10
+ from src.rag_system import CodebaseRAGSystem
11
+
12
+ load_dotenv(Path(__file__).with_name(".env"))
13
+
14
+
15
# FastAPI application object; served by uvicorn (see the Dockerfile CMD).
app = FastAPI(
    version="2.0.0",
    title="Codebase RAG API",
    description="Index GitHub repositories and answer natural-language questions with grounded citations.",
)
20
+
21
# Comma-separated allow-list of browser origins; defaults to the local dev
# frontend when CORS_ORIGINS is unset. Blank entries are dropped.
_raw_origins = os.getenv("CORS_ORIGINS", "http://localhost:3000")
cors_origins = [part.strip() for part in _raw_origins.split(",") if part.strip()]
26
+
27
# Allow the configured browser origins to call the API, including with
# credentials and arbitrary methods/headers.
app.add_middleware(
    CORSMiddleware,
    allow_origins=cors_origins,
    allow_headers=["*"],
    allow_methods=["*"],
    allow_credentials=True,
)

# Shared backend instance; populated once in the startup hook.
rag_system: Optional[CodebaseRAGSystem] = None
36
+
37
+
38
class RepoIndexRequest(BaseModel):
    """Request payload for queueing a repository index job."""

    # Pydantic validates that this is a well-formed http(s) URL.
    github_url: HttpUrl
40
+
41
+
42
class MessageTurn(BaseModel):
    """One prior turn of the conversation history sent with a query."""

    role: Literal["user", "assistant"]
    content: str = Field(..., min_length=1, max_length=4000)


class QueryRequest(BaseModel):
    """Request payload for asking a question about an indexed repository."""

    repo_id: int = Field(..., ge=1)
    question: str = Field(..., min_length=3)
    # Retrieval depth is clamped server-side to a sane range.
    top_k: int = Field(8, ge=3, le=12)
    # MessageTurn is defined above, so the fragile string forward reference
    # (List["MessageTurn"] to a later-defined class) is no longer needed.
    history: List[MessageTurn] = Field(default_factory=list, max_length=8)
52
+
53
+
54
def require_session_id(x_session_id: Optional[str] = Header(None, alias="X-Session-Id")) -> str:
    """FastAPI dependency: return the trimmed X-Session-Id header, or fail 400."""
    session_id = (x_session_id or "").strip()
    if not session_id:
        raise HTTPException(status_code=400, detail="Missing session id")
    return session_id
58
+
59
+
60
@app.on_event("startup")
def startup():
    """Build the shared RAG system once the application boots."""
    global rag_system
    # Make sure the on-disk data directory exists before the system uses it.
    Path("./data").mkdir(exist_ok=True)
    rag_system = CodebaseRAGSystem()
65
+
66
+
67
+ @app.get("/")
68
+ async def root():
69
+ return {
70
+ "status": "online",
71
+ "message": "Codebase RAG API is running",
72
+ }
73
+
74
+
75
+ @app.get("/api/health")
76
+ async def health():
77
+ return {
78
+ "status": "ok",
79
+ }
80
+
81
+
82
+ @app.get("/api/repos")
83
+ async def list_repositories(session_id: str = Depends(require_session_id)):
84
+ return rag_system.list_repositories_for_session(session_id)
85
+
86
+
87
+ @app.get("/api/repos/{repo_id}")
88
+ async def get_repository(repo_id: int, session_id: str = Depends(require_session_id)):
89
+ repo = rag_system.get_repository_for_session(repo_id, session_id)
90
+ if not repo:
91
+ raise HTTPException(status_code=404, detail="Repository not found")
92
+ return repo
93
+
94
+
95
+ @app.post("/api/repos/index")
96
+ async def queue_repository_index(
97
+ request: RepoIndexRequest,
98
+ background_tasks: BackgroundTasks,
99
+ session_id: str = Depends(require_session_id),
100
+ ):
101
+ try:
102
+ repo = rag_system.create_or_reset_repository(str(request.github_url), session_id)
103
+ background_tasks.add_task(rag_system.index_repository, repo.id)
104
+ return {
105
+ "success": True,
106
+ "message": "Repository indexing started",
107
+ "repo": rag_system.get_repository_for_session(repo.id, session_id),
108
+ }
109
+ except Exception as exc:
110
+ raise HTTPException(status_code=400, detail=str(exc))
111
+
112
+
113
+ @app.post("/api/query")
114
+ async def query_repository(request: QueryRequest, session_id: str = Depends(require_session_id)):
115
+ try:
116
+ return rag_system.answer_question(
117
+ repo_id=request.repo_id,
118
+ session_key=session_id,
119
+ question=request.question.strip(),
120
+ top_k=request.top_k,
121
+ history=request.history,
122
+ )
123
+ except ValueError as exc:
124
+ raise HTTPException(status_code=400, detail=str(exc))
125
+ except Exception as exc:
126
+ raise HTTPException(status_code=500, detail=str(exc))
127
+
128
+
129
+ @app.post("/api/session/end")
130
+ async def end_session(session_id: str = Query(..., min_length=8)):
131
+ rag_system.end_session(session_id)
132
+ return {"success": True}
133
+
134
+
135
if __name__ == "__main__":
    # Local development entry point; the container uses the Dockerfile CMD
    # (uvicorn on port 7860) instead.
    import uvicorn

    uvicorn.run("server_app:app", host="0.0.0.0", port=8000, reload=True)
src/__init__.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Codebase RAG backend package.
3
+ """
4
+
5
+ from .code_parser import CodeParser
6
+ from .embeddings import EmbeddingGenerator
7
+ from .hybrid_search import HybridSearchEngine
8
+ from .rag_system import CodebaseRAGSystem
9
+ from .repo_fetcher import RepoFetcher
10
+ from .vector_store import QdrantVectorStore
11
+
12
__version__ = "2.0.0"
# Names re-exported as the package's public API.
__all__ = [
    "CodeParser",
    "CodebaseRAGSystem",
    "EmbeddingGenerator",
    "QdrantVectorStore",
    "HybridSearchEngine",
    "RepoFetcher",
]
src/code_parser.py ADDED
@@ -0,0 +1,223 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from pathlib import Path
3
+ from typing import Dict, List, Optional
4
+
5
+ from tree_sitter_languages import get_parser
6
+
7
+
8
# File-extension → tree-sitter language name. Extensions not listed here are
# treated as plain text by CodeParser.detect_language.
LANGUAGE_BY_EXTENSION = {
    ".py": "python",
    ".js": "javascript",
    ".jsx": "javascript",  # JSX parses with the plain JavaScript grammar
    ".ts": "typescript",
    ".tsx": "tsx",  # TSX has its own grammar, distinct from typescript
    ".java": "java",
    ".go": "go",
    ".rs": "rust",
}
18
+
19
# Symbol-like node types captured for TypeScript; TSX uses the same set.
_TS_SYMBOL_TYPES = {
    "function_declaration",
    "class_declaration",
    "method_definition",
    "interface_declaration",
    "type_alias_declaration",
    "lexical_declaration",
    "variable_statement",
}

# tree-sitter node types treated as chunk boundaries, per language.
SYMBOL_NODE_TYPES = {
    "python": {"function_definition", "class_definition"},
    "javascript": {
        "function_declaration",
        "class_declaration",
        "method_definition",
        "generator_function_declaration",
        "lexical_declaration",
        "variable_declaration",
    },
    "typescript": set(_TS_SYMBOL_TYPES),
    "tsx": set(_TS_SYMBOL_TYPES),
    "java": {
        "class_declaration",
        "method_declaration",
        "interface_declaration",
        "enum_declaration",
    },
    "go": {
        "function_declaration",
        "method_declaration",
        "type_declaration",
    },
    "rust": {
        "function_item",
        "impl_item",
        "struct_item",
        "enum_item",
        "trait_item",
    },
}
66
+
67
# Node types that can carry a symbol's name when the grammar exposes no
# explicit `name` field (used by the identifier-search fallback).
IDENTIFIER_TYPES = {
    "identifier",
    "property_identifier",
    "type_identifier",
    "field_identifier",
}
73
+
74
+
75
class CodeParser:
    """Chunk source files into symbol-level pieces for retrieval.

    Tree-sitter is used when a grammar exists for the detected language;
    otherwise (or when parsing yields no symbols) files are split
    heuristically by line count and definition-looking prefixes.
    """

    def __init__(self):
        # Lazily built cache of tree-sitter parsers, keyed by language name.
        self.parsers = {}

    def detect_language(self, file_path: str) -> str:
        """Map a file extension to a language name, defaulting to 'text'."""
        return LANGUAGE_BY_EXTENSION.get(Path(file_path).suffix.lower(), "text")

    def _get_parser(self, language: str):
        """Return a cached tree-sitter parser, or None for plain text."""
        if language == "text":
            return None
        if language not in self.parsers:
            self.parsers[language] = get_parser(language)
        return self.parsers[language]

    def chunk_file(self, file_path: str, repo_root: str) -> List[Dict]:
        """Split one file (under *repo_root*) into a list of chunk dicts.

        Returns an empty list for blank files. Symbol-level chunks are
        preferred; heuristic chunking is the fallback when no grammar is
        available or no symbol nodes were captured.
        """
        language = self.detect_language(file_path)
        source = Path(file_path).read_text(encoding="utf-8", errors="ignore")
        relative_path = str(Path(file_path).resolve().relative_to(Path(repo_root).resolve()))

        if not source.strip():
            return []

        parser = self._get_parser(language)
        if parser is None:
            return self._fallback_chunks(source, relative_path, language)

        tree = parser.parse(bytes(source, "utf-8"))
        lines = source.splitlines()
        chunks = []
        capture_types = SYMBOL_NODE_TYPES.get(language, set())

        def visit(node):
            # Capture the outermost symbol node and stop descending, so
            # nested definitions stay inside their parent's chunk.
            if node.type in capture_types:
                chunk = self._build_chunk(node, source, lines, relative_path, language)
                if chunk:
                    chunks.append(chunk)
                return
            for child in node.children:
                visit(child)

        visit(tree.root_node)

        if not chunks:
            return self._fallback_chunks(source, relative_path, language)

        return chunks

    def _build_chunk(self, node, source: str, lines: List[str], relative_path: str, language: str) -> Optional[Dict]:
        """Turn one tree-sitter symbol node into a chunk dict.

        Returns None for one-line symbols, which carry too little signal
        to index on their own.
        """
        start_line = node.start_point[0] + 1  # tree-sitter rows are 0-based
        end_line = node.end_point[0] + 1
        snippet = "\n".join(lines[start_line - 1 : end_line]).strip()
        if len(snippet.splitlines()) < 2:
            return None

        # Prefer the grammar's "name" field; fall back to a BFS for any
        # identifier-like descendant.
        name_node = node.child_by_field_name("name")
        symbol_name = None
        if name_node is not None:
            symbol_name = source[name_node.start_byte : name_node.end_byte].strip()
        if not symbol_name:
            symbol_name = self._find_identifier(node, source)

        signature = lines[start_line - 1].strip() if start_line - 1 < len(lines) else ""
        searchable_text = "\n".join(
            part for part in [relative_path, symbol_name or "", signature, snippet] if part
        )

        return {
            "file_path": relative_path,
            "language": language,
            "symbol_name": symbol_name or relative_path.split("/")[-1],
            "symbol_type": node.type,
            "line_start": start_line,
            "line_end": end_line,
            "signature": signature,
            "content": snippet,
            "searchable_text": searchable_text,
            "metadata_json": {
                "parser": "tree-sitter",
            },
        }

    def _find_identifier(self, node, source: str) -> Optional[str]:
        """Breadth-first search for the first identifier-like descendant.

        Uses an index cursor over a growing list instead of ``list.pop(0)``
        so the scan is O(n) over the subtree rather than O(n^2).
        """
        queue = list(node.children)
        cursor = 0
        while cursor < len(queue):
            current = queue[cursor]
            cursor += 1
            if current.type in IDENTIFIER_TYPES:
                return source[current.start_byte : current.end_byte].strip()
            queue.extend(current.children)
        return None

    def _fallback_chunks(self, source: str, relative_path: str, language: str) -> List[Dict]:
        """Heuristic line-based chunking for unparseable or plain-text files.

        Text files break on long runs (>= 60 lines) or '#' headings; code
        files break when a definition-looking line arrives after a run of
        more than 8 lines, or after 80 lines regardless. Note the line that
        triggers a break is included at the END of the current buffer.
        """
        blocks = []
        lines = source.splitlines()
        buffer = []
        start_line = 1
        for index, line in enumerate(lines, start=1):
            if not buffer:
                start_line = index
            buffer.append(line)
            if language == "text":
                trigger = len(buffer) >= 60 or (line.startswith("#") and len(buffer) > 8)
            else:
                trigger = (
                    re.match(r"^\s*(def |class |function |const |export |interface |type )", line)
                    and len(buffer) > 8
                ) or len(buffer) >= 80

            if trigger:
                chunk = self._make_fallback_chunk(buffer, relative_path, language, start_line, index)
                if chunk:
                    blocks.append(chunk)
                buffer = []

        if buffer:
            chunk = self._make_fallback_chunk(buffer, relative_path, language, start_line, len(lines))
            if chunk:
                blocks.append(chunk)
        return blocks

    def _make_fallback_chunk(self, buffer: List[str], relative_path: str, language: str, start_line: int, end_line: int) -> Optional[Dict]:
        """Build one fallback chunk dict, or None when the buffer is blank."""
        chunk_text = "\n".join(buffer).strip()
        if not chunk_text:
            return None
        return {
            "file_path": relative_path,
            "language": language,
            "symbol_name": f"{Path(relative_path).name}:{start_line}",
            "symbol_type": "fallback_chunk",
            "line_start": start_line,
            "line_end": end_line,
            "signature": buffer[0].strip(),
            "content": chunk_text,
            "searchable_text": f"{relative_path}\n{chunk_text}",
            "metadata_json": {
                "parser": "fallback",
            },
        }
src/database.py ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from datetime import datetime
3
+ from pathlib import Path
4
+
5
+ from sqlalchemy import (
6
+ JSON,
7
+ Column,
8
+ DateTime,
9
+ Float,
10
+ ForeignKey,
11
+ Integer,
12
+ String,
13
+ Text,
14
+ create_engine,
15
+ inspect,
16
+ text,
17
+ )
18
+ from sqlalchemy.orm import declarative_base, relationship, sessionmaker
19
+
20
+ Base = declarative_base()
21
+ _ENGINE_CACHE = {}
22
+ _SESSION_FACTORY_CACHE = {}
23
+ SERVER_DIR = Path(__file__).resolve().parents[1]
24
+
25
+
26
class Repository(Base):
    # One indexed repository, scoped to a user session. Rows are keyed by
    # `github_url`, which callers populate with a session-qualified registry
    # key (the raw URL the user submitted lives in `source_url`).
    __tablename__ = "repositories"

    id = Column(Integer, primary_key=True)
    # Unique registry key for this (session, repo) combination.
    github_url = Column(String(1024), nullable=False, unique=True)
    # Original URL as submitted by the user.
    source_url = Column(String(1024))
    # Session ownership + expiry, used to reap abandoned sessions.
    session_key = Column(String(255), index=True)
    session_expires_at = Column(DateTime)
    owner = Column(String(255), nullable=False)
    name = Column(String(255), nullable=False)
    branch = Column(String(255), nullable=False, default="main")
    # Checkout location while indexing; cleared once the clone is deleted.
    local_path = Column(String(1024))
    # Lifecycle: "queued" -> "indexing" -> terminal state (see rag_system).
    status = Column(String(64), nullable=False, default="queued")
    error_message = Column(Text)
    file_count = Column(Integer, nullable=False, default=0)
    chunk_count = Column(Integer, nullable=False, default=0)
    indexed_at = Column(DateTime)
    created_at = Column(DateTime, default=datetime.utcnow)
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)

    # Children are deleted with the repository row (delete-orphan cascade).
    chunks = relationship(
        "CodeChunk", back_populates="repository", cascade="all, delete-orphan"
    )
    chat_turns = relationship(
        "ChatTurn", back_populates="repository", cascade="all, delete-orphan"
    )
52
+
53
+
54
class CodeChunk(Base):
    # One retrievable chunk of a repository: either a tree-sitter symbol or
    # a heuristic fallback block (see `symbol_type` / `metadata_json`).
    __tablename__ = "code_chunks"

    id = Column(Integer, primary_key=True)
    repository_id = Column(Integer, ForeignKey("repositories.id"), nullable=False)
    # Path relative to the repository root.
    file_path = Column(String(1024), nullable=False)
    language = Column(String(64), nullable=False)
    symbol_name = Column(String(255))
    # Tree-sitter node type (e.g. "function_definition") or "fallback_chunk".
    symbol_type = Column(String(128), nullable=False, default="chunk")
    # 1-based inclusive line span of the chunk within the file.
    line_start = Column(Integer, nullable=False)
    line_end = Column(Integer, nullable=False)
    signature = Column(Text)
    # Raw source snippet; `searchable_text` additionally prepends path/name
    # so lexical search can match on them.
    content = Column(Text, nullable=False)
    searchable_text = Column(Text, nullable=False)
    metadata_json = Column(JSON, nullable=False, default=dict)
    # Identifier of the corresponding vector in the vector store, if any.
    embedding_id = Column(Integer)
    rerank_score = Column(Float)
    created_at = Column(DateTime, default=datetime.utcnow)

    repository = relationship("Repository", back_populates="chunks")
74
+
75
+
76
class ChatTurn(Base):
    # One message in a repository's Q&A transcript ("user" or assistant
    # role in `role`; structured answer payload, when present, in
    # `answer_json`).
    __tablename__ = "chat_turns"

    id = Column(Integer, primary_key=True)
    repository_id = Column(Integer, ForeignKey("repositories.id"), nullable=False)
    role = Column(String(32), nullable=False)
    content = Column(Text, nullable=False)
    answer_json = Column(JSON)
    created_at = Column(DateTime, default=datetime.utcnow)

    repository = relationship("Repository", back_populates="chat_turns")
87
+
88
+
89
def init_db(database_url: str = None):
    """Return a cached ``(engine, session_factory)`` pair for *database_url*.

    On first use of a given URL this creates the schema and applies the
    lightweight column backfills; later calls reuse the cached engine.
    Falls back to ``DATABASE_URL`` (or a local SQLite file) when no URL is
    given.
    """
    if database_url is None:
        database_url = os.getenv("DATABASE_URL", "sqlite:///./codebase_rag.db")

    url = resolve_database_url(database_url)
    cached_engine = _ENGINE_CACHE.get(url)
    if cached_engine is not None:
        return cached_engine, _SESSION_FACTORY_CACHE[url]

    # SQLite restricts connections to their creating thread by default;
    # relax that since the engine is shared across worker threads.
    connect_args = {"check_same_thread": False} if url.startswith("sqlite") else {}
    engine = create_engine(url, echo=False, connect_args=connect_args)
    Base.metadata.create_all(engine)
    _ensure_runtime_columns(engine)
    factory = sessionmaker(bind=engine)
    _ENGINE_CACHE[url] = engine
    _SESSION_FACTORY_CACHE[url] = factory
    return engine, factory
105
+
106
+
107
def resolve_database_url(database_url: str) -> str:
    """Normalize a database URL, anchoring relative SQLite paths.

    Non-SQLite URLs and ``:memory:`` databases pass through untouched.
    Relative SQLite file paths are resolved under ``SERVER_DIR`` and the
    file (plus parent directories) is created eagerly so the engine can
    open it.
    """
    prefix = "sqlite:///"
    if not database_url.startswith(prefix):
        return database_url

    raw_path = database_url[len(prefix):]
    if raw_path == ":memory:":
        return database_url

    resolved = Path(raw_path)
    if not resolved.is_absolute():
        resolved = SERVER_DIR / resolved
    resolved.parent.mkdir(parents=True, exist_ok=True)
    resolved.touch(exist_ok=True)
    return f"sqlite:///{resolved.resolve()}"
121
+
122
+
123
def _ensure_runtime_columns(engine):
    """Backfill columns added after the initial schema (minimal migration).

    SQLAlchemy's ``create_all`` never alters existing tables, so columns
    introduced later are added here with explicit ALTER TABLE statements
    when missing. No-op if the table itself does not exist yet.
    """
    inspector = inspect(engine)
    if "repositories" not in inspector.get_table_names():
        return

    present = {col["name"] for col in inspector.get_columns("repositories")}
    wanted = (
        ("source_url", "ALTER TABLE repositories ADD COLUMN source_url VARCHAR(1024)"),
        ("session_key", "ALTER TABLE repositories ADD COLUMN session_key VARCHAR(255)"),
        ("session_expires_at", "ALTER TABLE repositories ADD COLUMN session_expires_at DATETIME"),
    )

    with engine.begin() as connection:
        for name, ddl in wanted:
            if name not in present:
                connection.execute(text(ddl))
139
+
140
+
141
def get_db_session(database_url: str = None):
    """Open a fresh ORM session bound to the cached engine for *database_url*."""
    _, factory = init_db(database_url)
    return factory()
src/document_processor.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import hashlib
2
+ from typing import List, Tuple
3
+ from pathlib import Path
4
+ import pypdf
5
+
6
+
7
class DocumentProcessor:
    """Extract text from PDF/TXT files and split it into overlapping chunks."""

    def __init__(self, chunk_size: int = 512, chunk_overlap: int = 50):
        # Target chunk length in characters, and how many characters each
        # chunk re-reads from the end of the previous one.
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def extract_text_from_pdf(self, file_path: str) -> str:
        """Return the concatenated text of every page in the PDF.

        Raises ValueError (chained to the underlying error) when the file
        cannot be read or parsed.
        """
        text = ""
        try:
            with open(file_path, "rb") as file:
                pdf_reader = pypdf.PdfReader(file)
                for page in pdf_reader.pages:
                    # extract_text() can yield None/empty for image-only pages.
                    text += (page.extract_text() or "") + "\n"
        except Exception as e:
            raise ValueError(f"Error reading PDF: {str(e)}") from e

        return text.strip()

    def chunk_text(self, text: str) -> List[str]:
        """Split *text* into ~chunk_size-character chunks with overlap.

        When a sentence end or newline falls in the second half of the
        window, the chunk breaks there instead of mid-sentence. Returns an
        empty list for empty input; blank chunks are dropped.
        """
        if not text:
            return []

        chunks = []
        start = 0
        text_length = len(text)

        while start < text_length:
            end = start + self.chunk_size
            chunk = text[start:end]

            if end < text_length:
                last_period = chunk.rfind(".")
                last_newline = chunk.rfind("\n")
                break_point = max(last_period, last_newline)

                if break_point > self.chunk_size * 0.5:
                    chunk = chunk[: break_point + 1]
                    end = start + break_point + 1

            chunks.append(chunk.strip())

            # Always advance past the previous start: without the max(), a
            # chunk_overlap >= chunk_size would make this loop spin forever.
            start = max(end - self.chunk_overlap, start + 1)

        return [c for c in chunks if c]

    def process_document(self, file_path: str) -> Tuple[str, List[str]]:
        """Extract and chunk a document; returns ``(full_text, chunks)``.

        Supports .pdf and .txt files; raises ValueError for anything else.
        """
        file_ext = Path(file_path).suffix.lower()

        if file_ext == ".pdf":
            text = self.extract_text_from_pdf(file_path)
        elif file_ext == ".txt":
            with open(file_path, "r", encoding="utf-8") as f:
                text = f.read()
        else:
            raise ValueError(f"Unsupported file type: {file_ext}")

        chunks = self.chunk_text(text)

        return text, chunks

    @staticmethod
    def compute_file_hash(file_path: str) -> str:
        """MD5 of the file contents, read in 4 KiB blocks.

        Used as a cheap dedup fingerprint only — MD5 is not suitable for
        security purposes.
        """
        hash_md5 = hashlib.md5()
        with open(file_path, "rb") as f:
            for chunk in iter(lambda: f.read(4096), b""):
                hash_md5.update(chunk)
        return hash_md5.hexdigest()
src/embeddings.py ADDED
@@ -0,0 +1,294 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import time
3
+ from typing import Callable, List, Optional
4
+
5
+ import numpy as np
6
+ from openai import OpenAI
7
+ from sentence_transformers import SentenceTransformer
8
+
9
+
10
class EmbeddingGenerator:
    """Produce dense embeddings via OpenAI, Vertex AI, or a local model.

    Provider resolution: an explicit ``provider`` argument wins, then the
    ``EMBEDDING_PROVIDER`` env var; ``auto`` resolves to "local" on
    HF Spaces / test runs and "vertex_ai" otherwise (see
    ``_resolve_provider``).
    """

    def __init__(self, provider: str = None, model_name: str = None):
        configured_provider = (provider or os.getenv("EMBEDDING_PROVIDER", "auto")).lower()
        self.provider = self._resolve_provider(configured_provider)
        self.model_name = model_name or self._resolve_model_name()
        self.batch_size = int(os.getenv("EMBEDDING_BATCH_SIZE", "8"))
        self.device = os.getenv("EMBEDDING_DEVICE")
        self.client = None
        self.model = None
        # Vertex AI distinguishes document- vs. query-side embeddings by
        # task type; both are overridable through the environment.
        self.vertex_task_type_document = os.getenv(
            "VERTEX_EMBEDDING_TASK_TYPE_DOCUMENT", "RETRIEVAL_DOCUMENT"
        )
        self.vertex_task_type_query = os.getenv(
            "VERTEX_EMBEDDING_TASK_TYPE_QUERY", "RETRIEVAL_QUERY"
        )
        self.vertex_output_dimensionality = self._optional_int(
            os.getenv("VERTEX_EMBEDDING_OUTPUT_DIMENSIONALITY")
        )
        self.query_prefix = os.getenv("EMBEDDING_QUERY_PREFIX", "").strip()
        normalized_model_name = self.model_name.lower()
        # Only the nomic-embed-code / CodeRankEmbed family gets a named
        # query prompt; other sentence-transformers models receive none.
        self.query_prompt_name = (
            os.getenv("EMBEDDING_QUERY_PROMPT_NAME", "query")
            if "nomic-embed-code" in normalized_model_name
            or "coderankembed" in normalized_model_name
            else None
        )

        if self.provider == "openai":
            print(
                f"[embeddings] Initializing OpenAI embeddings with model={self.model_name}",
                flush=True,
            )
            self.client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
            # Dimension is taken on trust from the env var, not probed.
            self.embedding_dim = int(os.getenv("OPENAI_EMBEDDING_DIM", "1536"))
        elif self.provider == "vertex_ai":
            print(
                f"[embeddings] Initializing Vertex AI embeddings with model={self.model_name}",
                flush=True,
            )
            # Imported lazily so local/OpenAI deployments don't need the SDK.
            try:
                from google import genai
            except ImportError as exc:
                raise RuntimeError(
                    "Vertex AI embedding support requires the `google-genai` package."
                ) from exc

            project = os.getenv("GOOGLE_CLOUD_PROJECT")
            location = os.getenv("GOOGLE_CLOUD_LOCATION", "us-central1")
            if not project:
                raise RuntimeError(
                    "GOOGLE_CLOUD_PROJECT must be set when using Vertex AI embeddings."
                )

            self.client = genai.Client(
                vertexai=True,
                project=project,
                location=location,
            )
            self.embedding_dim = int(
                os.getenv(
                    "VERTEX_EMBEDDING_DIM",
                    str(self.vertex_output_dimensionality or 3072),
                )
            )
        else:
            # Local provider: load a SentenceTransformer and read the true
            # dimension from the model itself.
            model_device = self.device or "cpu"
            print(
                f"[embeddings] Loading local embedding model={self.model_name} on device={model_device}",
                flush=True,
            )
            started_at = time.perf_counter()
            self.model = SentenceTransformer(
                self.model_name,
                trust_remote_code=True,
                device=model_device,
            )
            self.embedding_dim = self.model.get_sentence_embedding_dimension()
            elapsed = time.perf_counter() - started_at
            print(
                f"[embeddings] Model ready dim={self.embedding_dim} load_time={elapsed:.2f}s",
                flush=True,
            )

    def embed_text(self, text: str) -> np.ndarray:
        """Embed a single query string and return its vector.

        Query-side handling differs per provider: Vertex uses the query
        task type; local models get the optional query prefix/prompt.
        """
        if self.provider == "openai":
            return self.embed_batch([text])[0]
        if self.provider == "vertex_ai":
            return self._embed_with_vertex(
                [text],
                task_type=self.vertex_task_type_query,
            )[0]
        query_text = f"{self.query_prefix}: {text}" if self.query_prefix else text
        return self._encode_with_backoff([query_text], prompt_name=self.query_prompt_name)[0]

    def embed_batch(
        self,
        texts: List[str],
        batch_size: int = None,
        progress_callback: Optional[Callable[[int, int], None]] = None,
    ) -> np.ndarray:
        """Embed *texts* (document side), reporting progress as (done, total).

        Returns a float32 array; empty input yields an empty array.
        """
        if not texts:
            return np.array([], dtype="float32")

        if self.provider == "openai":
            # OpenAI accepts the whole list in one request; no sub-batching.
            response = self.client.embeddings.create(
                model=self.model_name or "text-embedding-3-small",
                input=texts,
            )
            embeddings = [item.embedding for item in response.data]
            if progress_callback:
                progress_callback(len(texts), len(texts))
            return np.array(embeddings, dtype="float32")
        if self.provider == "vertex_ai":
            return self._embed_batch_with_vertex(
                texts=texts,
                batch_size=batch_size,
                progress_callback=progress_callback,
            )

        # Local path: encode in fixed-size batches with progress logging.
        effective_batch_size = max(1, batch_size or self.batch_size)
        all_embeddings = []
        total = len(texts)

        for start in range(0, total, effective_batch_size):
            batch = texts[start : start + effective_batch_size]
            batch_number = (start // effective_batch_size) + 1
            total_batches = (total + effective_batch_size - 1) // effective_batch_size
            print(
                f"[embeddings] Encoding batch {batch_number}/{total_batches} "
                f"items={len(batch)} progress={start}/{total}",
                flush=True,
            )
            started_at = time.perf_counter()
            batch_embeddings = self._encode_with_backoff(
                batch,
                batch_size=min(effective_batch_size, len(batch)),
            )
            all_embeddings.append(batch_embeddings)
            elapsed = time.perf_counter() - started_at
            print(
                f"[embeddings] Finished batch {batch_number}/{total_batches} "
                f"elapsed={elapsed:.2f}s progress={min(start + len(batch), total)}/{total}",
                flush=True,
            )
            if progress_callback:
                progress_callback(min(start + len(batch), total), total)

        return np.vstack(all_embeddings).astype("float32")

    def _embed_batch_with_vertex(
        self,
        texts: List[str],
        batch_size: int = None,
        progress_callback: Optional[Callable[[int, int], None]] = None,
    ) -> np.ndarray:
        """Vertex AI counterpart of the local batching loop in embed_batch."""
        effective_batch_size = max(1, batch_size or self.batch_size)
        all_embeddings = []
        total = len(texts)

        for start in range(0, total, effective_batch_size):
            batch = texts[start : start + effective_batch_size]
            batch_number = (start // effective_batch_size) + 1
            total_batches = (total + effective_batch_size - 1) // effective_batch_size
            print(
                f"[embeddings] Vertex batch {batch_number}/{total_batches} "
                f"items={len(batch)} progress={start}/{total}",
                flush=True,
            )
            started_at = time.perf_counter()
            batch_embeddings = self._embed_with_vertex(
                batch,
                task_type=self.vertex_task_type_document,
            )
            all_embeddings.append(batch_embeddings)
            elapsed = time.perf_counter() - started_at
            print(
                f"[embeddings] Finished Vertex batch {batch_number}/{total_batches} "
                f"elapsed={elapsed:.2f}s progress={min(start + len(batch), total)}/{total}",
                flush=True,
            )
            if progress_callback:
                progress_callback(min(start + len(batch), total), total)

        return np.vstack(all_embeddings).astype("float32")

    def _embed_with_vertex(self, texts: List[str], task_type: str) -> np.ndarray:
        """Call the Vertex embed_content API once for *texts*.

        Tolerates the several response item shapes the SDK may return
        (attribute, dict, or legacy "embedding" field); raises RuntimeError
        when the response is empty or unparseable.
        """
        config = {
            "task_type": task_type,
        }
        if self.vertex_output_dimensionality:
            config["output_dimensionality"] = self.vertex_output_dimensionality

        response = self.client.models.embed_content(
            model=self.model_name,
            contents=texts,
            config=config,
        )
        embeddings = getattr(response, "embeddings", None)
        if not embeddings:
            raise RuntimeError("Vertex AI embeddings returned an empty response.")

        values = []
        for item in embeddings:
            if hasattr(item, "values"):
                values.append(item.values)
            elif isinstance(item, dict):
                values.append(item.get("values"))
            else:
                values.append(getattr(item, "embedding", None))

        if not values or any(vector is None for vector in values):
            raise RuntimeError("Vertex AI embeddings response could not be parsed.")

        return np.array(values, dtype="float32")

    def _encode_with_backoff(
        self,
        texts: List[str],
        batch_size: int = None,
        prompt_name: str = None,
    ) -> np.ndarray:
        """Encode locally, halving the batch size on memory errors.

        Retries until the encode succeeds or the batch size is already 1,
        at which point the error is re-raised. Embeddings are L2-normalized.
        """
        effective_batch_size = max(1, batch_size or self.batch_size)

        while True:
            try:
                encode_kwargs = {
                    "sentences": texts,
                    "batch_size": effective_batch_size,
                    "show_progress_bar": len(texts) > effective_batch_size,
                    "convert_to_numpy": True,
                    "normalize_embeddings": True,
                }
                if prompt_name:
                    encode_kwargs["prompt_name"] = prompt_name

                embeddings = self.model.encode(
                    **encode_kwargs,
                )
                return embeddings.astype("float32")
            except RuntimeError as exc:
                # Heuristic: CUDA/MPS OOM surfaces as RuntimeError with
                # "out of memory" (or an MPS-specific message) in the text.
                message = str(exc).lower()
                is_memory_error = "out of memory" in message or "mps" in message
                if not is_memory_error or effective_batch_size == 1:
                    raise
                print(
                    f"[embeddings] Retrying batch with smaller size due to memory pressure: "
                    f"{effective_batch_size} -> {max(1, effective_batch_size // 2)}",
                    flush=True,
                )
                effective_batch_size = max(1, effective_batch_size // 2)

    def get_embedding_dim(self) -> int:
        """Dimensionality of the vectors this generator produces."""
        return self.embedding_dim

    def _resolve_provider(self, configured_provider: str) -> str:
        """Resolve "auto" to a concrete provider; pass others through."""
        if configured_provider != "auto":
            return configured_provider
        if self._is_hf_space() or self._is_test_context():
            return "local"
        return "vertex_ai"

    def _resolve_model_name(self) -> str:
        """Pick a model name from env vars with provider-aware defaults."""
        explicit_model = os.getenv("EMBEDDING_MODEL")
        if explicit_model:
            return explicit_model
        if self.provider == "vertex_ai":
            return os.getenv("VERTEX_EMBEDDING_MODEL", "gemini-embedding-001")
        if self._is_hf_space() or self._is_test_context():
            # Constrained environments default to a small CPU-friendly model.
            return os.getenv(
                "LIGHTWEIGHT_LOCAL_EMBEDDING_MODEL",
                "sentence-transformers/all-MiniLM-L6-v2",
            )
        return os.getenv("LOCAL_EMBEDDING_MODEL", "nomic-ai/CodeRankEmbed")

    def _is_hf_space(self) -> bool:
        """True when running inside a Hugging Face Space (env-var sniff)."""
        return bool(os.getenv("SPACE_ID") or os.getenv("HF_SPACE_ID"))

    def _is_test_context(self) -> bool:
        """True under pytest or when APP_ENV/ENVIRONMENT is "test"."""
        app_env = os.getenv("APP_ENV", os.getenv("ENVIRONMENT", "")).lower()
        return app_env == "test" or bool(os.getenv("PYTEST_CURRENT_TEST"))

    def _optional_int(self, value: Optional[str]) -> Optional[int]:
        """Parse an optional int env value; blank/None becomes None."""
        if value is None or not str(value).strip():
            return None
        return int(value)
src/hybrid_search.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import re
3
+ from collections import defaultdict
4
+ from typing import List
5
+
6
+ from rank_bm25 import BM25Okapi
7
+ from sentence_transformers import CrossEncoder
8
+
9
+
10
+ TOKEN_RE = re.compile(r"[A-Za-z_][A-Za-z0-9_./:-]*")
11
+
12
+
13
+ def tokenize(text: str) -> List[str]:
14
+ return [token.lower() for token in TOKEN_RE.findall(text)]
15
+
16
+
17
class HybridSearchEngine:
    """Lexical (BM25) + semantic retrieval utilities with reranking."""

    def __init__(self, reranker_model: str = "cross-encoder/ms-marco-MiniLM-L-6-v2"):
        # Cross-encoder scores (query, passage) pairs for the final rerank.
        self.reranker = CrossEncoder(reranker_model)

    def build_for_repository(self, repo_id: int, chunks: List[dict]):
        """No-op: BM25 state is rebuilt per query from the chunk list."""
        return None

    def remove_repository(self, repo_id: int):
        """No-op counterpart of build_for_repository."""
        return None

    def bm25_search(self, chunks: List[dict], query: str, top_k: int = 12) -> List[dict]:
        """Score *chunks* against *query* with BM25; return the best top_k.

        Each hit is a copy of the chunk dict with bm25_score/bm25_rank added.
        """
        if not chunks:
            return []

        query_tokens = tokenize(query)
        if not query_tokens:
            return []

        corpus = [tokenize(chunk["searchable_text"]) for chunk in chunks]
        if not corpus:
            return []

        scores = BM25Okapi(corpus).get_scores(query_tokens)
        best = sorted(
            zip(chunks, scores),
            key=lambda pair: pair[1],
            reverse=True,
        )[:top_k]

        return [
            {**chunk, "bm25_score": float(score), "bm25_rank": position}
            for position, (chunk, score) in enumerate(best, start=1)
        ]

    def reciprocal_rank_fusion(
        self,
        lexical_results: List[dict],
        semantic_results: List[dict],
        top_k: int = 10,
        k: int = 60,
    ) -> List[dict]:
        """Fuse two ranked lists with reciprocal-rank fusion.

        Each appearance of an id contributes 1/(k + rank); entries carry
        the merged chunk fields plus the accumulated "rrf_score".
        """
        pooled = {}
        for ranked_list in (lexical_results, semantic_results):
            for position, item in enumerate(ranked_list, start=1):
                entry = pooled.setdefault(item["id"], {"rrf_score": 0.0})
                entry["rrf_score"] += 1.0 / (k + position)
                entry.update(item)

        ordered = sorted(pooled.values(), key=lambda entry: entry["rrf_score"], reverse=True)
        return ordered[:top_k]

    def rerank(self, query: str, candidates: List[dict], top_k: int = 6) -> List[dict]:
        """Re-score *candidates* with the cross-encoder; keep the best top_k."""
        if not candidates:
            return []

        passages = [
            [query, f'{item["file_path"]}\n{item.get("signature") or ""}\n{item["content"]}']
            for item in candidates
        ]
        scores = self.reranker.predict(passages)

        scored = [
            {**item, "rerank_score": float(score)}
            for item, score in zip(candidates, scores)
        ]
        return sorted(scored, key=lambda item: item["rerank_score"], reverse=True)[:top_k]

    @staticmethod
    def normalize_semantic_results(results: List[dict]) -> List[dict]:
        """Attach 1-based semantic_rank and a float semantic_score to hits."""
        return [
            {
                **item,
                "semantic_rank": position,
                "semantic_score": float(item.get("semantic_score", 0.0)),
            }
            for position, item in enumerate(results, start=1)
        ]
+ return normalized
src/rag_system.py ADDED
@@ -0,0 +1,1145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ from datetime import datetime, timedelta
4
+ from typing import Dict, List, Optional
5
+
6
+ from openai import OpenAI
7
+
8
+ from src.code_parser import CodeParser
9
+ from src.database import Repository, get_db_session, init_db, resolve_database_url
10
+ from src.embeddings import EmbeddingGenerator
11
+ from src.hybrid_search import HybridSearchEngine
12
+ from src.repo_fetcher import RepoFetcher
13
+ from src.vector_store import QdrantVectorStore
14
+
15
+
16
class SessionCancelledError(RuntimeError):
    """Raised when a repository's indexing session is cancelled mid-flight."""
18
+
19
+
20
+ class CodebaseRAGSystem:
21
+ def __init__(
22
+ self,
23
+ database_url: str = None,
24
+ repo_dir: str = None,
25
+ index_path: str = None,
26
+ ):
27
+ self.database_url = database_url or os.getenv(
28
+ "DATABASE_URL", "sqlite:///./codebase_rag.db"
29
+ )
30
+ self.database_url = resolve_database_url(self.database_url)
31
+ init_db(self.database_url)
32
+ print(f"[database] Using database_url={self.database_url}", flush=True)
33
+
34
+ self.repo_fetcher = RepoFetcher(base_dir=repo_dir)
35
+ self.parser = CodeParser()
36
+ self.embedder = EmbeddingGenerator()
37
+ self.vector_store = QdrantVectorStore(
38
+ embedding_dim=self.embedder.get_embedding_dim(),
39
+ index_path=index_path or "./data/faiss/codebase_index",
40
+ persist=False,
41
+ )
42
+ self.hybrid_search = HybridSearchEngine(
43
+ reranker_model=os.getenv(
44
+ "RERANKER_MODEL", "cross-encoder/ms-marco-MiniLM-L-6-v2"
45
+ )
46
+ )
47
+ self.app_env = os.getenv("APP_ENV", os.getenv("ENVIRONMENT", "local")).lower()
48
+ self.llm_provider = os.getenv("LLM_PROVIDER", "vertex_ai").lower()
49
+ self.llm_client = None
50
+ self.llm_model = ""
51
+ self._configure_llm()
52
+ self.session_ttl_minutes = int(os.getenv("SESSION_TTL_MINUTES", "120"))
53
+ self.indexing_progress: Dict[int, dict] = {}
54
+ self.repo_chunks: Dict[int, List[dict]] = {}
55
+ self.cancelled_repo_ids = set()
56
+ self.rebuild_indexes()
57
+
58
+ def rebuild_indexes(self):
59
+ session = get_db_session(self.database_url)
60
+ try:
61
+ self.vector_store.clear()
62
+ self.repo_chunks.clear()
63
+ self.indexing_progress.clear()
64
+ self.cancelled_repo_ids.clear()
65
+ repos = session.query(Repository).all()
66
+ self._delete_repositories(session, repos, track_cancellation=False)
67
+ self.cancelled_repo_ids.clear()
68
+ session.commit()
69
+ finally:
70
+ session.close()
71
+
72
    def create_or_reset_repository(self, github_url: str, session_key: str) -> Repository:
        """Create a queued Repository row for this session/URL pair, or reset an existing one.

        The registry key is built from the session key plus the URL, so the same
        GitHub repo can be indexed independently by different sessions. Resetting
        an existing row clears prior status, counters, and any cached index state
        (hybrid search, vector store, in-memory chunks).
        """
        info = self.repo_fetcher.parse_github_url(github_url)
        registry_key = self._build_registry_key(session_key, github_url)
        session = get_db_session(self.database_url)
        try:
            self._cleanup_expired_sessions(session)
            repo = session.query(Repository).filter_by(github_url=registry_key).first()
            if repo is None:
                repo = Repository(
                    github_url=registry_key,
                    source_url=github_url,
                    session_key=session_key,
                    session_expires_at=self._session_expiry(),
                    owner=info["owner"],
                    name=info["repo"],
                    branch=info["branch"],
                    status="queued",
                )
                session.add(repo)
                # Flush so the autogenerated id exists before touching caches.
                session.flush()
                self.cancelled_repo_ids.discard(repo.id)
            else:
                # Re-queue the existing row and wipe any previous index state.
                repo.source_url = github_url
                repo.session_key = session_key
                repo.session_expires_at = self._session_expiry()
                repo.owner = info["owner"]
                repo.name = info["repo"]
                repo.branch = info["branch"]
                repo.status = "queued"
                repo.error_message = None
                repo.file_count = 0
                repo.chunk_count = 0
                repo.indexed_at = None
                self.cancelled_repo_ids.discard(repo.id)
                self.hybrid_search.remove_repository(repo.id)
                self.vector_store.remove_repository(repo.id)
                self.repo_chunks.pop(repo.id, None)

            session.commit()
            session.refresh(repo)
            return repo
        finally:
            session.close()
115
+
116
    def index_repository(self, repo_id: int):
        """Clone, chunk, embed, and index a queued repository end to end.

        Phases (each reported via ``_set_progress``): clone the repo, parse its
        source files into chunks, embed the chunks, upload embeddings to the
        vector store, then persist counters and cache serialized chunks in
        memory. The cloned working tree is deleted after indexing. Cancellation
        (session teardown) is checked between phases via
        ``_ensure_repo_not_cancelled``; on any failure the partial index state
        is rolled back and the row is marked "failed" (or deleted if cancelled).

        Raises:
            ValueError: if no Repository row exists for ``repo_id``.
        """
        session = get_db_session(self.database_url)
        try:
            self._cleanup_expired_sessions(session)
            repo = session.query(Repository).filter_by(id=repo_id).first()
            if repo is None:
                raise ValueError("Repository not found")
            self._ensure_repo_not_cancelled(repo.id)
            print(f"[indexing] Starting repository index repo_id={repo.id}", flush=True)

            # Mark the row as in-flight and extend the session TTL.
            repo.status = "indexing"
            repo.error_message = None
            repo.session_expires_at = self._session_expiry()
            session.commit()
            self._set_progress(repo.id, phase="cloning", message="Cloning repository")

            clone_info = self.repo_fetcher.clone_repository(repo.source_url or repo.github_url)
            self._ensure_repo_not_cancelled(repo.id)
            # The clone is temporary; never persist a local path on the row.
            repo.local_path = None
            repo.branch = clone_info["branch"]
            print(
                f"[indexing] Repository cloned repo_id={repo.id} branch={repo.branch} "
                f"path={clone_info['local_path']}",
                flush=True,
            )

            source_files = list(self.repo_fetcher.iter_source_files(clone_info["local_path"]))
            total_files = len(source_files)
            print(
                f"[indexing] Found {total_files} source files for repo_id={repo.id}",
                flush=True,
            )
            self._set_progress(
                repo.id,
                phase="parsing",
                message=f"Scanning {total_files} source files",
                total_files=total_files,
                processed_files=0,
                discovered_chunks=0,
            )

            # Parse each file into chunks; files that yield no chunks are not
            # counted toward file_count but still advance the progress bar.
            chunk_payloads = []
            file_count = 0
            for index, file_path in enumerate(source_files, start=1):
                file_chunks = self.parser.chunk_file(str(file_path), clone_info["local_path"])
                if not file_chunks:
                    self._set_progress(
                        repo.id,
                        phase="parsing",
                        message=f"Parsed {index}/{total_files} files",
                        total_files=total_files,
                        processed_files=index,
                        discovered_chunks=len(chunk_payloads),
                    )
                    continue
                file_count += 1
                chunk_payloads.extend(file_chunks)
                self._set_progress(
                    repo.id,
                    phase="parsing",
                    message=f"Parsed {index}/{total_files} files",
                    total_files=total_files,
                    processed_files=index,
                    discovered_chunks=len(chunk_payloads),
                )

            searchable_texts = [chunk["searchable_text"] for chunk in chunk_payloads]
            print(
                f"[indexing] Parsed repo_id={repo.id} files={file_count} chunks={len(searchable_texts)}",
                flush=True,
            )
            self._set_progress(
                repo.id,
                phase="embedding",
                message=f"Embedding {len(searchable_texts)} chunks",
                total_files=total_files,
                processed_files=total_files,
                discovered_chunks=len(chunk_payloads),
                total_chunks=len(chunk_payloads),
                embedded_chunks=0,
            )
            # The embedder reports progress per batch via the callback.
            embeddings = self.embedder.embed_batch(
                searchable_texts,
                progress_callback=lambda completed, total: self._set_progress(
                    repo.id,
                    phase="embedding",
                    message=f"Embedding chunks ({completed}/{total})",
                    total_files=total_files,
                    processed_files=total_files,
                    discovered_chunks=len(chunk_payloads),
                    total_chunks=total,
                    embedded_chunks=completed,
                ),
            )
            self._ensure_repo_not_cancelled(repo.id)

            # Metadata stored alongside each vector for filtered search.
            vector_metadata = []
            for chunk in chunk_payloads:
                vector_metadata.append(
                    {
                        "repository_id": repo.id,
                        "file_path": chunk["file_path"],
                        "language": chunk["language"],
                        "symbol_name": chunk["symbol_name"],
                        "symbol_type": chunk["symbol_type"],
                        "line_start": chunk["line_start"],
                        "line_end": chunk["line_end"],
                        "signature": chunk["signature"],
                        "content": chunk["content"],
                    }
                )

            embedding_ids = self.vector_store.add_embeddings(embeddings, vector_metadata)
            print(
                f"[indexing] Uploaded {len(embedding_ids)} embeddings to vector store for repo_id={repo.id}",
                flush=True,
            )
            self._set_progress(
                repo.id,
                phase="saving",
                message="Saving chunks and search indexes",
                total_files=total_files,
                processed_files=total_files,
                discovered_chunks=len(chunk_payloads),
            )

            created_rows = []
            for chunk, embedding_id in zip(chunk_payloads, embedding_ids):
                row = {
                    **chunk,
                    "id": embedding_id,
                    "repository_id": repo.id,
                    "embedding_id": embedding_id,
                }
                created_rows.append(row)

            repo.status = "indexed"
            repo.file_count = file_count
            repo.chunk_count = len(created_rows)
            repo.indexed_at = datetime.utcnow()
            repo.session_expires_at = self._session_expiry()
            # Last-chance checks before committing the "indexed" state.
            self._ensure_repo_still_exists(session, repo.id)
            self._ensure_repo_not_cancelled(repo.id)
            session.commit()

            serialized = [self._serialize_chunk(chunk) for chunk in created_rows]
            self.repo_chunks[repo.id] = serialized
            self.vector_store.save()
            self.indexing_progress.pop(repo.id, None)
            self.cancelled_repo_ids.discard(repo.id)
            # The working tree is no longer needed once chunks are cached.
            self.repo_fetcher.cleanup_repository(clone_info["local_path"])
            print(f"[indexing] Repository index complete repo_id={repo.id}", flush=True)
        except Exception as exc:
            print(f"[indexing] Repository index failed repo_id={repo_id} error={exc}", flush=True)
            session.rollback()
            # Undo any partial index state for this repo.
            self.vector_store.remove_repository(repo_id)
            self.repo_chunks.pop(repo_id, None)
            self.hybrid_search.remove_repository(repo_id)
            repo = session.query(Repository).filter_by(id=repo_id).first()
            if repo:
                if repo_id in self.cancelled_repo_ids:
                    # Session ended mid-index: remove the row entirely.
                    session.delete(repo)
                else:
                    repo.status = "failed"
                    repo.error_message = str(exc)
                session.commit()
            try:
                # clone_info only exists if the clone phase was reached.
                if "clone_info" in locals():
                    self.repo_fetcher.cleanup_repository(clone_info["local_path"])
            except Exception:
                pass
            self.indexing_progress.pop(repo_id, None)
            # Cancellation is an expected exit, not an error worth re-raising.
            if isinstance(exc, SessionCancelledError):
                return
            raise
        finally:
            session.close()
293
+
294
    def list_repositories(self) -> List[dict]:
        """Unscoped listing is intentionally unsupported; use list_repositories_for_session."""
        raise NotImplementedError
296
+
297
+ def list_repositories_for_session(self, session_key: str) -> List[dict]:
298
+ session = get_db_session(self.database_url)
299
+ try:
300
+ self._cleanup_expired_sessions(session)
301
+ repos = (
302
+ session.query(Repository)
303
+ .filter_by(session_key=session_key)
304
+ .order_by(Repository.updated_at.desc())
305
+ .all()
306
+ )
307
+ self._touch_session(session, session_key)
308
+ return [self._serialize_repo(repo) for repo in repos]
309
+ finally:
310
+ session.close()
311
+
312
    def get_repository(self, repo_id: int) -> Optional[dict]:
        """Unscoped lookup is intentionally unsupported; use get_repository_for_session."""
        raise NotImplementedError
314
+
315
+ def get_repository_for_session(self, repo_id: int, session_key: str) -> Optional[dict]:
316
+ session = get_db_session(self.database_url)
317
+ try:
318
+ self._cleanup_expired_sessions(session)
319
+ repo = (
320
+ session.query(Repository)
321
+ .filter_by(id=repo_id, session_key=session_key)
322
+ .first()
323
+ )
324
+ self._touch_session(session, session_key)
325
+ return self._serialize_repo(repo) if repo else None
326
+ finally:
327
+ session.close()
328
+
329
    def answer_question(
        self,
        repo_id: int,
        session_key: str,
        question: str,
        top_k: int = 8,
        history: Optional[List[object]] = None,
    ) -> dict:
        """Answer a question about an indexed repository owned by this session.

        Runs hybrid retrieval (dense vector search + BM25, fused with reciprocal
        rank fusion, then reranked and intent-prioritized) over the repo's
        cached chunks, and asks the LLM to compose a grounded answer.

        Raises:
            ValueError: if the repo is missing for this session, not yet
                indexed, or its in-memory chunk cache is gone (e.g. after a
                process restart).
        """
        session = get_db_session(self.database_url)
        try:
            self._cleanup_expired_sessions(session)
            repo = (
                session.query(Repository)
                .filter_by(id=repo_id, session_key=session_key)
                .first()
            )
            if repo is None:
                raise ValueError("Repository not found")
            if repo.status != "indexed":
                raise ValueError("Repository is not ready for questions yet")
            if repo_id not in self.repo_chunks:
                raise ValueError("Session cache expired. Re-index the repository and try again.")
            self._touch_session(session, session_key)

            normalized_history = self._normalize_history(history or [])
            question_intent = self._question_intent(question)
            # Code-centric intents get a deeper candidate pool before pruning.
            search_depth = top_k * 4 if question_intent in {"api", "implementation", "cross_file", "setup"} else top_k * 2
            retrieval_query = self._build_retrieval_query(question, normalized_history)
            query_embedding = self.embedder.embed_text(retrieval_query)
            semantic_hits = []
            for score, meta in self.vector_store.search(query_embedding, k=search_depth, repo_filter=repo_id):
                serialized = dict(meta)
                serialized["semantic_score"] = score
                semantic_hits.append(serialized)

            lexical_hits = self.hybrid_search.bm25_search(
                self.repo_chunks[repo_id],
                retrieval_query,
                top_k=search_depth,
            )
            semantic_hits = self.hybrid_search.normalize_semantic_results(semantic_hits)
            fused = self.hybrid_search.reciprocal_rank_fusion(lexical_hits, semantic_hits, top_k=search_depth)
            # Rerank against the expanded query only for code-centric intents;
            # otherwise the raw question gives the reranker a cleaner signal.
            rerank_query = retrieval_query if question_intent in {"api", "implementation", "cross_file", "setup"} else question
            reranked = self.hybrid_search.rerank(rerank_query, fused, top_k=search_depth)
            reranked = self._prioritize_results(question, retrieval_query, reranked, top_k=top_k)
            reranked = self._select_answer_sources(question, reranked, top_k=top_k)

            answer = self._generate_answer(repo, question, reranked, normalized_history)

            return answer
        finally:
            session.close()
381
+
382
+ def end_session(self, session_key: str):
383
+ session = get_db_session(self.database_url)
384
+ try:
385
+ repos = session.query(Repository).filter_by(session_key=session_key).all()
386
+ self._delete_repositories(session, repos)
387
+ session.commit()
388
+ finally:
389
+ session.close()
390
+
391
    def _generate_answer(
        self,
        repo: Repository,
        question: str,
        sources: List[dict],
        history: Optional[List[dict]] = None,
    ) -> dict:
        """Compose the final LLM answer payload from retrieved sources.

        Builds a context window of up to 2500 chars per source, selects a
        system prompt variant based on question intent, and retries up to two
        times when the generation looks truncated (once asking for a rewrite,
        then once asking for a shorter answer). Returns answer text plus
        confidence, summary, citations, slim sources, and the serialized repo.
        """
        if not sources:
            return {
                "answer": "I could not find enough grounded evidence in the indexed codebase to answer that confidently.",
                "confidence": "low",
                "sources": [],
                "repo": self._serialize_repo(repo),
            }

        # context_blocks feed the prompt; slim_sources go back to the client.
        context_blocks = []
        slim_sources = []
        for index, source in enumerate(sources, start=1):
            context_blocks.append(
                "\n".join(
                    [
                        f"[Source {index}]",
                        f"File: {source['file_path']}",
                        f"Symbol: {source['symbol_name']}",
                        f"Lines: {source['line_start']}-{source['line_end']}",
                        source["content"][:2500],
                    ]
                )
            )
            slim_sources.append(
                {
                    "file_path": source["file_path"],
                    "language": source["language"],
                    "symbol_name": source["symbol_name"],
                    "symbol_type": source["symbol_type"],
                    "line_start": source["line_start"],
                    "line_end": source["line_end"],
                    "signature": source["signature"],
                    "snippet": source["content"],
                    "semantic_score": round(float(source.get("semantic_score", 0.0)), 4),
                    "bm25_score": round(float(source.get("bm25_score", 0.0)), 4),
                    "rrf_score": round(float(source.get("rrf_score", 0.0)), 4),
                    "rerank_score": round(float(source.get("rerank_score", 0.0)), 4),
                }
            )

        wants_repo_overview = self._is_repo_overview_question(question)
        question_intent = self._question_intent(question)

        system_prompt = """
You are answering questions as a knowledgeable teammate who has carefully read this repository.

Rules:
1. Use only the supplied repository context.
2. Answer conversationally and directly, as if the repo is explaining itself to the user.
3. Do not say "Based on the provided context", "The repository is about", or similar throat-clearing phrases.
4. Be concrete about files, functions, and behavior.
5. If evidence is partial, clearly separate what is certain from what is inferred.
6. Respond in Markdown, not JSON.
7. Keep the answer complete. Do not stop mid-sentence.
8. Use short sections or bullets only when they genuinely help readability.
9. Do not leave unfinished headings, dangling bullets, or trailing markdown markers like #, ##, or ###.
10. Do not include inline citation markers like [Source 1] in the prose. The UI already shows sources separately.
11. Do not make claims that are not directly supported by the supplied sources.
12. Prefer the most canonical source files for API and implementation questions, such as package exports, core modules, and session/query code, over tutorial prose when they disagree in specificity.
13. Keep the answer tight. Lead with the direct answer, then add only the most important supporting detail.
"""

        # Intent-specific rule extensions appended to the base prompt.
        if wants_repo_overview:
            system_prompt += """
14. For repository overview questions, lead with a direct one or two sentence summary of what the repo does.
15. Prioritize README and top-level documentation when they are present, then use code to support the explanation.
16. Mention the main workflow, core stack, and any important product constraints the user would care about.
17. Keep the answer polished and self-contained, like the overview a real user expects when they ask what a repo is about.
"""
        elif question_intent in {"api", "implementation", "cross_file", "error_handling", "setup"}:
            system_prompt += """
14. For API, implementation, setup, and cross-file questions, prefer the smallest correct answer that is directly supported by code.
15. If a detail comes only from docs or examples and not from the canonical implementation, say that clearly instead of presenting it as core behavior.
16. When describing exports or code paths, name the file first and keep the explanation precise.
17. Default to one short paragraph plus at most 3 short bullets. Avoid long explanatory walkthroughs unless the question explicitly asks for depth.
"""

        joined_context = "\n\n".join(context_blocks)
        user_prompt = f"""
Repository: {repo.owner}/{repo.name}
Question: {question}
Recent conversation:
{self._format_history(history or [])}

Context:
{joined_context}
"""

        answer_text, finish_reason = self._generate_markdown_response(system_prompt, user_prompt)
        # First retry: ask the model to rewrite its own truncated draft.
        if self._looks_incomplete(answer_text, finish_reason):
            repair_prompt = f"""
The draft answer below appears to be cut off or incomplete.
Rewrite it into a complete final answer using the same repository context and rules.

Draft answer:
{answer_text}
"""
            answer_text, finish_reason = self._generate_markdown_response(
                system_prompt,
                f"{user_prompt.strip()}\n\n{repair_prompt.strip()}",
            )
            # Second retry: fall back to a deliberately shorter answer.
            if self._looks_incomplete(answer_text, finish_reason):
                short_prompt = f"""
Answer the question again, but keep it concise and complete.
Use 2 short paragraphs or 4-6 bullets max.
Do not leave the answer unfinished.
"""
                answer_text, _ = self._generate_markdown_response(
                    system_prompt,
                    f"{user_prompt.strip()}\n\n{short_prompt.strip()}",
                )
        answer_text = self._finalize_answer(answer_text)
        confidence = self._estimate_confidence(sources)
        summary = " ".join(answer_text.split())[:160] if answer_text else ""
        # Cite at most the top 4 sources back to the client.
        citations = [
            {
                "source": index,
                "reason": f"Relevant context from {source['file_path']}",
            }
            for index, source in enumerate(sources[: min(len(sources), 4)], start=1)
        ]

        return {
            "answer": answer_text,
            "confidence": confidence,
            "summary": summary,
            "citations": citations,
            "sources": slim_sources,
            "repo": self._serialize_repo(repo),
        }
527
+
528
    def _configure_llm(self):
        """Initialize the chat LLM client/model from environment configuration.

        Two providers are supported, selected by ``LLM_PROVIDER``:
        "groq" uses the OpenAI-compatible client against Groq's endpoint;
        "vertex_ai" uses the google-genai client in Vertex mode.

        Raises:
            RuntimeError: when the google-genai package is missing, the
                required GOOGLE_CLOUD_PROJECT is unset, or the provider
                string is unrecognized.
        """
        if self.llm_provider == "groq":
            self.llm_client = OpenAI(
                api_key=os.getenv("GROQ_API_KEY"),
                base_url=os.getenv("GROQ_BASE_URL", "https://api.groq.com/openai/v1"),
            )
            self.llm_model = os.getenv("GROQ_MODEL", "llama-3.3-70b-versatile")
            return

        if self.llm_provider == "vertex_ai":
            # Imported lazily so Groq-only deployments do not need google-genai.
            try:
                from google import genai
            except ImportError as exc:
                raise RuntimeError(
                    "Vertex AI LLM support requires the `google-genai` package. "
                    "Install server dependencies before running local or eval queries."
                ) from exc

            project = os.getenv("GOOGLE_CLOUD_PROJECT")
            location = os.getenv("GOOGLE_CLOUD_LOCATION", "us-central1")
            if not project:
                raise RuntimeError(
                    "GOOGLE_CLOUD_PROJECT must be set when using Vertex AI Gemini."
                )

            self.llm_client = genai.Client(
                vertexai=True,
                project=project,
                location=location,
            )
            self.llm_model = os.getenv("VERTEX_LLM_MODEL", "gemini-2.5-pro")
            return

        raise RuntimeError(f"Unsupported LLM provider: {self.llm_provider}")
562
+
563
    def _generate_markdown_response(self, system_prompt: str, user_prompt: str) -> tuple[str, str]:
        """Call the configured LLM and return (normalized_markdown, finish_reason).

        finish_reason is provider-specific (lower/upper case varies) and may be
        empty; callers use it to detect truncated generations.
        """
        if self.llm_provider == "groq":
            response = self.llm_client.chat.completions.create(
                model=self.llm_model,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt},
                ],
                temperature=0.1,
                max_tokens=1600,
            )
            content = response.choices[0].message.content
            finish_reason = getattr(response.choices[0], "finish_reason", "") or ""
            return self._normalize_markdown_answer(content), str(finish_reason)

        # Vertex AI Gemini path: one combined prompt, no separate system role.
        response = self.llm_client.models.generate_content(
            model=self.llm_model,
            contents=f"{system_prompt.strip()}\n\n{user_prompt.strip()}",
            config={
                "temperature": 0.1,
                "max_output_tokens": 2200,
            },
        )
        if not getattr(response, "text", None):
            raise RuntimeError("Vertex AI Gemini returned an empty response.")
        finish_reason = ""
        candidates = getattr(response, "candidates", None) or []
        if candidates:
            finish_reason = str(getattr(candidates[0], "finish_reason", "") or "")
        return self._normalize_markdown_answer(response.text), finish_reason
593
+
594
+ @staticmethod
595
+ def _normalize_markdown_answer(raw_text: str) -> str:
596
+ cleaned = (raw_text or "").strip()
597
+ cleaned = re.sub(r"^```(?:markdown|md)?\s*|\s*```$", "", cleaned, flags=re.IGNORECASE)
598
+ cleaned = re.sub(r"\s*\[(?:Source\s+\d+(?:\s*,\s*Source\s+\d+)*)\]", "", cleaned, flags=re.IGNORECASE)
599
+ cleaned = re.sub(
600
+ r"^(?:based on the provided context[,:\s-]*|from the provided context[,:\s-]*)",
601
+ "",
602
+ cleaned,
603
+ flags=re.IGNORECASE,
604
+ ).strip()
605
+ cleaned = re.sub(
606
+ r"\n(?:#{1,6}|[-*])\s*$",
607
+ "",
608
+ cleaned,
609
+ flags=re.MULTILINE,
610
+ ).strip()
611
+ cleaned = re.sub(r"(?:\n\s*){3,}", "\n\n", cleaned)
612
+ cleaned = cleaned.strip()
613
+ if not cleaned:
614
+ return "I found relevant code context, but the model returned an empty response."
615
+ return cleaned
616
+
617
+ @staticmethod
618
+ def _finalize_answer(answer_text: str) -> str:
619
+ cleaned = (answer_text or "").strip()
620
+ if not cleaned:
621
+ return "I found relevant code context, but the model returned an empty response."
622
+
623
+ # If the tail still looks truncated, trim back to the last complete sentence or list item
624
+ if CodebaseRAGSystem._looks_incomplete(cleaned):
625
+ sentence_match = re.search(r"(?s)^.*[.!?](?:['\"\)`\]]+)?", cleaned)
626
+ if sentence_match:
627
+ trimmed = sentence_match.group(0).strip()
628
+ if len(trimmed.split()) >= 12:
629
+ return trimmed
630
+
631
+ lines = cleaned.splitlines()
632
+ while lines and CodebaseRAGSystem._looks_incomplete(lines[-1]):
633
+ lines.pop()
634
+ candidate = "\n".join(line for line in lines if line.strip()).strip()
635
+ if candidate:
636
+ return candidate
637
+
638
+ return cleaned
639
+
640
+ @staticmethod
641
+ def _looks_incomplete(answer_text: str, finish_reason: str = "") -> bool:
642
+ cleaned = (answer_text or "").strip()
643
+ if not cleaned:
644
+ return True
645
+ finish_reason = (finish_reason or "").strip().lower()
646
+ if finish_reason and finish_reason not in {"stop", "stopsequence", "finish_reason_unspecified"}:
647
+ return True
648
+ if cleaned.endswith(("#", "-", "*", ":", "(", "[", "/", "`")):
649
+ return True
650
+ if cleaned.endswith(("[source", "[source 1", "[source 2", "[source 3", "[source 4")):
651
+ return True
652
+ if cleaned.count("```") % 2 != 0:
653
+ return True
654
+ if cleaned.count("(") > cleaned.count(")"):
655
+ return True
656
+ if cleaned.count("[") > cleaned.count("]"):
657
+ return True
658
+ tokens = re.findall(r"\b[\w'-]+\b", cleaned.lower())
659
+ if not tokens:
660
+ return True
661
+ if tokens[-1] in {"a", "an", "the", "to", "for", "with", "of", "in", "on", "from", "about"}:
662
+ return True
663
+ if len(tokens) >= 20 and cleaned[-1] not in {".", "!", "?", "\"", "'", "`"}:
664
+ return True
665
+ return False
666
+
667
+ @staticmethod
668
+ def _estimate_confidence(sources: List[dict]) -> str:
669
+ if not sources:
670
+ return "low"
671
+
672
+ top = sources[0]
673
+ rerank = float(top.get("rerank_score", 0.0))
674
+ semantic = float(top.get("semantic_score", 0.0))
675
+
676
+ if len(sources) >= 3 and (rerank >= 0.2 or semantic >= 0.75):
677
+ return "high"
678
+ if rerank >= 0.05 or semantic >= 0.45:
679
+ return "medium"
680
+ return "low"
681
+
682
+ def _serialize_repo(self, repo: Repository) -> dict:
683
+ payload = {
684
+ "id": repo.id,
685
+ "github_url": repo.source_url or repo.github_url,
686
+ "owner": repo.owner,
687
+ "name": repo.name,
688
+ "branch": repo.branch,
689
+ "local_path": repo.local_path,
690
+ "status": repo.status,
691
+ "error_message": repo.error_message,
692
+ "file_count": repo.file_count,
693
+ "chunk_count": repo.chunk_count,
694
+ "indexed_at": repo.indexed_at.isoformat() if repo.indexed_at else None,
695
+ "created_at": repo.created_at.isoformat() if repo.created_at else None,
696
+ "updated_at": repo.updated_at.isoformat() if repo.updated_at else None,
697
+ }
698
+ progress = self.indexing_progress.get(repo.id)
699
+ if progress:
700
+ payload["progress"] = progress
701
+ return payload
702
+
703
+ def _set_progress(self, repo_id: int, **progress):
704
+ self.indexing_progress[repo_id] = {
705
+ **self.indexing_progress.get(repo_id, {}),
706
+ **progress,
707
+ "updated_at": datetime.utcnow().isoformat(),
708
+ }
709
+
710
+ def _touch_session(self, session, session_key: str):
711
+ expiry = self._session_expiry()
712
+ repos = session.query(Repository).filter_by(session_key=session_key).all()
713
+ for repo in repos:
714
+ repo.session_expires_at = expiry
715
+ session.commit()
716
+
717
+ def _cleanup_expired_sessions(self, session):
718
+ now = datetime.utcnow()
719
+ expired = (
720
+ session.query(Repository)
721
+ .filter(Repository.session_expires_at.is_not(None))
722
+ .filter(Repository.session_expires_at < now)
723
+ .all()
724
+ )
725
+ if not expired:
726
+ return
727
+ self._delete_repositories(session, expired)
728
+ session.commit()
729
+
730
+ def _delete_repositories(
731
+ self,
732
+ session,
733
+ repos: List[Repository],
734
+ track_cancellation: bool = True,
735
+ ):
736
+ repo_ids = [repo.id for repo in repos]
737
+ for repo_id in repo_ids:
738
+ if track_cancellation:
739
+ self.cancelled_repo_ids.add(repo_id)
740
+ self.hybrid_search.remove_repository(repo_id)
741
+ self.vector_store.remove_repository(repo_id)
742
+ self.repo_chunks.pop(repo_id, None)
743
+ self.indexing_progress.pop(repo_id, None)
744
+ for repo in repos:
745
+ session.delete(repo)
746
+
747
+ def _ensure_repo_not_cancelled(self, repo_id: int):
748
+ if repo_id in self.cancelled_repo_ids:
749
+ raise SessionCancelledError("Session ended before indexing completed.")
750
+
751
    def _build_retrieval_query(self, question: str, history: List[dict]) -> str:
        """Build the retrieval query text, folding in conversation context.

        Overview questions get fixed overview keywords appended. Otherwise the
        question may be expanded with intent hints, and short/code-seeking
        questions are treated as follow-ups: the most recent user turn and a
        prefix of the last substantive assistant answer are appended so
        retrieval keeps the conversational topic.
        """
        normalized = " ".join(question.strip().split())
        if self._is_repo_overview_question(normalized):
            return "\n".join(
                [
                    normalized,
                    "repository overview purpose main workflow architecture README features stack",
                ]
            )
        if not history:
            return normalized

        # Most-recent-first lists of non-empty user and assistant turns.
        recent_user = [
            turn["content"].strip()
            for turn in reversed(history)
            if turn.get("role") == "user" and turn.get("content", "").strip()
        ]
        recent_assistant = [
            turn["content"].strip()
            for turn in reversed(history)
            if turn.get("role") == "assistant" and turn.get("content", "").strip()
            and self._is_substantive_assistant_message(turn.get("content", ""))
        ]

        # Heuristic: short questions, interrogative openers, or explicit
        # code/snippet requests are treated as follow-ups to the last turn.
        is_follow_up = (
            len(normalized.split()) <= 6
            or bool(re.fullmatch(r"(give|show|where|which|how|what)(?:\s+.+)?", normalized.lower()))
            or any(token in normalized.lower() for token in {"code", "snippet", "implementation"})
        )
        if not is_follow_up or not recent_user:
            return self._expand_query_for_intent(normalized)

        parts = [self._expand_query_for_intent(normalized)]
        if recent_user:
            parts.append(f"Follow-up to: {recent_user[0]}")
        if recent_assistant:
            # Cap the previous answer to 300 chars to bound query length.
            parts.append(f"Previous answer: {recent_assistant[0][:300]}")
        return "\n".join(parts)
789
+
790
    def _prioritize_results(
        self,
        question: str,
        retrieval_query: str,
        results: List[dict],
        top_k: int,
    ) -> List[dict]:
        """Reorder fused retrieval results so intent-matching sources lead.

        Sorts by a tuple of intent-driven boosts (canonical file paths, doc
        priority, docs-vs-code preference) ahead of the raw retrieval scores.
        For code questions, code chunks fill the list first and doc chunks are
        limited — but at least one doc chunk is kept as backup context.
        """
        combined_query = f"{question} {retrieval_query}".lower()
        wants_code = any(
            token in combined_query
            for token in {"code", "snippet", "implementation", "function", "class", "import"}
        )
        wants_docs = self._is_documentation_query(combined_query)
        wants_repo_overview = self._is_repo_overview_question(question) or self._is_repo_overview_question(
            retrieval_query
        )
        question_intent = self._question_intent(question)

        def sort_key(item: dict):
            # Higher tuples sort first (reverse=True); boost components
            # deliberately dominate the raw score components at the tail.
            is_doc = self._is_doc_source(item)
            return (
                self._canonical_path_priority(item, question),
                self._doc_priority(item),
                1 if wants_repo_overview and is_doc else 0,
                1 if (wants_docs and is_doc) or (not wants_docs and not is_doc) else 0,
                1 if wants_code and not is_doc else 0,
                1 if question_intent in {"api", "implementation", "cross_file", "error_handling", "setup"} and not is_doc else 0,
                float(item.get("rerank_score", 0.0)),
                float(item.get("semantic_score", 0.0)),
                float(item.get("bm25_score", 0.0)),
            )

        ranked = sorted(results, key=sort_key, reverse=True)
        if wants_docs or wants_repo_overview:
            return ranked[:top_k]

        # Code questions: code chunks first, then at most a few doc chunks.
        selected = []
        doc_items = []
        for item in ranked:
            if self._is_doc_source(item):
                doc_items.append(item)
                continue
            selected.append(item)
            if len(selected) == top_k:
                return selected

        selected.extend(doc_items[: max(1, top_k - len(selected))])
        return selected[:top_k]
838
+
839
+ def _select_answer_sources(
840
+ self,
841
+ question: str,
842
+ results: List[dict],
843
+ top_k: int,
844
+ ) -> List[dict]:
845
+ if not results:
846
+ return []
847
+
848
+ intent = self._question_intent(question)
849
+ max_per_file = 2 if intent in {"overview", "docs"} else 1
850
+ selected = []
851
+ file_counts = {}
852
+
853
+ for item in results:
854
+ file_path = item.get("file_path", "")
855
+ count = file_counts.get(file_path, 0)
856
+ if count >= max_per_file:
857
+ continue
858
+ selected.append(item)
859
+ file_counts[file_path] = count + 1
860
+ if len(selected) == top_k:
861
+ break
862
+
863
+ if len(selected) < top_k:
864
+ for item in results:
865
+ if item in selected:
866
+ continue
867
+ selected.append(item)
868
+ if len(selected) == top_k:
869
+ break
870
+
871
+ return selected
872
+
873
+ @staticmethod
874
+ def _is_documentation_query(query: str) -> bool:
875
+ return any(
876
+ token in query
877
+ for token in {
878
+ "readme",
879
+ "docs",
880
+ "documentation",
881
+ "setup",
882
+ "install",
883
+ "installation",
884
+ "usage",
885
+ "overview",
886
+ "what is this repo",
887
+ "what is the repository about",
888
+ "what is the repo about",
889
+ "what does the repo do",
890
+ "what does this repo do",
891
+ "repo summary",
892
+ "repository summary",
893
+ "project summary",
894
+ "feature",
895
+ "features",
896
+ "architecture",
897
+ }
898
+ )
899
+
900
+ @staticmethod
901
+ def _question_intent(question: str) -> str:
902
+ normalized = " ".join((question or "").lower().split())
903
+ if not normalized:
904
+ return "general"
905
+ if CodebaseRAGSystem._is_repo_overview_question(normalized):
906
+ return "overview"
907
+ if any(token in normalized for token in {"error", "invalid", "conflict", "raises", "guard against"}):
908
+ return "error_handling"
909
+ if any(token in normalized for token in {"how are", "how does", "flow", "across files", "code path"}):
910
+ return "cross_file"
911
+ if any(token in normalized for token in {"export", "expose", "import", "public api"}):
912
+ return "api"
913
+ if any(token in normalized for token in {"create", "setup", "install", "configuration", "metadata", "table"}):
914
+ return "setup"
915
+ if any(token in normalized for token in {"function", "method", "class", "implementation", "does ", "what is special"}):
916
+ return "implementation"
917
+ if CodebaseRAGSystem._is_documentation_query(normalized):
918
+ return "docs"
919
+ return "general"
920
+
921
+ def _expand_query_for_intent(self, question: str) -> str:
922
+ normalized = " ".join((question or "").split())
923
+ lowered = normalized.lower()
924
+ hints = []
925
+
926
+ if any(token in lowered for token in {"export", "expose", "import"}):
927
+ hints.extend(["package exports", "__init__.py", "public api", "re-export"])
928
+ if "how is select exposed to users in sqlmodel" in lowered:
929
+ hints.extend(
930
+ [
931
+ "sqlmodel/__init__.py",
932
+ "sqlmodel/sql/expression.py",
933
+ "select re-export",
934
+ "top-level select import",
935
+ ]
936
+ )
937
+ if "select" in lowered:
938
+ hints.extend(
939
+ [
940
+ "select",
941
+ "expression",
942
+ "query builder",
943
+ "public api",
944
+ "sqlmodel/sql/expression.py",
945
+ "sqlmodel/__init__.py",
946
+ "re-export",
947
+ "top-level import",
948
+ ]
949
+ )
950
+ if "session.exec" in lowered or ("session" in lowered and "exec" in lowered):
951
+ hints.extend(["session exec", "orm/session.py", "asyncio/session.py"])
952
+ if "relationship" in lowered:
953
+ hints.extend(["relationship", "Relationship", "main.py"])
954
+ if "field" in lowered:
955
+ hints.extend(["Field", "FieldInfo", "main.py"])
956
+ if "create_engine" in lowered:
957
+ hints.extend(["create_engine", "__init__.py", "re-export"])
958
+ if "create_all" in lowered or "metadata" in lowered:
959
+ hints.extend(
960
+ [
961
+ "metadata create_all",
962
+ "table creation",
963
+ "engine",
964
+ "SQLModel.metadata",
965
+ "README.md",
966
+ "sqlmodel/main.py",
967
+ "docs_src",
968
+ ]
969
+ )
970
+ if "__init__" in lowered or "exports" in lowered:
971
+ hints.extend(["sqlmodel/__init__.py", "package exports", "public api"])
972
+
973
+ if not hints:
974
+ return normalized
975
+ return "\n".join([normalized, " ".join(hints)])
976
+
977
+ @staticmethod
978
+ def _is_repo_overview_question(question: str) -> bool:
979
+ normalized = " ".join((question or "").lower().split())
980
+ return any(
981
+ phrase in normalized
982
+ for phrase in {
983
+ "what is the repo about",
984
+ "what is this repo about",
985
+ "what does the repo do",
986
+ "what does this repo do",
987
+ "what is the repository about",
988
+ "what does the repository do",
989
+ "what is this project about",
990
+ "what does this project do",
991
+ "repo summary",
992
+ "repository summary",
993
+ "project summary",
994
+ "summarize the repo",
995
+ "summarize this repo",
996
+ "repo overview",
997
+ "repository overview",
998
+ "project overview",
999
+ }
1000
+ )
1001
+
1002
+ @staticmethod
1003
+ def _is_doc_source(item: dict) -> bool:
1004
+ file_path = (item.get("file_path") or "").lower()
1005
+ language = (item.get("language") or "").lower()
1006
+ return language == "text" or file_path.endswith(".md") or "/readme" in file_path
1007
+
1008
+ @staticmethod
1009
+ def _doc_priority(item: dict) -> int:
1010
+ file_path = (item.get("file_path") or "").lower()
1011
+ if file_path in {"readme.md", "readme"}:
1012
+ return 3
1013
+ if file_path.startswith("docs/") or "/docs/" in file_path:
1014
+ return 2
1015
+ if file_path.endswith(".md"):
1016
+ return 1
1017
+ return 0
1018
+
1019
+ def _canonical_path_priority(self, item: dict, question: str) -> int:
1020
+ file_path = (item.get("file_path") or "").lower()
1021
+ normalized = " ".join((question or "").lower().split())
1022
+ score = 0
1023
+
1024
+ if file_path == "sqlmodel/__init__.py":
1025
+ score += 4 if any(token in normalized for token in {"export", "expose", "import", "create_engine", "select"}) else 0
1026
+ if file_path == "sqlmodel/sql/expression.py":
1027
+ score += 5 if "select" in normalized else 0
1028
+ if file_path == "sqlmodel/sql/_expression_select_gen.py":
1029
+ score += 2 if "select" in normalized else 0
1030
+ if file_path == "sqlmodel/sql/_expression_select_cls.py":
1031
+ score += 2 if "select" in normalized else 0
1032
+ if file_path == "readme.md":
1033
+ score += 4 if any(token in normalized for token in {"metadata", "create_all", "workflow", "readme"}) else 0
1034
+ if file_path.startswith("docs_src/"):
1035
+ score += 3 if any(token in normalized for token in {"metadata", "create_all", "table", "workflow"}) else 0
1036
+ if file_path == "sqlmodel/main.py":
1037
+ score += 3 if any(token in normalized for token in {"field", "relationship", "metadata", "table", "sqlmodel"}) else 0
1038
+
1039
+ if "__init__.py" in file_path:
1040
+ score += 2 if any(token in normalized for token in {"export", "expose", "import", "public api"}) else 0
1041
+ if any(token in normalized for token in {"select", "expression"}):
1042
+ if "expression" in file_path or "_expression_select" in file_path:
1043
+ score += 3
1044
+ if normalized == "how is select exposed to users in sqlmodel?":
1045
+ if file_path == "sqlmodel/__init__.py":
1046
+ score += 6
1047
+ if file_path == "sqlmodel/sql/expression.py":
1048
+ score += 6
1049
+ if "session" in normalized:
1050
+ if file_path.endswith("session.py") or "/session.py" in file_path:
1051
+ score += 3
1052
+ if "relationship" in normalized and file_path.endswith("main.py"):
1053
+ score += 2
1054
+ if "field" in normalized and file_path.endswith("main.py"):
1055
+ score += 2
1056
+ if any(token in normalized for token in {"create_engine", "export", "expose"}) and "__init__.py" in file_path:
1057
+ score += 2
1058
+ if any(token in normalized for token in {"metadata", "create_all", "table"}) and (
1059
+ "docs_src/" in file_path or file_path.endswith("main.py") or file_path == "readme.md"
1060
+ ):
1061
+ score += 2
1062
+ if self._is_doc_source(item) and self._question_intent(question) in {
1063
+ "api",
1064
+ "implementation",
1065
+ "cross_file",
1066
+ "error_handling",
1067
+ "setup",
1068
+ }:
1069
+ score -= 1
1070
+
1071
+ return score
1072
+
1073
+ @staticmethod
1074
+ def _is_substantive_assistant_message(content: str) -> bool:
1075
+ normalized = " ".join((content or "").strip().lower().split())
1076
+ if len(normalized) < 24:
1077
+ return False
1078
+ if normalized in {
1079
+ "hey, what question do you have for me today?",
1080
+ "ask a question",
1081
+ }:
1082
+ return False
1083
+ return True
1084
+
1085
+ @staticmethod
1086
+ def _normalize_history(history: List[object]) -> List[dict]:
1087
+ normalized = []
1088
+ for turn in history:
1089
+ if isinstance(turn, dict):
1090
+ role = turn.get("role")
1091
+ content = turn.get("content")
1092
+ else:
1093
+ role = getattr(turn, "role", None)
1094
+ content = getattr(turn, "content", None)
1095
+
1096
+ if not role or not content:
1097
+ continue
1098
+
1099
+ normalized.append(
1100
+ {
1101
+ "role": str(role),
1102
+ "content": str(content),
1103
+ }
1104
+ )
1105
+ return normalized
1106
+
1107
+ @staticmethod
1108
+ def _format_history(history: List[dict]) -> str:
1109
+ if not history:
1110
+ return "None"
1111
+ lines = []
1112
+ for turn in history[-4:]:
1113
+ role = turn.get("role", "user").capitalize()
1114
+ content = " ".join(turn.get("content", "").split())
1115
+ if content:
1116
+ lines.append(f"{role}: {content[:400]}")
1117
+ return "\n".join(lines) if lines else "None"
1118
+
1119
+ @staticmethod
1120
+ def _ensure_repo_still_exists(session, repo_id: int):
1121
+ if session.query(Repository.id).filter_by(id=repo_id).first() is None:
1122
+ raise RuntimeError("Repository was removed before indexing completed.")
1123
+
1124
+ def _session_expiry(self) -> datetime:
1125
+ return datetime.utcnow() + timedelta(minutes=self.session_ttl_minutes)
1126
+
1127
+ @staticmethod
1128
+ def _build_registry_key(session_key: str, github_url: str) -> str:
1129
+ return f"{session_key}::{github_url}"
1130
+
1131
+ @staticmethod
1132
+ def _serialize_chunk(chunk: dict) -> dict:
1133
+ return {
1134
+ "id": chunk["id"],
1135
+ "file_path": chunk["file_path"],
1136
+ "language": chunk["language"],
1137
+ "symbol_name": chunk["symbol_name"],
1138
+ "symbol_type": chunk["symbol_type"],
1139
+ "line_start": chunk["line_start"],
1140
+ "line_end": chunk["line_end"],
1141
+ "signature": chunk["signature"],
1142
+ "content": chunk["content"],
1143
+ "searchable_text": chunk["searchable_text"],
1144
+ "metadata_json": chunk.get("metadata_json") or {},
1145
+ }
src/repo_fetcher.py ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import shutil
4
+ import subprocess
5
+ import tempfile
6
+ from pathlib import Path
7
+ from urllib.parse import urlparse
8
+
9
+
10
# File extensions considered indexable; iter_source_files skips everything else.
SUPPORTED_EXTENSIONS = {
    ".py",
    ".js",
    ".jsx",
    ".ts",
    ".tsx",
    ".java",
    ".go",
    ".rs",
    ".md",
    ".json",
    ".yml",
    ".yaml",
    ".toml",
    ".sh",
    ".css",
    ".html",
}

# Generated lockfiles: large, noisy, and useless for retrieval.
IGNORED_FILENAMES = {
    "package-lock.json",
    "yarn.lock",
    "pnpm-lock.yaml",
    "bun.lockb",
}

# Directories holding VCS internals, build output, or vendored dependencies.
IGNORED_DIRS = {
    ".git",
    ".next",
    ".turbo",
    "dist",
    "build",
    "coverage",
    "node_modules",
    "vendor",
    ".venv",
    "venv",
    "__pycache__",
}

# Per-file size cap in bytes; larger files are skipped (presumably generated
# or otherwise too big to chunk usefully — see iter_source_files).
MAX_FILE_SIZE_BYTES = 250_000
51
+
52
+
53
+ class RepoFetcher:
54
+ def __init__(self, base_dir: str = None):
55
+ repo_cache_dir = base_dir or os.getenv(
56
+ "REPO_CACHE_DIR",
57
+ str(Path(tempfile.gettempdir()) / "codecompass-repos"),
58
+ )
59
+ self.base_dir = Path(repo_cache_dir)
60
+ self.base_dir.mkdir(parents=True, exist_ok=True)
61
+
62
+ def parse_github_url(self, github_url: str) -> dict:
63
+ parsed = urlparse(github_url)
64
+ path = parsed.path.rstrip("/")
65
+ if parsed.netloc not in {"github.com", "www.github.com"}:
66
+ raise ValueError("Only github.com URLs are supported")
67
+
68
+ parts = [part for part in path.split("/") if part]
69
+ if len(parts) < 2:
70
+ raise ValueError("GitHub URL must include owner and repository name")
71
+
72
+ owner = parts[0]
73
+ repo = parts[1].removesuffix(".git")
74
+ branch = "main"
75
+
76
+ if len(parts) >= 4 and parts[2] in {"tree", "blob"}:
77
+ branch = parts[3]
78
+
79
+ slug = re.sub(r"[^a-zA-Z0-9_.-]+", "-", f"{owner}-{repo}")
80
+ repo_url = f"https://github.com/{owner}/{repo}"
81
+ return {
82
+ "owner": owner,
83
+ "repo": repo,
84
+ "branch": branch,
85
+ "slug": slug,
86
+ "repo_url": repo_url,
87
+ }
88
+
89
+ def clone_repository(self, github_url: str) -> dict:
90
+ info = self.parse_github_url(github_url)
91
+ target_dir = self.base_dir / info["slug"]
92
+
93
+ if target_dir.exists():
94
+ shutil.rmtree(target_dir)
95
+
96
+ clone_cmd = [
97
+ "git",
98
+ "clone",
99
+ "--depth",
100
+ "1",
101
+ "--branch",
102
+ info["branch"],
103
+ github_url,
104
+ str(target_dir),
105
+ ]
106
+
107
+ clone_cmd[6] = info["repo_url"]
108
+
109
+ result = subprocess.run(clone_cmd, capture_output=True, text=True)
110
+ if result.returncode != 0 and info["branch"] != "main":
111
+ info["branch"] = "main"
112
+ clone_cmd[5] = "main"
113
+ result = subprocess.run(clone_cmd, capture_output=True, text=True)
114
+
115
+ if result.returncode != 0:
116
+ default_branch = self._resolve_default_branch(info["repo_url"])
117
+ if default_branch and default_branch != info["branch"]:
118
+ info["branch"] = default_branch
119
+ clone_cmd[5] = default_branch
120
+ result = subprocess.run(clone_cmd, capture_output=True, text=True)
121
+
122
+ if result.returncode != 0:
123
+ raise RuntimeError(result.stderr.strip() or "Failed to clone repository")
124
+
125
+ return {
126
+ **info,
127
+ "local_path": str(target_dir),
128
+ }
129
+
130
+ def _resolve_default_branch(self, github_url: str) -> str | None:
131
+ result = subprocess.run(
132
+ ["git", "ls-remote", "--symref", github_url, "HEAD"],
133
+ capture_output=True,
134
+ text=True,
135
+ )
136
+ if result.returncode != 0:
137
+ return None
138
+
139
+ for line in result.stdout.splitlines():
140
+ if line.startswith("ref: ") and "\tHEAD" in line:
141
+ ref = line.split("\t", 1)[0].removeprefix("ref: ").strip()
142
+ if ref.startswith("refs/heads/"):
143
+ return ref.removeprefix("refs/heads/")
144
+ return None
145
+
146
+ def cleanup_repository(self, repo_path: str):
147
+ target = Path(repo_path)
148
+ if target.exists():
149
+ shutil.rmtree(target)
150
+
151
+ def iter_source_files(self, repo_path: str):
152
+ root = Path(repo_path)
153
+ for file_path in root.rglob("*"):
154
+ if not file_path.is_file():
155
+ continue
156
+ if any(part in IGNORED_DIRS for part in file_path.parts):
157
+ continue
158
+ if file_path.name in IGNORED_FILENAMES:
159
+ continue
160
+ if file_path.suffix.lower() not in SUPPORTED_EXTENSIONS:
161
+ continue
162
+ if file_path.stat().st_size > MAX_FILE_SIZE_BYTES:
163
+ continue
164
+ yield file_path
src/vector_store.py ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import List, Optional, Tuple
3
+ from uuid import uuid4
4
+
5
+ import numpy as np
6
+ from qdrant_client import QdrantClient, models
7
+
8
+
9
class QdrantVectorStore:
    """Vector-store adapter backed by Qdrant.

    Connects to a remote server when QDRANT_URL is set, otherwise falls back
    to an in-memory client.
    """

    def __init__(self, embedding_dim: int, index_path: str = None, persist: bool = False):
        """Initialize the client and make sure the collection exists.

        index_path and persist are accepted for adapter-interface parity only;
        Qdrant manages its own persistence.
        """
        self.embedding_dim = embedding_dim
        self.collection_name = os.getenv("QDRANT_COLLECTION", "repo_qa_chunks")
        self.upsert_batch_size = max(1, int(os.getenv("QDRANT_UPSERT_BATCH_SIZE", "64")))
        self.client = self._create_client()
        self._ensure_collection()

    def _create_client(self):
        """Build a remote QdrantClient when QDRANT_URL is set, else in-memory."""
        remote_url = os.getenv("QDRANT_URL")
        api_key = os.getenv("QDRANT_API_KEY")
        timeout_seconds = int(os.getenv("QDRANT_TIMEOUT_SECONDS", "120"))
        if not remote_url:
            return QdrantClient(":memory:")
        return QdrantClient(
            url=remote_url,
            api_key=api_key,
            timeout=timeout_seconds,
            check_compatibility=False,
        )

    def _ensure_collection(self):
        """Create the cosine-distance collection on first use, then the payload indexes."""
        if not self.client.collection_exists(self.collection_name):
            vector_params = models.VectorParams(
                size=self.embedding_dim,
                distance=models.Distance.COSINE,
            )
            self.client.create_collection(
                collection_name=self.collection_name,
                vectors_config=vector_params,
            )
        self._ensure_payload_indexes()

    def _ensure_payload_indexes(self):
        """Index repository_id so per-repo filtering and deletion stay fast."""
        self.client.create_payload_index(
            collection_name=self.collection_name,
            field_name="repository_id",
            field_schema=models.PayloadSchemaType.INTEGER,
            wait=True,
        )

    def add_embeddings(self, embeddings: np.ndarray, metadata: List[dict]) -> List[int]:
        """Upsert one point per metadata row; return the generated point ids.

        Each payload is the metadata dict plus an "id" key mirroring the
        point id. Upserts are sent in batches of upsert_batch_size.
        """
        if embeddings.size == 0:
            return []

        matrix = embeddings.astype("float32")
        if matrix.ndim == 1:
            matrix = matrix.reshape(1, -1)

        generated_ids = [uuid4().hex for _ in metadata]
        points = [
            models.PointStruct(
                id=point_id,
                vector=vector.tolist(),
                payload={**meta, "id": point_id},
            )
            for point_id, meta, vector in zip(generated_ids, metadata, matrix)
        ]

        total_points = len(points)
        batch_size = self.upsert_batch_size
        total_batches = (total_points + batch_size - 1) // batch_size
        for batch_number in range(1, total_batches + 1):
            start = (batch_number - 1) * batch_size
            batch = points[start : start + batch_size]
            print(
                f"[qdrant] Upserting batch {batch_number}/{total_batches} "
                f"points={len(batch)} progress={start}/{total_points}",
                flush=True,
            )
            self.client.upsert(
                collection_name=self.collection_name,
                wait=True,
                points=batch,
            )

        return generated_ids

    def search(
        self,
        query_embedding: np.ndarray,
        k: int = 10,
        repo_filter: Optional[int] = None,
    ) -> List[Tuple[float, dict]]:
        """Return up to k (score, payload) pairs for the query embedding.

        When repo_filter is given, results are restricted to points whose
        payload repository_id matches it.
        """
        vector = query_embedding
        if vector.ndim == 1:
            vector = vector.reshape(1, -1)
        vector = vector.astype("float32")

        conditions = []
        if repo_filter is not None:
            conditions.append(
                models.FieldCondition(
                    key="repository_id",
                    match=models.MatchValue(value=repo_filter),
                )
            )
        query_filter = models.Filter(must=conditions) if conditions else None

        hits = self.client.search(
            collection_name=self.collection_name,
            query_vector=vector[0].tolist(),
            query_filter=query_filter,
            limit=k,
        )

        return [(float(hit.score), dict(hit.payload or {})) for hit in hits]

    def remove_repository(self, repo_id: int):
        """Delete every point whose payload repository_id equals repo_id."""
        repo_condition = models.FieldCondition(
            key="repository_id",
            match=models.MatchValue(value=repo_id),
        )
        self.client.delete(
            collection_name=self.collection_name,
            wait=True,
            points_selector=models.FilterSelector(
                filter=models.Filter(must=[repo_condition])
            ),
        )

    def clear(self):
        """Drop and recreate the collection, leaving it empty."""
        if self.client.collection_exists(self.collection_name):
            self.client.delete_collection(self.collection_name)
        self._ensure_collection()

    def save(self):
        """No-op: Qdrant persists server-side (kept for adapter parity)."""
        return None

    def load(self):
        """Ensure the collection exists (kept for adapter parity)."""
        self._ensure_collection()

    def get_stats(self) -> dict:
        """Return vector count, embedding dimension, and collection name."""
        info = self.client.get_collection(self.collection_name)
        return {
            "total_vectors": info.points_count or 0,
            "embedding_dim": self.embedding_dim,
            "collection_name": self.collection_name,
        }