technophyle committed · verified
Commit 24f9940 · 1 Parent(s): e84b903

Sync from GitHub via hub-sync

README.md CHANGED
@@ -16,6 +16,6 @@ Behavior:
 - Clones a public GitHub repo
 - Chunks it with tree-sitter
 - Builds retrieval state with a Qdrant adapter
-- Answers questions with Groq-hosted Llama, AWS Bedrock, or Vertex AI Gemini depending on environment configuration
+- Answers questions with Groq-hosted Llama or Amazon Bedrock Claude depending on environment configuration
 - Deletes the cloned repo after indexing
 - Keeps only lightweight repo metadata in SQLite
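The provider switch in that bullet is resolved entirely from environment variables. A minimal sketch of how it falls out, using only variable names and defaults that appear in this commit (see `get_app_model_config` in `evals/run_eval.py` below); the example value set here is an assumption:

```python
import os

# Assumed example setting; "groq" selects the Groq-hosted Llama path instead.
os.environ.setdefault("LLM_PROVIDER", "bedrock")

if os.getenv("LLM_PROVIDER", "bedrock").lower() == "groq":
    model = os.getenv("GROQ_MODEL", "llama-3.3-70b-versatile")  # Groq-hosted Llama
else:
    # Amazon Bedrock Claude, the new default in this commit
    model = os.getenv("BEDROCK_LLM_MODEL", "anthropic.claude-sonnet-4-20250514-v1:0")
print(model)
```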
evals/run_eval.py CHANGED
@@ -3,6 +3,7 @@ import os
 import sys
 import asyncio
 import re
+import time
 from pathlib import Path
 from collections import Counter, defaultdict
 from statistics import mean
@@ -16,6 +17,7 @@ if str(SERVER_ROOT) not in sys.path:
 
 load_dotenv(SERVER_ROOT / ".env")
 
+from src.bedrock_claude import create_bedrock_runtime_client, generate_bedrock_claude_text
 from src.embeddings import EmbeddingGenerator
 
 
@@ -24,6 +26,8 @@ REPO_ID = int(os.getenv("CODEBASE_RAG_REPO_ID", "1"))
 SESSION_ID = os.getenv("CODEBASE_RAG_SESSION_ID", "eval-session")
 TOP_K = int(os.getenv("CODEBASE_RAG_TOP_K", "8"))
 QUERY_TIMEOUT_SECONDS = int(os.getenv("CODEBASE_RAG_QUERY_TIMEOUT_SECONDS", "180"))
+QUERY_MAX_RETRIES = int(os.getenv("CODEBASE_RAG_QUERY_MAX_RETRIES", "5"))
+QUERY_RETRY_BASE_SECONDS = float(os.getenv("CODEBASE_RAG_QUERY_RETRY_BASE_SECONDS", "2"))
 ENABLE_RAGAS = os.getenv("CODEBASE_RAG_ENABLE_RAGAS", "1").lower() not in {"0", "false", "no"}
 RAGAS_ASYNC = os.getenv("CODEBASE_RAG_RAGAS_ASYNC", "0").lower() in {"1", "true", "yes"}
 RAGAS_RAISE_EXCEPTIONS = os.getenv("CODEBASE_RAG_RAGAS_RAISE_EXCEPTIONS", "0").lower() in {
@@ -31,6 +35,8 @@ RAGAS_RAISE_EXCEPTIONS = os.getenv("CODEBASE_RAG_RAGAS_RAISE_EXCEPTIONS", "0").l
     "true",
     "yes",
 }
+MIN_REFERENCE_OVERLAP = float(os.getenv("CODEBASE_RAG_MIN_REFERENCE_OVERLAP", "0.2"))
+MIN_REFERENCE_TERM_MATCHES = int(os.getenv("CODEBASE_RAG_MIN_REFERENCE_TERM_MATCHES", "2"))
 EVAL_SET_PATH = Path(
     os.getenv(
         "CODEBASE_RAG_EVAL_SET",
@@ -43,6 +49,47 @@ def log(message: str):
     print(f"[eval] {message}", file=sys.stderr, flush=True)
 
 
+def get_app_model_config():
+    llm_provider = os.getenv("LLM_PROVIDER", "bedrock").lower()
+    if llm_provider == "groq":
+        llm_model = os.getenv("GROQ_MODEL", "llama-3.3-70b-versatile")
+    elif llm_provider == "bedrock":
+        llm_model = os.getenv(
+            "BEDROCK_LLM_MODEL",
+            "anthropic.claude-sonnet-4-20250514-v1:0",
+        )
+    elif llm_provider == "vertex_ai":
+        llm_model = os.getenv("VERTEX_LLM_MODEL", "claude-sonnet-4@20250514")
+    else:
+        llm_model = "unknown"
+
+    embedding_provider = os.getenv("EMBEDDING_PROVIDER", "auto").lower()
+    if embedding_provider == "bedrock":
+        embedding_model = os.getenv("BEDROCK_EMBEDDING_MODEL", "cohere.embed-v4:0")
+    elif embedding_provider == "vertex_ai":
+        embedding_model = os.getenv("VERTEX_EMBEDDING_MODEL", "gemini-embedding-001")
+    elif embedding_provider == "openai":
+        embedding_model = os.getenv("EMBEDDING_MODEL", "text-embedding-3-small")
+    elif embedding_provider == "local":
+        embedding_model = os.getenv("EMBEDDING_MODEL") or os.getenv(
+            "LOCAL_EMBEDDING_MODEL", "nomic-ai/CodeRankEmbed"
+        )
+    else:
+        embedding_model = os.getenv("EMBEDDING_MODEL") or "auto"
+
+    eval_model = os.getenv(
+        "EVAL_MODEL",
+        os.getenv("BEDROCK_EVAL_MODEL", "anthropic.claude-opus-4-20250514-v1:0"),
+    )
+    return {
+        "llm_provider": llm_provider,
+        "llm_model": llm_model,
+        "embedding_provider": embedding_provider,
+        "embedding_model": embedding_model,
+        "eval_model": eval_model,
+    }
+
+
 def load_eval_rows():
     return json.loads(EVAL_SET_PATH.read_text())
 
@@ -54,24 +101,51 @@ def post_query(row):
         "top_k": TOP_K,
         "history": row.get("turns", []),
     }
-    response = requests.post(
-        f"{API_URL}/api/query",
-        json=payload,
-        headers={"X-Session-Id": SESSION_ID},
-        timeout=QUERY_TIMEOUT_SECONDS,
-    )
-    if not response.ok:
+    case_id = row.get("id", row["question"])
+
+    for attempt in range(1, QUERY_MAX_RETRIES + 1):
+        response = requests.post(
+            f"{API_URL}/api/query",
+            json=payload,
+            headers={"X-Session-Id": SESSION_ID},
+            timeout=QUERY_TIMEOUT_SECONDS,
+        )
+        if response.ok:
+            return response.json()
+
         detail = response.text
         try:
            parsed = response.json()
            detail = parsed.get("detail") or parsed
        except Exception:
            pass
+
+        detail_text = str(detail)
+        is_retryable = response.status_code in {429, 500, 502, 503, 504} and any(
+            marker in detail_text
+            for marker in [
+                "ThrottlingException",
+                "Too many requests",
+                "timed out",
+                "timeout",
+                "ServiceUnavailable",
+            ]
+        )
+        if is_retryable and attempt < QUERY_MAX_RETRIES:
+            wait_seconds = QUERY_RETRY_BASE_SECONDS * (2 ** (attempt - 1))
+            log(
+                f"Retrying case {case_id} after transient query failure "
+                f"(attempt {attempt}/{QUERY_MAX_RETRIES}, wait={wait_seconds:.1f}s): {detail_text}"
+            )
+            time.sleep(wait_seconds)
+            continue
+
         raise RuntimeError(
-            f"Query failed for eval case {row.get('id', row['question'])!r} "
+            f"Query failed for eval case {case_id!r} "
             f"with status {response.status_code}: {detail}"
         )
-    return response.json()
+
+    raise RuntimeError(f"Query failed for eval case {case_id!r}: exhausted retries")
 
 
 def normalize_path(path: str) -> str:
@@ -115,6 +189,18 @@ def tokenize_text(text):
     return re.findall(r"[a-z0-9_./+-]+", (text or "").lower())
 
 
+def normalize_keywords(keywords):
+    normalized = []
+    seen = set()
+    for keyword in keywords or []:
+        phrase = " ".join(tokenize_text(str(keyword)))
+        if not phrase or phrase in seen:
+            continue
+        seen.add(phrase)
+        normalized.append(phrase)
+    return normalized
+
+
 def compute_retrieval_metrics(expected_sources, actual_sources):
     expected = {normalize_path(path) for path in expected_sources}
     actual = [normalize_path(path) for path in actual_sources]
@@ -165,24 +251,52 @@ def compute_retrieval_metrics(expected_sources, actual_sources):
     }
 
 
-def keyword_match_ratio(row, answer: str):
-    keywords = [keyword.lower() for keyword in row.get("must_include_any", []) if keyword.strip()]
+def keyword_match_details(row, answer: str):
+    keywords = normalize_keywords(row.get("must_include_any", []))
     if not keywords:
         return None
-    lowered = answer.lower()
-    matched = sum(1 for keyword in keywords if keyword in lowered)
-    return matched / len(keywords)
+
+    answer_tokens = tokenize_text(answer)
+    if not answer_tokens:
+        return {
+            "coverage": 0.0,
+            "matched_count": 0,
+            "total_keywords": len(keywords),
+            "matched_keywords": [],
+            "missing_keywords": keywords,
+        }
+
+    matched_keywords = []
+    for keyword in keywords:
+        keyword_tokens = keyword.split()
+        window = len(keyword_tokens)
+        if window == 1:
+            if keyword_tokens[0] in answer_tokens:
+                matched_keywords.append(keyword)
+            continue
+
+        for index in range(0, len(answer_tokens) - window + 1):
+            if answer_tokens[index : index + window] == keyword_tokens:
+                matched_keywords.append(keyword)
+                break
+
+    matched_set = set(matched_keywords)
+    missing_keywords = [keyword for keyword in keywords if keyword not in matched_set]
+    matched_count = len(matched_set)
+    return {
+        "coverage": matched_count / len(keywords),
+        "matched_count": matched_count,
+        "total_keywords": len(keywords),
+        "matched_keywords": sorted(matched_set),
+        "missing_keywords": missing_keywords,
+    }
 
 
-def keyword_pass(row, answer: str, coverage: float | None):
-    if coverage is None:
+def keyword_pass(row, keyword_details):
+    if keyword_details is None:
         return None
     minimum = int(row.get("min_keyword_matches", 1))
-    keywords = [keyword for keyword in row.get("must_include_any", []) if str(keyword).strip()]
-    if not keywords:
-        return None
-    matched = round(coverage * len(keywords))
-    return 1 if matched >= minimum else 0
+    return 1 if keyword_details["matched_count"] >= minimum else 0
 
 
 def answer_length_metrics(answer: str):
@@ -193,7 +307,7 @@ def answer_length_metrics(answer: str):
     }
 
 
-def lexical_overlap_ratio(reference: str, candidate: str):
+def reference_support_details(reference: str, candidate: str):
     reference_terms = {
         token for token in tokenize_text(reference)
         if len(token) > 2 and token not in STOPWORDS
@@ -201,8 +315,23 @@ def lexical_overlap_ratio(reference: str, candidate: str):
     if not reference_terms:
         return None
     candidate_terms = set(tokenize_text(candidate))
-    matched = sum(1 for token in reference_terms if token in candidate_terms)
-    return matched / len(reference_terms)
+    matched_terms = sorted(token for token in reference_terms if token in candidate_terms)
+    matched_count = len(matched_terms)
+    return {
+        "ratio": matched_count / len(reference_terms),
+        "matched_count": matched_count,
+        "reference_term_count": len(reference_terms),
+        "matched_terms": matched_terms,
+    }
+
+
+def reference_support_pass(reference_details):
+    if reference_details is None:
+        return None
+    return 1 if (
+        reference_details["ratio"] >= MIN_REFERENCE_OVERLAP
+        and reference_details["matched_count"] >= MIN_REFERENCE_TERM_MATCHES
+    ) else 0
 
 
 def validate_eval_rows(rows):
@@ -236,6 +365,13 @@ def validate_eval_rows(rows):
             errors.append(f"{row_id}: expected_sources must be a non-empty list")
         if must_include_any and not isinstance(must_include_any, list):
             errors.append(f"{row_id}: must_include_any must be a list when present")
+        if isinstance(must_include_any, list):
+            normalized_keywords = normalize_keywords(must_include_any)
+            if len(normalized_keywords) != len([keyword for keyword in must_include_any if str(keyword).strip()]):
+                warnings.append(
+                    f"{row_id}: duplicate or case-variant keywords were normalized; "
+                    "resume metrics are stricter than the raw checklist wording."
+                )
         if row.get("turns"):
             conversation_cases += 1
         expected_source_counts.append(len(expected_sources) if isinstance(expected_sources, list) else 0)
@@ -279,12 +415,16 @@ def validate_eval_rows(rows):
 def summarize_custom_metrics(details):
     keyword_coverages = [item["keyword_coverage"] for item in details if item["keyword_coverage"] is not None]
     keyword_passes = [item["keyword_pass"] for item in details if item["keyword_pass"] is not None]
+    reference_support_passes = [
+        item["reference_support_pass"] for item in details if item["reference_support_pass"] is not None
+    ]
     grounded_answer_passes = [
        1
        for item in details
        if item["retrieval_hit"] == 1
        and item["has_substantive_answer"] == 1
        and (item["keyword_pass"] in {None, 1})
+        and (item["reference_support_pass"] in {None, 1})
    ]
    exact_source_recall_cases = [1 for item in details if item["source_recall"] == 1.0]
    return {
@@ -296,6 +436,7 @@ def summarize_custom_metrics(details):
         "duplicate_source_rate": round(mean(item["duplicate_source_rate"] for item in details), 4),
         "keyword_coverage": round(mean(keyword_coverages), 4) if keyword_coverages else None,
         "keyword_pass_rate": round(mean(keyword_passes), 4) if keyword_passes else None,
+        "reference_support_rate": round(mean(reference_support_passes), 4) if reference_support_passes else None,
         "ground_truth_lexical_overlap": round(
             mean(item["ground_truth_lexical_overlap"] for item in details if item["ground_truth_lexical_overlap"] is not None),
             4,
@@ -323,10 +464,23 @@ def summarize_by_category(details):
         "source_recall": round(mean(item["source_recall"] for item in items), 4),
         "mrr": round(mean(item["mrr"] for item in items), 4),
         "keyword_pass_rate": round(mean(keyword_passes), 4) if keyword_passes else None,
+        "reference_support_rate": round(
+            mean(
+                item["reference_support_pass"]
+                for item in items
+                if item["reference_support_pass"] is not None
+            ),
+            4,
+        )
+        if any(item["reference_support_pass"] is not None for item in items)
+        else None,
         "grounded_answer_rate": round(
             mean(
                 1
-                if item["retrieval_hit"] == 1 and item["has_substantive_answer"] == 1 and item["keyword_pass"] in {None, 1}
+                if item["retrieval_hit"] == 1
+                and item["has_substantive_answer"] == 1
+                and item["keyword_pass"] in {None, 1}
+                and item["reference_support_pass"] in {None, 1}
                 else 0
                 for item in items
             ),
@@ -346,6 +500,7 @@ def build_headline_metrics(custom_metrics, audit):
         "source_recall": custom_metrics["source_recall"],
         "grounded_answer_rate": custom_metrics["grounded_answer_rate"],
         "keyword_pass_rate": custom_metrics["keyword_pass_rate"],
+        "reference_support_rate": custom_metrics["reference_support_rate"],
     }
 
 
@@ -361,9 +516,14 @@ def build_resume_summary(custom_metrics, audit, ragas_report, ragas_error):
             f"source recall {custom_metrics['source_recall']:.1%}."
         ),
         (
-            f"Answer quality checks: grounded answer rate {custom_metrics['grounded_answer_rate']:.1%}"
+            f"Strict answer quality checks: grounded answer rate {custom_metrics['grounded_answer_rate']:.1%}"
             + (
-                f", keyword/checklist pass rate {custom_metrics['keyword_pass_rate']:.1%}."
+                f", keyword/checklist pass rate {custom_metrics['keyword_pass_rate']:.1%}"
+                + (
+                    f", reference-support pass rate {custom_metrics['reference_support_rate']:.1%}."
+                    if custom_metrics["reference_support_rate"] is not None
+                    else "."
+                )
                 if custom_metrics["keyword_pass_rate"] is not None
                 else "."
             )
@@ -424,13 +584,12 @@ def maybe_write_report(report):
 
 
 def build_bedrock_ragas_llm(run_config):
-    import boto3
     from langchain_core.outputs import Generation, LLMResult
     from ragas.llms.base import BaseRagasLLM
 
     class BedrockRagasLLM(BaseRagasLLM):
-        def __init__(self, model: str, region: str, run_config):
-            self.client = boto3.client("bedrock-runtime", region_name=region)
+        def __init__(self, model: str, run_config):
+            self.client = create_bedrock_runtime_client()
             self.model = model
             self.set_run_config(run_config)
 
@@ -445,35 +604,19 @@ def build_bedrock_ragas_llm(run_config):
 
         def _generate_once(self, prompt, n=1, temperature=1e-8, stop=None, callbacks=None):
             prompt_text = self._prompt_to_text(prompt)
-            inference_config = {
-                "temperature": 0.0,
-                "maxTokens": int(os.getenv("EVAL_MAX_OUTPUT_TOKENS", "2048")),
-            }
-            if stop:
-                inference_config["stopSequences"] = stop
-
-            response = self.client.converse(
-                modelId=self.model,
-                messages=[
-                    {
-                        "role": "user",
-                        "content": [{"text": prompt_text}],
-                    }
-                ],
-                inferenceConfig=inference_config,
+            text, _ = generate_bedrock_claude_text(
+                self.client,
+                self.model,
+                "Return only valid JSON or the exact structured output requested.",
+                prompt_text,
+                max_tokens=int(os.getenv("EVAL_MAX_OUTPUT_TOKENS", "2048")),
+                temperature=0.0,
             )
 
-            generations = []
-            output_message = (response.get("output") or {}).get("message") or {}
-            content_blocks = output_message.get("content") or []
-            text = "".join(
-                block.get("text", "") for block in content_blocks if isinstance(block, dict)
-            ).strip()
-            if text:
-                generations.append(Generation(text=text))
+            generations = [Generation(text=text)] if text else []
 
             if not generations:
-                raise RuntimeError("AWS Bedrock judge returned an empty response.")
+                raise RuntimeError("Bedrock Claude judge returned an empty response.")
 
             return LLMResult(generations=[generations])
 
@@ -496,12 +639,11 @@ def build_bedrock_ragas_llm(run_config):
             callbacks,
         )
 
-    region = os.getenv("AWS_REGION", os.getenv("AWS_DEFAULT_REGION", "us-east-1"))
     model = os.getenv(
         "EVAL_MODEL",
-        os.getenv("BEDROCK_EVAL_MODEL", "us.anthropic.claude-haiku-4-5-20251001"),
+        os.getenv("BEDROCK_EVAL_MODEL", "anthropic.claude-opus-4-20250514-v1:0"),
     )
-    return BedrockRagasLLM(model=model, region=region, run_config=run_config)
+    return BedrockRagasLLM(model=model, run_config=run_config)
 
 
 def build_ragas_embeddings(run_config):
@@ -568,8 +710,8 @@ def run_ragas(rows, outputs):
         max_wait=int(os.getenv("EVAL_MAX_WAIT_SECONDS", "60")),
     )
     log(
-        "Using AWS Bedrock for RAGAS judge model "
-        f"({os.getenv('EVAL_MODEL', os.getenv('BEDROCK_EVAL_MODEL', 'us.anthropic.claude-haiku-4-5-20251001'))})"
+        "Using Bedrock for RAGAS judge model "
+        f"({os.getenv('EVAL_MODEL', os.getenv('BEDROCK_EVAL_MODEL', 'anthropic.claude-opus-4-20250514-v1:0'))})"
    )
    log(
        f"RAGAS runtime: async={RAGAS_ASYNC}, raise_exceptions={RAGAS_RAISE_EXCEPTIONS}, "
@@ -596,10 +738,19 @@ def run():
     log(f"Loading eval set from {EVAL_SET_PATH}")
     rows = load_eval_rows()
     audit = validate_eval_rows(rows)
+    model_config = get_app_model_config()
     if audit["errors"]:
         raise RuntimeError("Eval set validation failed: " + "; ".join(audit["errors"]))
     for warning in audit["warnings"]:
         log(f"Eval set warning: {warning}")
+    log(
+        "Eval model config: "
+        f"qna_provider={model_config['llm_provider']}, "
+        f"qna_model={model_config['llm_model']}, "
+        f"embedding_provider={model_config['embedding_provider']}, "
+        f"embedding_model={model_config['embedding_model']}, "
+        f"judge_model={model_config['eval_model']}"
+    )
     log(
         f"Starting eval with api_url={API_URL}, repo_id={REPO_ID}, "
         f"session_id={SESSION_ID}, top_k={TOP_K}, cases={len(rows)}"
@@ -619,10 +770,13 @@
 
         cited_paths = [source["file_path"] for source in result.get("sources", [])]
         metrics = compute_retrieval_metrics(row.get("expected_sources", []), cited_paths)
-        keyword_coverage = keyword_match_ratio(row, result.get("answer", ""))
-        keyword_gate = keyword_pass(row, result.get("answer", ""), keyword_coverage)
+        keyword_details = keyword_match_details(row, result.get("answer", ""))
+        keyword_coverage = keyword_details["coverage"] if keyword_details else None
+        keyword_gate = keyword_pass(row, keyword_details)
         length_metrics = answer_length_metrics(result.get("answer", ""))
-        overlap = lexical_overlap_ratio(row.get("ground_truth", ""), result.get("answer", ""))
+        reference_details = reference_support_details(row.get("ground_truth", ""), result.get("answer", ""))
+        overlap = reference_details["ratio"] if reference_details else None
+        reference_gate = reference_support_pass(reference_details)
 
         details.append(
             {
@@ -640,7 +794,15 @@ def run():
                 "duplicate_source_rate": metrics["duplicate_source_rate"],
                 "keyword_coverage": keyword_coverage,
                 "keyword_pass": keyword_gate,
+                "matched_keyword_count": keyword_details["matched_count"] if keyword_details else None,
+                "total_keywords": keyword_details["total_keywords"] if keyword_details else None,
+                "matched_keywords": keyword_details["matched_keywords"] if keyword_details else [],
+                "missing_keywords": keyword_details["missing_keywords"] if keyword_details else [],
                 "ground_truth_lexical_overlap": overlap,
+                "reference_support_pass": reference_gate,
+                "reference_term_match_count": reference_details["matched_count"] if reference_details else None,
+                "reference_term_count": reference_details["reference_term_count"] if reference_details else None,
+                "matched_reference_terms": reference_details["matched_terms"] if reference_details else [],
                 **length_metrics,
             }
         )
@@ -659,8 +821,17 @@ def run():
             "repo_id": REPO_ID,
             "session_id": SESSION_ID,
             "top_k": TOP_K,
+            "qna_provider": model_config["llm_provider"],
+            "qna_model": model_config["llm_model"],
+            "embedding_provider": model_config["embedding_provider"],
+            "embedding_model": model_config["embedding_model"],
+            "eval_model": model_config["eval_model"],
             "query_timeout_seconds": QUERY_TIMEOUT_SECONDS,
+            "query_max_retries": QUERY_MAX_RETRIES,
+            "query_retry_base_seconds": QUERY_RETRY_BASE_SECONDS,
             "eval_set": str(EVAL_SET_PATH),
+            "min_reference_overlap": MIN_REFERENCE_OVERLAP,
+            "min_reference_term_matches": MIN_REFERENCE_TERM_MATCHES,
         },
         "eval_set_audit": audit,
         "headline_metrics": headline_metrics,
evals/sample_eval_set.json CHANGED
@@ -669,5 +669,58 @@
       "FastAPI"
     ],
     "min_keyword_matches": 2
+  },
+  {
+    "id": "sqlmodel-sa-column-conflict-error",
+    "category": "error-handling",
+    "question": "What happens if you pass both sa_column and other Field options like primary_key or index in SQLModel?",
+    "ground_truth": "SQLModel raises a ValueError when sa_column is combined with other field-level options like primary_key, index, or foreign_key because sa_column is meant to be a fully self-contained SQLAlchemy column definition and mixing it with SQLModel field shortcuts creates an ambiguous configuration.",
+    "expected_sources": [
+      "sqlmodel/main.py"
+    ],
+    "must_include_any": [
+      "sa_column",
+      "primary_key",
+      "raise",
+      "conflict"
+    ],
+    "min_keyword_matches": 3
+  },
+  {
+    "id": "sqlmodel-codegen-basic-model",
+    "category": "code-generation",
+    "question": "Write a SQLModel table model for a User with an integer primary key, a required name string, and an optional email string",
+    "ground_truth": "A correct answer defines a class inheriting from SQLModel with table=True, uses Field(primary_key=True) on an integer id, declares name as a required str, and declares email as Optional[str] with a default of None.",
+    "expected_sources": [
+      "sqlmodel/main.py",
+      "README.md"
+    ],
+    "must_include_any": [
+      "SQLModel",
+      "Field",
+      "table=True",
+      "primary_key",
+      "Optional"
+    ],
+    "min_keyword_matches": 3
+  },
+  {
+    "id": "sqlmodel-codegen-session-query",
+    "category": "code-generation",
+    "question": "Write a SQLModel example that creates an engine, opens a session, inserts a User row, and queries all users",
+    "ground_truth": "A correct answer uses create_engine to set up the database, SQLModel.metadata.create_all to create tables, opens a Session using a context manager, adds and commits a User instance, then uses select(User) with session.exec to retrieve all rows.",
+    "expected_sources": [
+      "README.md",
+      "sqlmodel/__init__.py",
+      "sqlmodel/orm/session.py"
+    ],
+    "must_include_any": [
+      "create_engine",
+      "Session",
+      "select",
+      "exec",
+      "commit"
+    ],
+    "min_keyword_matches": 4
   }
 ]
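For the code-generation cases just added, `min_keyword_matches` interacts with the tokenizer in a way worth noting: dotted identifiers stay single tokens, so a bare keyword like `exec` only counts when it appears standalone. A worked example against a hypothetical answer (scoring helpers from `evals/run_eval.py`):

```python
from evals.run_eval import keyword_match_details, keyword_pass

row = {
    "must_include_any": ["create_engine", "Session", "select", "exec", "commit"],
    "min_keyword_matches": 4,
}
answer = (
    "Use create_engine, open a Session, add the row, commit, "
    "then run session.exec(select(User))."
)

details = keyword_match_details(row, answer)
# "session.exec" tokenizes as one token, so the bare keyword "exec" misses;
# the other four keywords hit, which still clears min_keyword_matches=4.
assert details["matched_count"] == 4
assert keyword_pass(row, details) == 1
```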
requirements.txt CHANGED
@@ -6,6 +6,7 @@ python-dotenv==1.0.1
 
 openai==1.109.1
 boto3==1.40.58
+anthropic[vertex]==0.73.0
 google-genai==1.12.1
 httpx==0.28.1
 numpy==1.26.4
src/bedrock_claude.py ADDED
@@ -0,0 +1,51 @@
+import os
+from typing import Optional, Tuple
+
+
+def create_bedrock_runtime_client():
+    try:
+        import boto3
+    except ImportError as exc:
+        raise RuntimeError("Bedrock Claude support requires the `boto3` package.") from exc
+
+    return boto3.client(
+        "bedrock-runtime",
+        region_name=os.getenv("AWS_REGION", os.getenv("AWS_DEFAULT_REGION", "us-east-1")),
+    )
+
+
+def generate_bedrock_claude_text(
+    client,
+    model: str,
+    system_prompt: str,
+    user_prompt: str,
+    *,
+    max_tokens: int,
+    temperature: float,
+    top_p: Optional[float] = None,
+) -> Tuple[str, str]:
+    inference_config = {
+        "maxTokens": max_tokens,
+        "temperature": temperature,
+    }
+    if top_p is not None:
+        inference_config["topP"] = top_p
+
+    response = client.converse(
+        modelId=model,
+        system=[{"text": system_prompt.strip()}],
+        messages=[
+            {
+                "role": "user",
+                "content": [{"text": user_prompt.strip()}],
+            }
+        ],
+        inferenceConfig=inference_config,
+    )
+
+    content_blocks = (((response or {}).get("output") or {}).get("message") or {}).get("content") or []
+    text = "".join(block.get("text", "") for block in content_blocks if block.get("text")).strip()
+    if not text:
+        raise RuntimeError("Bedrock Claude returned an empty response.")
+
+    return text, str((response or {}).get("stopReason", "") or "")
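A sketch of how the rest of this commit consumes the new module (it mirrors the call in `src/rag_system.py`; the prompts below are placeholders, not strings from the repo):

```python
from src.bedrock_claude import create_bedrock_runtime_client, generate_bedrock_claude_text

client = create_bedrock_runtime_client()  # region from AWS_REGION / AWS_DEFAULT_REGION
text, stop_reason = generate_bedrock_claude_text(
    client,
    "anthropic.claude-sonnet-4-20250514-v1:0",          # default from _configure_llm
    "You answer questions about an indexed codebase.",  # placeholder system prompt
    "What does src/embeddings.py export?",              # placeholder user prompt
    max_tokens=2200,
    temperature=0.1,
)
print(stop_reason, text[:200])
```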
src/embeddings.py CHANGED
@@ -1,11 +1,10 @@
-import json
 import os
 import time
+import json
 from typing import Callable, List, Optional
 
 import numpy as np
 from openai import OpenAI
-from sentence_transformers import SentenceTransformer
 
 
 class EmbeddingGenerator:
@@ -17,6 +16,10 @@ class EmbeddingGenerator:
         self.device = os.getenv("EMBEDDING_DEVICE")
         self.client = None
         self.model = None
+        self.bedrock_client = None
+        self.bedrock_output_dimensionality = self._optional_int(
+            os.getenv("BEDROCK_EMBEDDING_OUTPUT_DIMENSIONALITY")
+        )
         self.vertex_task_type_document = os.getenv(
             "VERTEX_EMBEDDING_TASK_TYPE_DOCUMENT", "RETRIEVAL_DOCUMENT"
         )
@@ -42,6 +45,28 @@ class EmbeddingGenerator:
             )
             self.client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
             self.embedding_dim = int(os.getenv("OPENAI_EMBEDDING_DIM", "1536"))
+        elif self.provider == "bedrock":
+            print(
+                f"[embeddings] Initializing Bedrock embeddings with model={self.model_name}",
+                flush=True,
+            )
+            try:
+                import boto3
+            except ImportError as exc:
+                raise RuntimeError(
+                    "Bedrock embedding support requires the `boto3` package."
+                ) from exc
+
+            self.bedrock_client = boto3.client(
+                "bedrock-runtime",
+                region_name=os.getenv("AWS_REGION", os.getenv("AWS_DEFAULT_REGION", "us-east-1")),
+            )
+            self.embedding_dim = int(
+                os.getenv(
+                    "BEDROCK_EMBEDDING_DIM",
+                    str(self.bedrock_output_dimensionality or 1536),
+                )
+            )
         elif self.provider == "vertex_ai":
             print(
                 f"[embeddings] Initializing Vertex AI embeddings with model={self.model_name}",
@@ -55,7 +80,7 @@ class EmbeddingGenerator:
             ) from exc
 
             project = os.getenv("GOOGLE_CLOUD_PROJECT")
-            location = os.getenv("GOOGLE_CLOUD_LOCATION", "us-central1")
+            location = os.getenv("GOOGLE_CLOUD_LOCATION", "global")
             if not project:
                 raise RuntimeError(
                     "GOOGLE_CLOUD_PROJECT must be set when using Vertex AI embeddings."
@@ -72,22 +97,13 @@ class EmbeddingGenerator:
                     str(self.vertex_output_dimensionality or 3072),
                 )
             )
-        elif self.provider == "bedrock":
-            print(
-                f"[embeddings] Initializing AWS Bedrock embeddings with model={self.model_name}",
-                flush=True,
-            )
+        else:
             try:
-                import boto3
+                from sentence_transformers import SentenceTransformer
             except ImportError as exc:
                 raise RuntimeError(
-                    "AWS Bedrock embedding support requires the `boto3` package."
+                    "Local embedding support requires the `sentence-transformers` package."
                 ) from exc
-
-            region = os.getenv("AWS_REGION", os.getenv("AWS_DEFAULT_REGION", "us-east-1"))
-            self.client = boto3.client("bedrock-runtime", region_name=region)
-            self.embedding_dim = int(os.getenv("BEDROCK_EMBEDDING_DIM", "1024"))
-        else:
             model_device = self.device or "cpu"
             print(
                 f"[embeddings] Loading local embedding model={self.model_name} on device={model_device}",
@@ -109,13 +125,16 @@ class EmbeddingGenerator:
     def embed_text(self, text: str) -> np.ndarray:
         if self.provider == "openai":
             return self.embed_batch([text])[0]
+        if self.provider == "bedrock":
+            return self._embed_with_bedrock(
+                [text],
+                input_type=os.getenv("BEDROCK_EMBEDDING_INPUT_TYPE_QUERY", "search_query"),
+            )[0]
         if self.provider == "vertex_ai":
             return self._embed_with_vertex(
                 [text],
                 task_type=self.vertex_task_type_query,
             )[0]
-        if self.provider == "bedrock":
-            return self._embed_with_bedrock(text)
         query_text = f"{self.query_prefix}: {text}" if self.query_prefix else text
         return self._encode_with_backoff([query_text], prompt_name=self.query_prompt_name)[0]
 
@@ -137,19 +156,18 @@ class EmbeddingGenerator:
             if progress_callback:
                 progress_callback(len(texts), len(texts))
             return np.array(embeddings, dtype="float32")
-        if self.provider == "vertex_ai":
-            return self._embed_batch_with_vertex(
+        if self.provider == "bedrock":
+            return self._embed_batch_with_bedrock(
                 texts=texts,
                 batch_size=batch_size,
                 progress_callback=progress_callback,
             )
-        if self.provider == "bedrock":
-            return self._embed_batch_with_bedrock(
+        if self.provider == "vertex_ai":
+            return self._embed_batch_with_vertex(
                 texts=texts,
                 batch_size=batch_size,
                 progress_callback=progress_callback,
             )
-
         effective_batch_size = max(1, batch_size or self.batch_size)
         all_embeddings = []
         total = len(texts)
@@ -216,36 +234,6 @@ class EmbeddingGenerator:
 
         return np.vstack(all_embeddings).astype("float32")
 
-    def _embed_with_vertex(self, texts: List[str], task_type: str) -> np.ndarray:
-        config = {
-            "task_type": task_type,
-        }
-        if self.vertex_output_dimensionality:
-            config["output_dimensionality"] = self.vertex_output_dimensionality
-
-        response = self.client.models.embed_content(
-            model=self.model_name,
-            contents=texts,
-            config=config,
-        )
-        embeddings = getattr(response, "embeddings", None)
-        if not embeddings:
-            raise RuntimeError("Vertex AI embeddings returned an empty response.")
-
-        values = []
-        for item in embeddings:
-            if hasattr(item, "values"):
-                values.append(item.values)
-            elif isinstance(item, dict):
-                values.append(item.get("values"))
-            else:
-                values.append(getattr(item, "embedding", None))
-
-        if not values or any(vector is None for vector in values):
-            raise RuntimeError("Vertex AI embeddings response could not be parsed.")
-
-        return np.array(values, dtype="float32")
-
     def _embed_batch_with_bedrock(
         self,
         texts: List[str],
@@ -255,6 +243,7 @@ class EmbeddingGenerator:
         effective_batch_size = max(1, batch_size or self.batch_size)
         all_embeddings = []
         total = len(texts)
+        document_input_type = os.getenv("BEDROCK_EMBEDDING_INPUT_TYPE_DOCUMENT", "search_document")
 
         for start in range(0, total, effective_batch_size):
             batch = texts[start : start + effective_batch_size]
@@ -266,8 +255,11 @@ class EmbeddingGenerator:
                 flush=True,
             )
             started_at = time.perf_counter()
-            batch_embeddings = [self._embed_with_bedrock(text) for text in batch]
-            all_embeddings.append(np.vstack(batch_embeddings))
+            batch_embeddings = self._embed_with_bedrock(
+                batch,
+                input_type=document_input_type,
+            )
+            all_embeddings.append(batch_embeddings)
             elapsed = time.perf_counter() - started_at
             print(
                 f"[embeddings] Finished Bedrock batch {batch_number}/{total_batches} "
@@ -279,24 +271,63 @@ class EmbeddingGenerator:
 
         return np.vstack(all_embeddings).astype("float32")
 
-    def _embed_with_bedrock(self, text: str) -> np.ndarray:
-        payload = {"inputText": text, "normalize": True}
-        if self.embedding_dim in {256, 512, 1024}:
-            payload["dimensions"] = self.embedding_dim
+    def _embed_with_vertex(self, texts: List[str], task_type: str) -> np.ndarray:
+        config = {
+            "task_type": task_type,
+        }
+        if self.vertex_output_dimensionality:
+            config["output_dimensionality"] = self.vertex_output_dimensionality
 
-        response = self.client.invoke_model(
+        response = self.client.models.embed_content(
+            model=self.model_name,
+            contents=texts,
+            config=config,
+        )
+        embeddings = getattr(response, "embeddings", None)
+        if not embeddings:
+            raise RuntimeError("Vertex AI embeddings returned an empty response.")
+
+        values = []
+        for item in embeddings:
+            if hasattr(item, "values"):
+                values.append(item.values)
+            elif isinstance(item, dict):
+                values.append(item.get("values"))
+            else:
+                values.append(getattr(item, "embedding", None))
+
+        if not values or any(vector is None for vector in values):
+            raise RuntimeError("Vertex AI embeddings response could not be parsed.")
+
+        return np.array(values, dtype="float32")
+
+    def _embed_with_bedrock(self, texts: List[str], input_type: str) -> np.ndarray:
+        response = self.bedrock_client.invoke_model(
             modelId=self.model_name,
-            body=json.dumps(payload),
-            accept="application/json",
             contentType="application/json",
+            accept="application/json",
+            body=json.dumps(self._build_bedrock_embedding_request(texts, input_type)),
         )
-        body = json.loads(response["body"].read())
-        values = body.get("embedding")
-        if values is None:
-            values = (body.get("embeddingsByType") or {}).get("float")
-        if not values:
-            raise RuntimeError("AWS Bedrock embeddings returned an empty response.")
-        return np.array(values, dtype="float32")
+        payload = json.loads(response["body"].read())
+        embeddings = payload.get("embeddings")
+
+        if isinstance(embeddings, dict):
+            embeddings = embeddings.get("float")
+
+        if not embeddings:
+            raise RuntimeError("Bedrock embeddings returned an empty response.")
+
+        return np.array(embeddings, dtype="float32")
+
+    def _build_bedrock_embedding_request(self, texts: List[str], input_type: str) -> dict:
+        payload = {
+            "texts": texts,
+            "input_type": input_type,
+            "embedding_types": ["float"],
+        }
+        if self.bedrock_output_dimensionality:
+            payload["output_dimension"] = self.bedrock_output_dimensionality
+        return payload
 
     def _encode_with_backoff(
         self,
@@ -349,7 +380,7 @@ class EmbeddingGenerator:
         if explicit_model:
             return explicit_model
         if self.provider == "bedrock":
-            return os.getenv("BEDROCK_EMBEDDING_MODEL", "amazon.titan-embed-text-v2:0")
+            return os.getenv("BEDROCK_EMBEDDING_MODEL", "cohere.embed-v4:0")
         if self.provider == "vertex_ai":
             return os.getenv("VERTEX_EMBEDDING_MODEL", "gemini-embedding-001")
         if self._is_hf_space() or self._is_test_context():
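For reference, the request body the new Cohere path sends through `invoke_model` looks like this (a sketch mirroring `_build_bedrock_embedding_request` above; the document text is made up):

```python
import json

body = json.dumps({
    "texts": ["def add(a, b):\n    return a + b"],
    "input_type": "search_document",  # embed_text uses "search_query" for queries
    "embedding_types": ["float"],
    # "output_dimension" is added only when BEDROCK_EMBEDDING_OUTPUT_DIMENSIONALITY is set
})
```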
src/rag_system.py CHANGED
@@ -6,6 +6,7 @@ from typing import Dict, List, Optional
 from openai import OpenAI
 
 from src.code_parser import CodeParser
+from src.bedrock_claude import create_bedrock_runtime_client, generate_bedrock_claude_text
 from src.database import Repository, get_db_session, init_db, resolve_database_url
 from src.embeddings import EmbeddingGenerator
 from src.hybrid_search import HybridSearchEngine
@@ -526,6 +527,14 @@ Do not leave the answer unfinished.
         }
 
     def _configure_llm(self):
+        if self.llm_provider == "bedrock":
+            self.llm_client = create_bedrock_runtime_client()
+            self.llm_model = os.getenv(
+                "BEDROCK_LLM_MODEL",
+                "anthropic.claude-sonnet-4-20250514-v1:0",
+            )
+            return
+
         if self.llm_provider == "groq":
             self.llm_client = OpenAI(
                 api_key=os.getenv("GROQ_API_KEY"),
@@ -534,48 +543,53 @@ Do not leave the answer unfinished.
             self.llm_model = os.getenv("GROQ_MODEL", "llama-3.3-70b-versatile")
             return
 
-        if self.llm_provider == "bedrock":
-            try:
-                import boto3
-            except ImportError as exc:
-                raise RuntimeError(
-                    "AWS Bedrock LLM support requires the `boto3` package."
-                ) from exc
-
-            region = os.getenv("AWS_REGION", os.getenv("AWS_DEFAULT_REGION", "us-east-1"))
-            self.llm_client = boto3.client("bedrock-runtime", region_name=region)
-            self.llm_model = os.getenv(
-                "BEDROCK_LLM_MODEL", "us.meta.llama3-3-70b-instruct-v1:0"
-            )
-            return
-
         if self.llm_provider == "vertex_ai":
+            project = os.getenv("GOOGLE_CLOUD_PROJECT")
+            location = os.getenv("GOOGLE_CLOUD_LOCATION", "global")
+            if not project:
+                raise RuntimeError(
+                    "GOOGLE_CLOUD_PROJECT must be set when using Vertex AI LLMs."
+                )
+
+            self.llm_model = os.getenv("VERTEX_LLM_MODEL", "claude-sonnet-4@20250514")
+            if self.llm_model.startswith("claude-"):
+                try:
+                    from anthropic import AnthropicVertex
+                except ImportError as exc:
+                    raise RuntimeError(
+                        "Vertex AI Claude support requires the `anthropic[vertex]` package."
+                    ) from exc
+                self.llm_client = AnthropicVertex(project_id=project, region=location)
+                return
+
             try:
                 from google import genai
             except ImportError as exc:
                 raise RuntimeError(
-                    "Vertex AI LLM support requires the `google-genai` package. "
-                    "Install server dependencies before running local or eval queries."
+                    "Vertex AI Gemini support requires the `google-genai` package."
                 ) from exc
 
-            project = os.getenv("GOOGLE_CLOUD_PROJECT")
-            location = os.getenv("GOOGLE_CLOUD_LOCATION", "us-central1")
-            if not project:
-                raise RuntimeError(
-                    "GOOGLE_CLOUD_PROJECT must be set when using Vertex AI Gemini."
-                )
-
             self.llm_client = genai.Client(
                 vertexai=True,
                 project=project,
                 location=location,
             )
-            self.llm_model = os.getenv("VERTEX_LLM_MODEL", "gemini-2.5-pro")
             return
 
         raise RuntimeError(f"Unsupported LLM provider: {self.llm_provider}")
 
     def _generate_markdown_response(self, system_prompt: str, user_prompt: str) -> tuple[str, str]:
+        if self.llm_provider == "bedrock":
+            text, stop_reason = generate_bedrock_claude_text(
+                self.llm_client,
+                self.llm_model,
+                system_prompt,
+                user_prompt,
+                max_tokens=2200,
+                temperature=0.1,
+            )
+            return self._normalize_markdown_answer(text), stop_reason
+
         if self.llm_provider == "groq":
             response = self.llm_client.chat.completions.create(
                 model=self.llm_model,
@@ -590,29 +604,26 @@ Do not leave the answer unfinished.
             finish_reason = getattr(response.choices[0], "finish_reason", "") or ""
             return self._normalize_markdown_answer(content), str(finish_reason)
 
-        if self.llm_provider == "bedrock":
-            response = self.llm_client.converse(
-                modelId=self.llm_model,
-                system=[{"text": system_prompt.strip()}],
+        if self.llm_provider == "vertex_ai" and self.llm_model.startswith("claude-"):
+            message = self.llm_client.messages.create(
+                model=self.llm_model,
+                system=system_prompt.strip(),
+                max_tokens=2200,
+                temperature=0.1,
                 messages=[
                     {
                         "role": "user",
-                        "content": [{"text": user_prompt.strip()}],
+                        "content": user_prompt.strip(),
                     }
                 ],
-                inferenceConfig={
-                    "temperature": 0.1,
-                    "maxTokens": 2200,
-                },
             )
-            output_message = (response.get("output") or {}).get("message") or {}
-            content_blocks = output_message.get("content") or []
+            content_blocks = getattr(message, "content", None) or []
             text = "".join(
-                block.get("text", "") for block in content_blocks if isinstance(block, dict)
+                getattr(block, "text", "") for block in content_blocks if getattr(block, "text", "")
            )
            if not text.strip():
-                raise RuntimeError("AWS Bedrock returned an empty response.")
-            stop_reason = response.get("stopReason", "") or ""
+                raise RuntimeError("Vertex AI Claude returned an empty response.")
+            stop_reason = getattr(message, "stop_reason", "") or ""
            return self._normalize_markdown_answer(text), str(stop_reason)
 
        response = self.llm_client.models.generate_content(
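The new Vertex AI Claude branch above can be exercised in isolation. A minimal sketch, assuming Google Cloud credentials and `anthropic[vertex]` are available; the project id and prompts are placeholders, while the model name and region default come from `_configure_llm`:

```python
from anthropic import AnthropicVertex

client = AnthropicVertex(project_id="my-gcp-project", region="global")
message = client.messages.create(
    model="claude-sonnet-4@20250514",
    system="Answer using only the retrieved code context.",  # placeholder system prompt
    max_tokens=2200,
    temperature=0.1,
    messages=[{"role": "user", "content": "Summarize what this repo indexes."}],
)
# Concatenate the text blocks, as _generate_markdown_response does above.
print("".join(getattr(block, "text", "") for block in message.content))
```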