import re
import json
import datetime

import pandas as pd

from src.api_clients import BACKENDS
def split_json_objects(s):
    """Split a string of concatenated top-level JSON objects into a list
    of their source substrings."""
    objs, depth, start = [], 0, None
    for i, ch in enumerate(s):
        if ch == "{":
            if depth == 0:
                start = i  # a new top-level object begins here
            depth += 1
        elif ch == "}":
            depth -= 1
            if depth == 0 and start is not None:
                objs.append(s[start:i + 1])  # object is balanced; slice it out
    return objs
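
# Illustration (added; not in the original source). Because the splitter
# tracks brace depth, concatenated objects that json.loads alone would
# reject come back as separate strings:
#
#   split_json_objects('{"a": 1}{"b": {"c": 2}}')
#   -> ['{"a": 1}', '{"b": {"c": 2}}']
#
# Caveat: braces inside JSON string values are not skipped, so an
# unmatched brace in a string (e.g. '{"note": "}"}') ends the object
# early and yields a truncated slice.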
def evaluate_with_judges(conversation, selected_models, variant,
                         *weights_and_temp, prompt_template):
    """Score a conversation with each selected judge model and aggregate
    the results into a metrics table plus per-model detail maps.

    The variadic ``weights_and_temp`` packs the per-metric weights followed
    by the sampling temperature as its last element. ``variant`` is accepted
    but not used in this function.
    """
    weights, temperature = list(weights_and_temp[:-1]), weights_and_temp[-1]
    if not conversation.strip():
        raise ValueError("Conversation input is empty.")

    from src.conversation import structure_conversation
    structured = structure_conversation(conversation)

    system_msg = (
        "You are Judge-Care-Lock, a rigorous evaluator of AI-therapist dialogues.\n"
        "1. Use ONLY the transcript—quote it for every decision.\n"
        "2. Apply the multi-layer rubric exactly; do NOT invent scales.\n"
        "3. Return valid JSON matching the schema; no extra text."
    )
    user_prompt = prompt_template.replace("{CONVERSATION}", structured)

    metrics_rows = []
    comments_map = {}
    tokens_map = {}
    pros_map = {}
    cons_map = {}
    summary_map = {}

    for model_name in selected_models:
        fn = BACKENDS[model_name]
        raw, toks = fn(system_msg, user_prompt, temperature)
        tokens_map[model_name] = toks

        # Strip any Markdown code fences the model wrapped around its
        # output, then isolate the first balanced JSON object.
        clean = re.sub(r"^```(?:json)?\s*|\s*```$", "", raw, flags=re.MULTILINE).strip()
        objs = split_json_objects(clean)
        if not objs:
            raise ValueError(f"No valid JSON from {model_name}:\n{clean}")
        try:
            parsed = json.loads(objs[0])
        except json.JSONDecodeError as e:
            raise ValueError(f"Invalid JSON from {model_name}: {e}") from e

        # Build one table row per model; weights are applied positionally,
        # so the metric order in the response must match the weight order.
        row = {"Model": model_name}
        total_score = 0.0
        for idx, (m, data) in enumerate(parsed["metrics"].items()):
            score = data.get("score", 0.0)
            row[m] = score
            total_score += score * weights[idx]
        row["Total"] = round(total_score, 2)
        metrics_rows.append(row)

        comments_map[model_name] = parsed
        pros_map[model_name] = parsed.get("positive", [])
        cons_map[model_name] = parsed.get("negative", [])
        summary_map[model_name] = parsed.get("summary", "")

    # Persist the full evaluation bundle to a timestamped file in /tmp.
    ts = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"/tmp/carelock_eval_{ts}.json"
    combined = {
        "metrics_table": metrics_rows,
        "parsed_per_model": comments_map,
        "tokens_per_model": tokens_map,
        "pros_per_model": pros_map,
        "cons_per_model": cons_map,
        "summary_per_model": summary_map,
    }
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(combined, f, indent=2)

    return (pd.DataFrame(metrics_rows), comments_map, tokens_map,
            pros_map, cons_map, summary_map, filename)
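
# Hypothetical usage sketch (added for illustration; not part of the
# original file). The model name, weights, and template below are
# assumptions: real model names must be keys of BACKENDS, and the number
# of weights must match the number of metrics in the judge's response.
if __name__ == "__main__":
    demo_template = "Evaluate this dialogue against the rubric:\n{CONVERSATION}"
    df, parsed, tokens, pros, cons, summaries, path = evaluate_with_judges(
        "Therapist: How are you feeling today?\nClient: Anxious, mostly.",
        ["example-judge-model"],  # hypothetical; must be a BACKENDS key
        "default",                # variant (accepted but unused)
        0.4, 0.3, 0.3,            # positional per-metric weights
        0.0,                      # final positional argument = temperature
        prompt_template=demo_template,
    )
    print(df)
    print("Results written to", path)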