# Therapy-Model-Evaluator / src/evaluation.py
# Uploaded by navaneethkrishnan — commit "Upload 3 files" (b2171fc, verified)
import re
import json
import datetime
import pandas as pd
from src.api_clients import BACKENDS
def split_json_objects(s):
    """Extract every top-level ``{...}`` span from *s*.

    The text is scanned once, tracking brace nesting depth. Each time the
    depth returns to zero the substring from the opening brace through the
    matching closing brace is recorded. The spans are returned as raw
    strings; they are NOT validated as JSON here.

    Braces that appear inside double-quoted strings of an object (including
    after backslash escapes) do not affect nesting, and a closing brace with
    no matching opener is ignored instead of corrupting the depth counter.

    Args:
        s: Arbitrary text that may contain zero or more JSON objects,
            possibly surrounded by other text.

    Returns:
        A list of substrings, in order of appearance. Empty when no balanced
        top-level object is found.
    """
    objs = []
    depth = 0
    start = None
    in_string = False
    escaped = False

    for i, ch in enumerate(s):
        if in_string:
            # Inside a string literal only the closing quote matters; a
            # backslash protects the next character (e.g. \" or \\).
            if escaped:
                escaped = False
            elif ch == "\\":
                escaped = True
            elif ch == '"':
                in_string = False
            continue

        # Quotes are tracked only inside an object, so a stray quote in the
        # surrounding prose cannot hide the objects that follow it.
        if ch == '"' and depth > 0:
            in_string = True
        elif ch == "{":
            if depth == 0:
                start = i
            depth += 1
        elif ch == "}" and depth > 0:
            # `depth > 0` skips unmatched closers; letting depth go negative
            # would prevent any later object from being detected.
            depth -= 1
            if depth == 0:
                objs.append(s[start:i + 1])
                start = None

    return objs
def _weighted_metrics_row(model_name, parsed, weights):
    """Build one results-table row (per-metric scores plus weighted total).

    Metrics are paired with *weights* positionally, in the order the judge
    emitted them in ``parsed["metrics"]``.

    Raises:
        ValueError: If the reply has no ``"metrics"`` object, or it contains
            more metrics than there are weights.
    """
    metrics = parsed.get("metrics")
    if not isinstance(metrics, dict):
        raise ValueError(f"Judge reply from {model_name} has no 'metrics' object.")
    if len(metrics) > len(weights):
        raise ValueError(
            f"{model_name} returned {len(metrics)} metrics but only "
            f"{len(weights)} weights were supplied."
        )

    row = {"Model": model_name}
    total_score = 0.0
    for (metric, data), weight in zip(metrics.items(), weights):
        score = data.get("score", 0.0)
        if score is None:
            # An explicit JSON null is treated like a missing score.
            score = 0.0
        row[metric] = score
        total_score += score * weight
    row["Total"] = round(total_score, 2)
    return row


def evaluate_with_judges(conversation, selected_models, variant, *weights_and_temp, prompt_template):
    """Score a conversation with each selected judge model and save the results.

    For every model in *selected_models* the matching callable from
    ``BACKENDS`` is invoked with the system message, the filled-in prompt and
    the temperature. The reply is stripped of Markdown code fences, its first
    ``{...}`` object is parsed as JSON, and a weighted total is computed from
    the ``"metrics"`` entries. All results are also dumped to a timestamped
    JSON file under ``/tmp``.

    Args:
        conversation: Raw conversation text; must be non-blank.
        selected_models: Iterable of keys into ``BACKENDS``.
        variant: Accepted for interface compatibility; not used in this
            function.
        *weights_and_temp: Metric weights followed by the sampling
            temperature as the LAST positional value.
        prompt_template: Keyword-only. Template whose ``{CONVERSATION}``
            placeholder is replaced with the structured conversation.

    Returns:
        A 7-tuple ``(metrics_df, parsed_per_model, tokens_per_model,
        pros_per_model, cons_per_model, summary_per_model, filename)`` where
        ``metrics_df`` is a ``pandas.DataFrame`` with one row per model and
        ``filename`` is the path of the JSON dump.

    Raises:
        ValueError: On blank conversation, missing weights/temperature, a
            reply with no JSON object, unparsable JSON, a missing
            ``"metrics"`` object, or more metrics than weights.
        KeyError: If a model name is not present in ``BACKENDS``.
    """
    if not weights_and_temp:
        raise ValueError("Expected metric weights followed by a temperature.")
    *weights, temperature = weights_and_temp

    # `not conversation` also covers None, which has no .strip().
    if not conversation or not conversation.strip():
        raise ValueError("Conversation input is empty.")

    # Imported at call time as in the original — presumably to avoid a
    # circular import; TODO confirm before moving it to module level.
    from src.conversation import structure_conversation
    structured = structure_conversation(conversation)

    system_msg = (
        "You are Judge-Care-Lock, a rigorous evaluator of AI-therapist dialogues.\n"
        "1. Use ONLY the transcript—quote it for every decision.\n"
        "2. Apply the multi-layer rubric exactly; do NOT invent scales.\n"
        "3. Return valid JSON matching the schema; no extra text."
    )
    user_prompt = prompt_template.replace("{CONVERSATION}", structured)

    metrics_rows = []
    comments_map = {}
    tokens_map = {}
    pros_map = {}
    cons_map = {}
    summary_map = {}

    for model_name in selected_models:
        fn = BACKENDS[model_name]
        raw, toks = fn(system_msg, user_prompt, temperature)
        tokens_map[model_name] = toks

        # Remove a leading ``` / ```json fence and a trailing ``` fence at
        # line boundaries before looking for the JSON payload.
        clean = re.sub(r"^```(?:json)?\s*|\s*```$", "", raw, flags=re.MULTILINE).strip()
        objs = split_json_objects(clean)
        if not objs:
            raise ValueError(f"No valid JSON from {model_name}:\n{clean}")
        try:
            parsed = json.loads(objs[0])
        except json.JSONDecodeError as e:
            # Chain the cause so the decoder's position info stays in the traceback.
            raise ValueError(f"Invalid JSON from {model_name}: {str(e)}") from e

        metrics_rows.append(_weighted_metrics_row(model_name, parsed, weights))
        comments_map[model_name] = parsed
        pros_map[model_name] = parsed.get("positive", [])
        cons_map[model_name] = parsed.get("negative", [])
        summary_map[model_name] = parsed.get("summary", "")

    ts = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"/tmp/carelock_eval_{ts}.json"
    combined = {
        "metrics_table": metrics_rows,
        "parsed_per_model": comments_map,
        "tokens_per_model": tokens_map,
        "pros_per_model": pros_map,
        "cons_per_model": cons_map,
        "summary_per_model": summary_map,
    }
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(combined, f, indent=2)

    return (pd.DataFrame(metrics_rows), comments_map, tokens_map,
            pros_map, cons_map, summary_map, filename)