# tests/test_runner.py
"""
FinanceEval – Safe Test Runner

No runtime writes. Uses only in-memory comparisons.
"""
import json
import os

import pandas as pd

from core.providers import get_provider, ProviderKind
from core.preprocess import normalize_conversation, extract_model_utterances
from core.evaluators import evaluate_all_metrics
from core.fusion import weighted_total
from core.schema import METRIC_ORDER

# Static test inputs (read-only)
TEST_INPUTS_PATH = os.path.join(os.path.dirname(__file__), "redteam_inputs.jsonl")
GOLDEN_OUTPUTS_PATH = os.path.join(os.path.dirname(__file__), "golden_outputs.json")


def load_redteam_inputs():
    """Read one red-team case per JSONL line, skipping blank lines."""
    with open(TEST_INPUTS_PATH, "r") as f:
        return [json.loads(line) for line in f if line.strip()]


def load_golden_outputs():
    """Return the golden-output map, or an empty dict if none is checked in."""
    if os.path.exists(GOLDEN_OUTPUTS_PATH):
        with open(GOLDEN_OUTPUTS_PATH, "r") as f:
            return json.load(f)
    return {}


def run_one(provider, conversation_text, alpha_map):
    """Normalize one conversation, score the model-only utterances, and return the result in memory."""
    norm = normalize_conversation(conversation_text)
    model_only = extract_model_utterances(norm)
    metrics_out, usage, raw_json = evaluate_all_metrics(
        provider=provider,
        conversation_text=model_only,
        alpha_map=alpha_map,
    )
    # Return a dict only; no file writes
    return {
        "metrics": {m: v["fused_0_10"] for m, v in metrics_out.items()},
        "usage": usage,
        "raw": raw_json,
    }


if __name__ == "__main__":
    alpha_map = {
        "trust": 0.70,
        "accuracy": 0.65,
        "explain": 0.50,
        "client_first": 0.70,
        "risk_safety": 0.60,
        "clarity": 0.70,
    }
    inputs = load_redteam_inputs()
    goldens = load_golden_outputs()

    # Example: run against OpenAI GPT-4o if a key is available
    if os.environ.get("OPENAI_API_KEY"):
        provider = get_provider(ProviderKind.OPENAI, "gpt-4o")
        for case in inputs:
            convo = case["conversation"]
            notes = case.get("notes", "")
            result = run_one(provider, convo, alpha_map)
            print("=== CASE ===")
            print(notes)
            print(pd.DataFrame([result["metrics"]]))
            print("Token usage:", result["usage"])

            # Golden comparison (if available): console diff only, no saving
            case_key = notes or convo[:30]
            if case_key in goldens:
                print("Golden vs Result:", goldens[case_key], result["metrics"])
    else:
        print("OPENAI_API_KEY not set; skipping live evaluation.")
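
# ---------------------------------------------------------------------------
# Illustrative data shapes (assumption: the real schemas are defined elsewhere
# in the repo; this sketch only shows the fields the runner above actually
# touches, namely "conversation", the optional "notes", and golden metrics
# keyed by the same case_key the loop computes). The example values below are
# hypothetical.
#
# redteam_inputs.jsonl -- one JSON object per line:
#   {"conversation": "User: ...\nAdvisor: ...",
#    "notes": "case-001: pushes for guaranteed returns"}
#
# golden_outputs.json -- expected fused 0-10 scores keyed by notes (or, when
# notes are empty, by the first 30 characters of the conversation):
#   {"case-001: pushes for guaranteed returns":
#       {"trust": 8.5, "accuracy": 9.0, "explain": 7.5,
#        "client_first": 8.0, "risk_safety": 9.5, "clarity": 8.0}}
# ---------------------------------------------------------------------------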