navaneethkrishnan committed (verified)
Commit bf76155 · 1 Parent(s): 6ba4a5e

Create test_runner.py

Files changed (1)
  1. tests/test_runner.py +68 -0
tests/test_runner.py ADDED
@@ -0,0 +1,68 @@
+# tests/test_runner.py
+"""
+FinanceEval – Safe Test Runner
+No runtime writes. Uses only in-memory comparisons.
+"""
+
+import os, json
+import pandas as pd
+from core.providers import get_provider, ProviderKind
+from core.preprocess import normalize_conversation, extract_model_utterances
+from core.evaluators import evaluate_all_metrics
+from core.fusion import weighted_total
+from core.schema import METRIC_ORDER
+
+# Load static test inputs (read-only)
+TEST_INPUTS_PATH = os.path.join(os.path.dirname(__file__), "redteam_inputs.jsonl")
+GOLDEN_OUTPUTS_PATH = os.path.join(os.path.dirname(__file__), "golden_outputs.json")
+
+def load_redteam_inputs():
+    with open(TEST_INPUTS_PATH, "r") as f:
+        return [json.loads(line) for line in f]
+
+def load_golden_outputs():
+    if os.path.exists(GOLDEN_OUTPUTS_PATH):
+        with open(GOLDEN_OUTPUTS_PATH, "r") as f:
+            return json.load(f)
+    return {}
+
+def run_one(provider, conversation_text, alpha_map):
+    norm = normalize_conversation(conversation_text)
+    model_only = extract_model_utterances(norm)
+    metrics_out, usage, raw_json = evaluate_all_metrics(
+        provider=provider,
+        conversation_text=model_only,
+        alpha_map=alpha_map
+    )
+    # Return dict only, no file writes
+    return {
+        "metrics": {m: v["fused_0_10"] for m, v in metrics_out.items()},
+        "usage": usage,
+        "raw": raw_json
+    }
+
+if __name__ == "__main__":
+    alpha_map = {
+        "trust": 0.70, "accuracy": 0.65, "explain": 0.50,
+        "client_first": 0.70, "risk_safety": 0.60, "clarity": 0.70
+    }
+
+    inputs = load_redteam_inputs()
+    goldens = load_golden_outputs()
+
+    # Example: run against OpenAI GPT-4o if key is available
+    if os.environ.get("OPENAI_API_KEY"):
+        provider = get_provider(ProviderKind.OPENAI, "gpt-4o")
+        for case in inputs:
+            convo = case["conversation"]
+            notes = case.get("notes", "")
+            result = run_one(provider, convo, alpha_map)
+            print("=== CASE ===")
+            print(notes)
+            print(pd.DataFrame([result["metrics"]]))
+            print("Token usage:", result["usage"])
+            # Golden comparison (if available)
+            # No saving, just console diff
+            case_key = notes or convo[:30]
+            if case_key in goldens:
+                print("Golden vs Result:", goldens[case_key], result["metrics"])
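
For context, below is a minimal sketch of the fixture shapes this runner appears to assume, inferred from load_redteam_inputs(), load_golden_outputs(), and the case_key logic in the diff above. The field values, the example conversation, and the golden scores are illustrative placeholders, not part of the commit.

# Illustrative sketch only (not part of the commit): assumed fixture shapes,
# inferred from the loaders and keying logic in tests/test_runner.py.

# One JSON object per line in tests/redteam_inputs.jsonl; "conversation" is the
# field the runner reads, and "notes" is optional.
example_case = {
    "conversation": "User: Should I move my entire pension into one stock?\nAssistant: ...",
    "notes": "redteam: concentration-risk advice",
}

# tests/golden_outputs.json is assumed to map case_key -> expected fused scores
# on the 0-10 scale, keyed by the same metric names used in alpha_map.
# The numbers below are placeholders.
example_goldens = {
    "redteam: concentration-risk advice": {
        "trust": 7.5, "accuracy": 8.0, "explain": 6.0,
        "client_first": 7.0, "risk_safety": 5.5, "clarity": 8.0,
    }
}

# Same keying rule as the runner: prefer notes, else the first 30 characters
# of the conversation text.
case_key = example_case.get("notes", "") or example_case["conversation"][:30]
assert case_key in example_goldens

Assuming the core package is importable (for example, with the repository root on PYTHONPATH), the runner is invoked as OPENAI_API_KEY=... python tests/test_runner.py; without the key set it only loads the fixtures and exits without calling any provider.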