navaneethkrishnan committed
Commit 2c49fd3 · verified · 1 Parent(s): af3a8fe

Create app.py

Files changed (1):
  1. app.py +210 -0
app.py ADDED
@@ -0,0 +1,210 @@
import os, io, json, time, tempfile, zipfile
import gradio as gr
import pandas as pd
from datetime import datetime, timezone

from core.providers import get_provider, ProviderKind
from core.preprocess import normalize_conversation, extract_model_utterances
from core.evaluators import evaluate_all_metrics
from core.fusion import weighted_total

# -----------------------------
# Defaults
# -----------------------------
DEFAULT_METRIC_WEIGHTS = {
    "trust": 0.20,
    "accuracy": 0.25,
    "explain": 0.15,
    "client_first": 0.15,
    "risk_safety": 0.15,
    "clarity": 0.10,
}

JUDGE_ALPHA = {
    # α = LLM share in fusion per metric (from spec)
    "trust": 0.70,
    "accuracy": 0.65,
    "explain": 0.50,
    "client_first": 0.70,
    "risk_safety": 0.60,
    "clarity": 0.70,
}

# -----------------------------
# Core runner
# -----------------------------

def run_eval(conversation: str,
             use_openai: bool,
             use_anthropic: bool,
             w_trust: float, w_accuracy: float, w_explain: float,
             w_client: float, w_risk: float, w_clarity: float,
             model_openai: str = "gpt-4o",
             model_anthropic: str = "claude-3-5-sonnet-20240620"):
    if not conversation or conversation.strip() == "":
        return None, None, None, None, "Please paste a conversation to evaluate."

    # Normalize metric weights
    user_weights = {
        "trust": w_trust,
        "accuracy": w_accuracy,
        "explain": w_explain,
        "client_first": w_client,
        "risk_safety": w_risk,
        "clarity": w_clarity,
    }
    s = sum(user_weights.values()) or 1.0
    for k in user_weights:
        user_weights[k] = user_weights[k] / s

    # Preprocess conversation
    norm = normalize_conversation(conversation)
    # Try to isolate model utterances (LLM fallback inside if ambiguous)
    model_only = extract_model_utterances(norm, prefer_llm_provider=(model_openai if use_openai else (model_anthropic if use_anthropic else None)))

    providers = []
    if use_openai:
        providers.append(get_provider(ProviderKind.OPENAI, model_openai))
    if use_anthropic:
        providers.append(get_provider(ProviderKind.ANTHROPIC, model_anthropic))

    if not providers:
        return None, None, None, None, "Select at least one model provider."

    all_tables = []
    compare_rows = []
    token_usage_blocks = []
    json_blobs = {}

    for p in providers:
        metrics_out, usage, raw_json = evaluate_all_metrics(provider=p,
                                                            conversation_text=model_only,
                                                            alpha_map=JUDGE_ALPHA)
        # Build table
        rows = []
        for m, payload in metrics_out.items():
            rows.append({
                "Metric": m,
                "LLM Score (1-5)": payload.get("judge_score", None),
                "NLP Subscore (0-1)": round(payload.get("nlp_subscore", 0.0), 3),
                "Fused (0-10)": round(payload.get("fused_0_10", 0.0), 2),
                "Comment": payload.get("comment", "")
            })
        df = pd.DataFrame(rows)
        total = weighted_total({k: v.get("fused_0_10", 0.0) for k, v in metrics_out.items()}, user_weights)
        compare_rows.append({"Model": p.label, **{r["Metric"]: r["Fused (0-10)"] for _, r in df.iterrows()}, "Total (0-10)": round(total, 2)})

        # Token usage
        usage_text = f"{p.label}: prompt_tokens={usage.get('prompt',0)}, completion_tokens={usage.get('completion',0)}, total={usage.get('total',0)}"
        token_usage_blocks.append(usage_text)

        # Persist JSON blob per model
        json_blobs[p.label] = raw_json

        all_tables.append((p.label, df, round(total, 2)))

    # Comparison table
    compare_df = pd.DataFrame(compare_rows)

    # If 2 models, compute an average row
    avg_df = None
    if len(providers) > 1:
        # Average across numeric columns only
        num_cols = [c for c in compare_df.columns if c != "Model"]
        avg_row = {"Model": "Average"}
        for c in num_cols:
            avg_row[c] = round(compare_df[c].mean(), 2)
        avg_df = pd.DataFrame([avg_row])

    # Build downloadable CSV and ZIP
    ts = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
    out_dir = tempfile.mkdtemp(prefix="financeeval_")

    # Write per-model CSVs
    csv_paths = []
    for label, df, total in all_tables:
        pth = os.path.join(out_dir, f"results_{label}_{ts}.csv")
        df2 = df.copy()
        df2.loc[len(df2)] = {"Metric": "TOTAL", "LLM Score (1-5)": "-", "NLP Subscore (0-1)": "-", "Fused (0-10)": total, "Comment": ""}
        df2.to_csv(pth, index=False)
        csv_paths.append(pth)

    # Comparison CSV
    comp_path = os.path.join(out_dir, f"comparison_{ts}.csv")
    compare_df.to_csv(comp_path, index=False)

    # JSON outputs
    json_path = os.path.join(out_dir, f"judgments_{ts}.json")
    with open(json_path, 'w') as f:
        json.dump(json_blobs, f, indent=2)

    # Make ZIP
    zip_path = os.path.join(out_dir, f"financeeval_{ts}.zip")
    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zf:
        for pth in csv_paths + [comp_path, json_path]:
            zf.write(pth, arcname=os.path.basename(pth))

    # Return artifacts
    merged_tables = []
    for label, df, total in all_tables:
        merged_tables.append(pd.DataFrame({"Metric": [f"— {label} —"], "LLM Score (1-5)": [""], "NLP Subscore (0-1)": [""], "Fused (0-10)": [""], "Comment": [""]}))
        merged_tables.append(df)
    merged_df = pd.concat(merged_tables, ignore_index=True)

    usage_text_all = "\n".join(token_usage_blocks)

    return merged_df, compare_df, (avg_df if avg_df is not None else pd.DataFrame()), zip_path, usage_text_all

# -----------------------------
# UI
# -----------------------------
with gr.Blocks(title="FinanceEval – Hybrid Judge (Gradio)") as demo:
    gr.Markdown("""
    # 🔎 FinanceEval – Hybrid Evaluation (Gradio / HF Spaces)
    Paste a finance conversation. Choose one or both judge models (OpenAI GPT‑4o, Claude 3.5 Sonnet). Adjust metric weights. Click **Evaluate**.
    """)

    with gr.Row():
        conversation = gr.Textbox(label="Conversation", lines=16, placeholder="Paste the full transcript here...")

    with gr.Accordion("Model Selection", open=True):
        with gr.Row():
            use_openai = gr.Checkbox(value=True, label="Use OpenAI GPT‑4o")
            use_anthropic = gr.Checkbox(value=False, label="Use Claude 3.5 Sonnet")
        with gr.Row():
            model_openai = gr.Textbox(value="gpt-4o", label="OpenAI model name")
            model_anthropic = gr.Textbox(value="claude-3-5-sonnet-20240620", label="Anthropic model name")
        gr.Markdown("**Secrets**: Set `OPENAI_API_KEY` and / or `ANTHROPIC_API_KEY` in your Space settings.")

    with gr.Accordion("Metric Weights (affect only the TOTAL)", open=True):
        with gr.Row():
            w_trust = gr.Slider(0, 1, value=DEFAULT_METRIC_WEIGHTS["trust"], step=0.01, label="Trust")
            w_accuracy = gr.Slider(0, 1, value=DEFAULT_METRIC_WEIGHTS["accuracy"], step=0.01, label="Accuracy")
            w_explain = gr.Slider(0, 1, value=DEFAULT_METRIC_WEIGHTS["explain"], step=0.01, label="Explainability")
        with gr.Row():
            w_client = gr.Slider(0, 1, value=DEFAULT_METRIC_WEIGHTS["client_first"], step=0.01, label="Client‑First")
            w_risk = gr.Slider(0, 1, value=DEFAULT_METRIC_WEIGHTS["risk_safety"], step=0.01, label="Risk Safety")
            w_clarity = gr.Slider(0, 1, value=DEFAULT_METRIC_WEIGHTS["clarity"], step=0.01, label="Clarity")
        gr.Markdown("Weights are normalized to sum to 1 before computing the TOTAL.")

    run_btn = gr.Button("Evaluate")

    with gr.Tab("Per‑Model Results"):
        table_out = gr.Dataframe(label="Metric Scores & Comments (stacked per model)")
    with gr.Tab("Comparison"):
        compare_out = gr.Dataframe(label="Model Comparison (per metric + TOTAL)")
        avg_out = gr.Dataframe(label="Average (if multiple models)")
    with gr.Tab("Downloads & Usage"):
        zip_file = gr.File(label="Download ZIP (CSVs + JSON)")
        usage_text = gr.Textbox(label="Token Usage", lines=4)

    run_btn.click(
        fn=run_eval,
        inputs=[conversation, use_openai, use_anthropic,
                w_trust, w_accuracy, w_explain, w_client, w_risk, w_clarity,
                model_openai, model_anthropic],
        outputs=[table_out, compare_out, avg_out, zip_file, usage_text]
    )

if __name__ == "__main__":
    demo.launch()
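
Note: the core package imported at the top of app.py (core.providers, core.preprocess, core.evaluators, core.fusion) is not included in this commit, so its interfaces are only pinned down by the call sites above. The sketch below shows the shapes app.py relies on; it is an assumption for illustration, not the actual core/providers.py or core/evaluators.py, and everything in it beyond get_provider, ProviderKind, and the label attribute is hypothetical.

# --- Sketch: provider/evaluator shapes implied by app.py (hypothetical, not in this commit) ---
import os
from dataclasses import dataclass
from enum import Enum


class ProviderKind(Enum):
    OPENAI = "openai"
    ANTHROPIC = "anthropic"


@dataclass
class Provider:
    kind: ProviderKind
    model: str

    @property
    def label(self) -> str:
        # app.py uses p.label in table rows and in the per-model CSV filenames
        return f"{self.kind.value}-{self.model}"

    @property
    def api_key(self) -> str:
        # Keys come from the Space secrets named in the UI text
        env = "OPENAI_API_KEY" if self.kind is ProviderKind.OPENAI else "ANTHROPIC_API_KEY"
        return os.environ.get(env, "")


def get_provider(kind: ProviderKind, model: str) -> Provider:
    return Provider(kind=kind, model=model)


# evaluate_all_metrics(provider=..., conversation_text=..., alpha_map=...) is expected to return
# three values, matching how run_eval unpacks and reads them:
#   metrics_out: {metric: {"judge_score": 1-5, "nlp_subscore": 0-1, "fused_0_10": 0-10, "comment": str}}
#   usage:       {"prompt": int, "completion": int, "total": int}
#   raw_json:    raw per-metric judgments, serialized to judgments_<timestamp>.json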
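
The hybrid scoring itself also lives in core/: JUDGE_ALPHA gives the LLM share α per metric, and weighted_total collapses the fused per-metric scores into the TOTAL using the normalized slider weights. A minimal sketch consistent with those call sites follows; weighted_total mirrors the signature app.py actually calls, while fuse_metric is a hypothetical name and its rescaling of the 1-5 judge score and 0-1 NLP subscore onto 0-10 is an assumed mapping, not taken from the spec.

# --- Sketch: fusion math implied by JUDGE_ALPHA and weighted_total (hypothetical) ---

def fuse_metric(judge_score: float, nlp_subscore: float, alpha: float) -> float:
    """Blend an LLM judge score (1-5) and an NLP subscore (0-1) into a 0-10 value.

    alpha is the LLM share for the metric, e.g. JUDGE_ALPHA["trust"] = 0.70.
    The 1-5 -> 0-10 mapping below is one plausible choice, not confirmed by this commit.
    """
    judge_0_10 = (judge_score - 1.0) / 4.0 * 10.0
    nlp_0_10 = nlp_subscore * 10.0
    return alpha * judge_0_10 + (1.0 - alpha) * nlp_0_10


def weighted_total(fused_by_metric: dict, weights: dict) -> float:
    """Weighted sum of fused 0-10 scores; app.py normalizes the weights to sum to 1 first."""
    return sum(fused_by_metric.get(metric, 0.0) * w for metric, w in weights.items())

Because the weights sum to 1 and each fused score lies on 0-10, the weighted sum also lands on 0-10, which matches the "Total (0-10)" column in the comparison table.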