import os, io, json, time, tempfile, zipfile

import gradio as gr
import pandas as pd
from datetime import datetime, timezone

from core.providers import get_provider, ProviderKind
from core.preprocess import normalize_conversation, extract_model_utterances
from core.evaluators import evaluate_all_metrics
from core.fusion import weighted_total
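
# The core.* modules are expected to sit alongside this app.py in the Space
# repository; they are not shown here. The provider wrappers presumably pick up
# OPENAI_API_KEY / ANTHROPIC_API_KEY from the environment (see the "Secrets"
# note in the UI below); that detail is an assumption, not visible in this file.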

# -----------------------------
# Defaults
# -----------------------------
DEFAULT_METRIC_WEIGHTS = {
    "trust": 0.20,
    "accuracy": 0.25,
    "explain": 0.15,
    "client_first": 0.15,
    "risk_safety": 0.15,
    "clarity": 0.10,
}

JUDGE_ALPHA = {
    # α = LLM share in fusion per metric (from spec)
    "trust": 0.70,
    "accuracy": 0.65,
    "explain": 0.50,
    "client_first": 0.70,
    "risk_safety": 0.60,
    "clarity": 0.70,
}
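
# How alpha is actually applied lives in core.evaluators / core.fusion, which
# are not included here. As an illustrative sketch only (an assumption, not the
# actual implementation), a blend consistent with the comment above could be:
#
#     def fuse_metric(judge_score_1_5, nlp_subscore_0_1, alpha):
#         llm_part = (judge_score_1_5 - 1) / 4.0   # rescale 1-5 judge score to 0-1
#         blended = alpha * llm_part + (1 - alpha) * nlp_subscore_0_1
#         return 10.0 * blended                    # stretch to the 0-10 scale
#
# e.g. fuse_metric(4, 0.8, 0.70) -> 10 * (0.7 * 0.75 + 0.3 * 0.8) = 7.65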

# -----------------------------
# Core runner
# -----------------------------
def run_eval(conversation: str,
             use_openai: bool,
             use_anthropic: bool,
             w_trust: float, w_accuracy: float, w_explain: float,
             w_client: float, w_risk: float, w_clarity: float,
             model_openai: str = "gpt-4o",
             model_anthropic: str = "claude-3-5-sonnet-20240620"):
    if not conversation or conversation.strip() == "":
        return None, None, None, None, "Please paste a conversation to evaluate."

    # Normalize metric weights
    user_weights = {
        "trust": w_trust,
        "accuracy": w_accuracy,
        "explain": w_explain,
        "client_first": w_client,
        "risk_safety": w_risk,
        "clarity": w_clarity,
    }
    s = sum(user_weights.values()) or 1.0
    for k in user_weights:
        user_weights[k] = user_weights[k] / s
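    # Example: if all six sliders are set to 0.5, s == 3.0 and each weight
    # becomes 0.5 / 3.0 ≈ 0.167; the defaults already sum to 1.0 and pass
    # through unchanged.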

    # Preprocess conversation
    norm = normalize_conversation(conversation)
    # Try to isolate model utterances (LLM fallback inside if ambiguous)
    model_only = extract_model_utterances(
        norm,
        prefer_llm_provider=(model_openai if use_openai
                             else (model_anthropic if use_anthropic else None)),
    )

    providers = []
    if use_openai:
        providers.append(get_provider(ProviderKind.OPENAI, model_openai))
    if use_anthropic:
        providers.append(get_provider(ProviderKind.ANTHROPIC, model_anthropic))
    if not providers:
        return None, None, None, None, "Select at least one model provider."

    all_tables = []
    compare_rows = []
    token_usage_blocks = []
    json_blobs = {}

    for p in providers:
        metrics_out, usage, raw_json = evaluate_all_metrics(
            provider=p,
            conversation_text=model_only,
            alpha_map=JUDGE_ALPHA,
        )
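        # metrics_out is consumed below as {metric_name: {"judge_score": 1-5,
        # "nlp_subscore": 0-1, "fused_0_10": 0-10, "comment": str, ...}};
        # usage is a token-count dict and raw_json the judge's raw output.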
        # Build table
        rows = []
        for m, payload in metrics_out.items():
            rows.append({
                "Metric": m,
                "LLM Score (1-5)": payload.get("judge_score", None),
                "NLP Subscore (0-1)": round(payload.get("nlp_subscore", 0.0), 3),
                "Fused (0-10)": round(payload.get("fused_0_10", 0.0), 2),
                "Comment": payload.get("comment", ""),
            })
        df = pd.DataFrame(rows)
        total = weighted_total(
            {k: v.get("fused_0_10", 0.0) for k, v in metrics_out.items()},
            user_weights,
        )
        compare_rows.append({
            "Model": p.label,
            **{r["Metric"]: r["Fused (0-10)"] for _, r in df.iterrows()},
            "Total (0-10)": round(total, 2),
        })

        # Token usage
        usage_text = (
            f"{p.label}: prompt_tokens={usage.get('prompt', 0)}, "
            f"completion_tokens={usage.get('completion', 0)}, "
            f"total={usage.get('total', 0)}"
        )
        token_usage_blocks.append(usage_text)

        # Persist JSON blob per model
        json_blobs[p.label] = raw_json
        all_tables.append((p.label, df, round(total, 2)))

    # Comparison table
    compare_df = pd.DataFrame(compare_rows)

    # If multiple models, compute an average row
    avg_df = None
    if len(providers) > 1:
        # Average across numeric columns only
        num_cols = [c for c in compare_df.columns if c != "Model"]
        avg_row = {"Model": "Average"}
        for c in num_cols:
            avg_row[c] = round(compare_df[c].mean(), 2)
        avg_df = pd.DataFrame([avg_row])

    # Build downloadable CSV and ZIP
    ts = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
    out_dir = tempfile.mkdtemp(prefix="financeeval_")

    # Write per-model CSVs
    csv_paths = []
    for label, df, total in all_tables:
        pth = os.path.join(out_dir, f"results_{label}_{ts}.csv")
        df2 = df.copy()
        df2.loc[len(df2)] = {"Metric": "TOTAL", "LLM Score (1-5)": "-",
                             "NLP Subscore (0-1)": "-", "Fused (0-10)": total,
                             "Comment": ""}
        df2.to_csv(pth, index=False)
        csv_paths.append(pth)

    # Comparison CSV
    comp_path = os.path.join(out_dir, f"comparison_{ts}.csv")
    compare_df.to_csv(comp_path, index=False)

    # JSON outputs
    json_path = os.path.join(out_dir, f"judgments_{ts}.json")
    with open(json_path, 'w') as f:
        json.dump(json_blobs, f, indent=2)

    # Make ZIP
    zip_path = os.path.join(out_dir, f"financeeval_{ts}.zip")
    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zf:
        for pth in csv_paths + [comp_path, json_path]:
            zf.write(pth, arcname=os.path.basename(pth))

    # Return artifacts
    merged_tables = []
    for label, df, total in all_tables:
        merged_tables.append(pd.DataFrame({
            "Metric": [f"— {label} —"],
            "LLM Score (1-5)": [""],
            "NLP Subscore (0-1)": [""],
            "Fused (0-10)": [""],
            "Comment": [""],
        }))
        merged_tables.append(df)
    merged_df = pd.concat(merged_tables, ignore_index=True)

    usage_text_all = "\n".join(token_usage_blocks)
    return merged_df, compare_df, (avg_df if avg_df is not None else pd.DataFrame()), zip_path, usage_text_all
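
# Quick local check outside the UI (illustrative only; assumes the relevant API
# key is set and the core.* modules are importable):
#
#     merged_df, compare_df, avg_df, zip_path, usage = run_eval(
#         "Advisor: ...\nClient: ...",
#         use_openai=True, use_anthropic=False,
#         w_trust=0.20, w_accuracy=0.25, w_explain=0.15,
#         w_client=0.15, w_risk=0.15, w_clarity=0.10)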

# -----------------------------
# UI
# -----------------------------
with gr.Blocks(title="FinanceEval – Hybrid Judge (Gradio)") as demo:
    gr.Markdown("""
    # 🔎 FinanceEval – Hybrid Evaluation (Gradio / HF Spaces)
    Paste a finance conversation. Choose one or both judge models (OpenAI GPT‑4o, Claude 3.5 Sonnet). Adjust metric weights. Click **Evaluate**.
    """)

    with gr.Row():
        conversation = gr.Textbox(label="Conversation", lines=16, placeholder="Paste the full transcript here...")

    with gr.Accordion("Model Selection", open=True):
        with gr.Row():
            use_openai = gr.Checkbox(value=True, label="Use OpenAI GPT‑4o")
            use_anthropic = gr.Checkbox(value=False, label="Use Claude 3.5 Sonnet")
        with gr.Row():
            model_openai = gr.Textbox(value="gpt-4o", label="OpenAI model name")
            model_anthropic = gr.Textbox(value="claude-3-5-sonnet-20240620", label="Anthropic model name")
        gr.Markdown("**Secrets**: Set `OPENAI_API_KEY` and / or `ANTHROPIC_API_KEY` in your Space settings.")
| with gr.Accordion("Metric Weights (affect only the TOTAL)", open=True): | |
| with gr.Row(): | |
| w_trust = gr.Slider(0, 1, value=DEFAULT_METRIC_WEIGHTS["trust"], step=0.01, label="Trust") | |
| w_accuracy = gr.Slider(0, 1, value=DEFAULT_METRIC_WEIGHTS["accuracy"], step=0.01, label="Accuracy") | |
| w_explain = gr.Slider(0, 1, value=DEFAULT_METRIC_WEIGHTS["explain"], step=0.01, label="Explainability") | |
| with gr.Row(): | |
| w_client = gr.Slider(0, 1, value=DEFAULT_METRIC_WEIGHTS["client_first"], step=0.01, label="Client‑First") | |
| w_risk = gr.Slider(0, 1, value=DEFAULT_METRIC_WEIGHTS["risk_safety"], step=0.01, label="Risk Safety") | |
| w_clarity = gr.Slider(0, 1, value=DEFAULT_METRIC_WEIGHTS["clarity"], step=0.01, label="Clarity") | |
| gr.Markdown("Weights are normalized to sum to 1 before computing the TOTAL.") | |
| run_btn = gr.Button("Evaluate") | |

    with gr.Tab("Per‑Model Results"):
        table_out = gr.Dataframe(label="Metric Scores & Comments (stacked per model)")
    with gr.Tab("Comparison"):
        compare_out = gr.Dataframe(label="Model Comparison (per metric + TOTAL)")
        avg_out = gr.Dataframe(label="Average (if multiple models)")
    with gr.Tab("Downloads & Usage"):
        zip_file = gr.File(label="Download ZIP (CSVs + JSON)")
        usage_text = gr.Textbox(label="Token Usage", lines=4)

    run_btn.click(
        fn=run_eval,
        inputs=[conversation, use_openai, use_anthropic,
                w_trust, w_accuracy, w_explain, w_client, w_risk, w_clarity,
                model_openai, model_anthropic],
        outputs=[table_out, compare_out, avg_out, zip_file, usage_text],
    )
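    # The inputs list must match run_eval's parameter order, and the outputs
    # list matches its 5-tuple return: merged table, comparison, average,
    # ZIP path, usage text (the last slot also carries error messages).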

if __name__ == "__main__":
    demo.launch()