navaneethkrishnan committed
Commit 2c49fd3 · verified · 1 Parent(s): af3a8fe

Create app.py

Files changed (1):
  1. app.py +210 -0
app.py ADDED
@@ -0,0 +1,210 @@
import os, io, json, time, tempfile, zipfile
import gradio as gr
import pandas as pd
from datetime import datetime, timezone

from core.providers import get_provider, ProviderKind
from core.preprocess import normalize_conversation, extract_model_utterances
from core.evaluators import evaluate_all_metrics
from core.fusion import weighted_total

# -----------------------------
# Defaults
# -----------------------------
DEFAULT_METRIC_WEIGHTS = {
    "trust": 0.20,
    "accuracy": 0.25,
    "explain": 0.15,
    "client_first": 0.15,
    "risk_safety": 0.15,
    "clarity": 0.10,
}

JUDGE_ALPHA = {
    # α = LLM share in fusion per metric (from spec)
    "trust": 0.70,
    "accuracy": 0.65,
    "explain": 0.50,
    "client_first": 0.70,
    "risk_safety": 0.60,
    "clarity": 0.70,
}

# -----------------------------
# Core runner
# -----------------------------

def run_eval(conversation: str,
             use_openai: bool,
             use_anthropic: bool,
             w_trust: float, w_accuracy: float, w_explain: float,
             w_client: float, w_risk: float, w_clarity: float,
             model_openai: str = "gpt-4o",
             model_anthropic: str = "claude-3-5-sonnet-20240620"):
    if not conversation or conversation.strip() == "":
        return None, None, None, None, "Please paste a conversation to evaluate."

    # Normalize metric weights
    user_weights = {
        "trust": w_trust,
        "accuracy": w_accuracy,
        "explain": w_explain,
        "client_first": w_client,
        "risk_safety": w_risk,
        "clarity": w_clarity,
    }
    s = sum(user_weights.values()) or 1.0
    for k in user_weights:
        user_weights[k] = user_weights[k] / s

    # Preprocess conversation
    norm = normalize_conversation(conversation)
    # Try to isolate model utterances (LLM fallback inside if ambiguous)
    model_only = extract_model_utterances(norm, prefer_llm_provider=(model_openai if use_openai else (model_anthropic if use_anthropic else None)))

    providers = []
    if use_openai:
        providers.append(get_provider(ProviderKind.OPENAI, model_openai))
    if use_anthropic:
        providers.append(get_provider(ProviderKind.ANTHROPIC, model_anthropic))

    if not providers:
        return None, None, None, None, "Select at least one model provider."

    all_tables = []
    compare_rows = []
    token_usage_blocks = []
    json_blobs = {}

    for p in providers:
        metrics_out, usage, raw_json = evaluate_all_metrics(provider=p,
                                                            conversation_text=model_only,
                                                            alpha_map=JUDGE_ALPHA)
        # Build table
        rows = []
        for m, payload in metrics_out.items():
            rows.append({
                "Metric": m,
                "LLM Score (1-5)": payload.get("judge_score", None),
                "NLP Subscore (0-1)": round(payload.get("nlp_subscore", 0.0), 3),
                "Fused (0-10)": round(payload.get("fused_0_10", 0.0), 2),
                "Comment": payload.get("comment", "")
            })
        df = pd.DataFrame(rows)
        total = weighted_total({k: v.get("fused_0_10", 0.0) for k, v in metrics_out.items()}, user_weights)
        compare_rows.append({"Model": p.label, **{r["Metric"]: r["Fused (0-10)"] for _, r in df.iterrows()}, "Total (0-10)": round(total, 2)})

        # Token usage
        usage_text = f"{p.label}: prompt_tokens={usage.get('prompt',0)}, completion_tokens={usage.get('completion',0)}, total={usage.get('total',0)}"
        token_usage_blocks.append(usage_text)

        # Persist JSON blob per model
        json_blobs[p.label] = raw_json

        all_tables.append((p.label, df, round(total, 2)))

    # Comparison table
    compare_df = pd.DataFrame(compare_rows)

    # If 2 models, compute an average row
    avg_df = None
    if len(providers) > 1:
        # Average across numeric columns only
        num_cols = [c for c in compare_df.columns if c != "Model"]
        avg_row = {"Model": "Average"}
        for c in num_cols:
            avg_row[c] = round(compare_df[c].mean(), 2)
        avg_df = pd.DataFrame([avg_row])

    # Build downloadable CSV and ZIP
    ts = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
    out_dir = tempfile.mkdtemp(prefix="financeeval_")

    # Write per-model CSVs
    csv_paths = []
    for label, df, total in all_tables:
        pth = os.path.join(out_dir, f"results_{label}_{ts}.csv")
        df2 = df.copy()
        df2.loc[len(df2)] = {"Metric": "TOTAL", "LLM Score (1-5)": "-", "NLP Subscore (0-1)": "-", "Fused (0-10)": total, "Comment": ""}
        df2.to_csv(pth, index=False)
        csv_paths.append(pth)

    # Comparison CSV
    comp_path = os.path.join(out_dir, f"comparison_{ts}.csv")
    compare_df.to_csv(comp_path, index=False)

    # JSON outputs
    json_path = os.path.join(out_dir, f"judgments_{ts}.json")
    with open(json_path, 'w') as f:
        json.dump(json_blobs, f, indent=2)

    # Make ZIP
    zip_path = os.path.join(out_dir, f"financeeval_{ts}.zip")
    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zf:
        for pth in csv_paths + [comp_path, json_path]:
            zf.write(pth, arcname=os.path.basename(pth))

    # Return artifacts
    merged_tables = []
    for label, df, total in all_tables:
        merged_tables.append(pd.DataFrame({"Metric": [f"— {label} —"], "LLM Score (1-5)": [""], "NLP Subscore (0-1)": [""], "Fused (0-10)": [""], "Comment": [""]}))
        merged_tables.append(df)
    merged_df = pd.concat(merged_tables, ignore_index=True)

    usage_text_all = "\n".join(token_usage_blocks)

    return merged_df, compare_df, (avg_df if avg_df is not None else pd.DataFrame()), zip_path, usage_text_all

# -----------------------------
# UI
# -----------------------------
with gr.Blocks(title="FinanceEval – Hybrid Judge (Gradio)") as demo:
    gr.Markdown("""
    # 🔎 FinanceEval – Hybrid Evaluation (Gradio / HF Spaces)
    Paste a finance conversation. Choose one or both judge models (OpenAI GPT‑4o, Claude 3.5 Sonnet). Adjust metric weights. Click **Evaluate**.
    """)

    with gr.Row():
        conversation = gr.Textbox(label="Conversation", lines=16, placeholder="Paste the full transcript here...")

    with gr.Accordion("Model Selection", open=True):
        with gr.Row():
            use_openai = gr.Checkbox(value=True, label="Use OpenAI GPT‑4o")
            use_anthropic = gr.Checkbox(value=False, label="Use Claude 3.5 Sonnet")
        with gr.Row():
            model_openai = gr.Textbox(value="gpt-4o", label="OpenAI model name")
            model_anthropic = gr.Textbox(value="claude-3-5-sonnet-20240620", label="Anthropic model name")
        gr.Markdown("**Secrets**: Set `OPENAI_API_KEY` and / or `ANTHROPIC_API_KEY` in your Space settings.")

    with gr.Accordion("Metric Weights (affect only the TOTAL)", open=True):
        with gr.Row():
            w_trust = gr.Slider(0, 1, value=DEFAULT_METRIC_WEIGHTS["trust"], step=0.01, label="Trust")
            w_accuracy = gr.Slider(0, 1, value=DEFAULT_METRIC_WEIGHTS["accuracy"], step=0.01, label="Accuracy")
            w_explain = gr.Slider(0, 1, value=DEFAULT_METRIC_WEIGHTS["explain"], step=0.01, label="Explainability")
        with gr.Row():
            w_client = gr.Slider(0, 1, value=DEFAULT_METRIC_WEIGHTS["client_first"], step=0.01, label="Client‑First")
            w_risk = gr.Slider(0, 1, value=DEFAULT_METRIC_WEIGHTS["risk_safety"], step=0.01, label="Risk Safety")
            w_clarity = gr.Slider(0, 1, value=DEFAULT_METRIC_WEIGHTS["clarity"], step=0.01, label="Clarity")
        gr.Markdown("Weights are normalized to sum to 1 before computing the TOTAL.")

    run_btn = gr.Button("Evaluate")

    with gr.Tab("Per‑Model Results"):
        table_out = gr.Dataframe(label="Metric Scores & Comments (stacked per model)")
    with gr.Tab("Comparison"):
        compare_out = gr.Dataframe(label="Model Comparison (per metric + TOTAL)")
        avg_out = gr.Dataframe(label="Average (if multiple models)")
    with gr.Tab("Downloads & Usage"):
        zip_file = gr.File(label="Download ZIP (CSVs + JSON)")
        usage_text = gr.Textbox(label="Token Usage", lines=4)

    run_btn.click(
        fn=run_eval,
        inputs=[conversation, use_openai, use_anthropic,
                w_trust, w_accuracy, w_explain, w_client, w_risk, w_clarity,
                model_openai, model_anthropic],
        outputs=[table_out, compare_out, avg_out, zip_file, usage_text]
    )

if __name__ == "__main__":
    demo.launch()
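
Note: the core package imported at the top of app.py (core.providers, core.preprocess, core.evaluators, core.fusion) is not included in this commit, so its interfaces are only pinned down by the call sites above. The sketch below shows the shapes app.py relies on; it is an assumption for illustration, not the actual core/providers.py or core/evaluators.py, and everything in it beyond get_provider, ProviderKind, and the label attribute is hypothetical.

# --- Sketch: provider/evaluator shapes implied by app.py (hypothetical, not in this commit) ---
import os
from dataclasses import dataclass
from enum import Enum


class ProviderKind(Enum):
    OPENAI = "openai"
    ANTHROPIC = "anthropic"


@dataclass
class Provider:
    kind: ProviderKind
    model: str

    @property
    def label(self) -> str:
        # app.py uses p.label in table rows and in the per-model CSV filenames
        return f"{self.kind.value}-{self.model}"

    @property
    def api_key(self) -> str:
        # Keys come from the Space secrets named in the UI text
        env = "OPENAI_API_KEY" if self.kind is ProviderKind.OPENAI else "ANTHROPIC_API_KEY"
        return os.environ.get(env, "")


def get_provider(kind: ProviderKind, model: str) -> Provider:
    return Provider(kind=kind, model=model)


# evaluate_all_metrics(provider=..., conversation_text=..., alpha_map=...) is expected to return
# three values, matching how run_eval unpacks and reads them:
#   metrics_out: {metric: {"judge_score": 1-5, "nlp_subscore": 0-1, "fused_0_10": 0-10, "comment": str}}
#   usage:       {"prompt": int, "completion": int, "total": int}
#   raw_json:    raw per-metric judgments, serialized to judgments_<timestamp>.json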
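
The hybrid scoring itself also lives in core/: JUDGE_ALPHA gives the LLM share α per metric, and weighted_total collapses the fused per-metric scores into the TOTAL using the normalized slider weights. A minimal sketch consistent with those call sites follows; weighted_total mirrors the signature app.py actually calls, while fuse_metric is a hypothetical name and its rescaling of the 1-5 judge score and 0-1 NLP subscore onto 0-10 is an assumed mapping, not taken from the spec.

# --- Sketch: fusion math implied by JUDGE_ALPHA and weighted_total (hypothetical) ---

def fuse_metric(judge_score: float, nlp_subscore: float, alpha: float) -> float:
    """Blend an LLM judge score (1-5) and an NLP subscore (0-1) into a 0-10 value.

    alpha is the LLM share for the metric, e.g. JUDGE_ALPHA["trust"] = 0.70.
    The 1-5 -> 0-10 mapping below is one plausible choice, not confirmed by this commit.
    """
    judge_0_10 = (judge_score - 1.0) / 4.0 * 10.0
    nlp_0_10 = nlp_subscore * 10.0
    return alpha * judge_0_10 + (1.0 - alpha) * nlp_0_10


def weighted_total(fused_by_metric: dict, weights: dict) -> float:
    """Weighted sum of fused 0-10 scores; app.py normalizes the weights to sum to 1 first."""
    return sum(fused_by_metric.get(metric, 0.0) * w for metric, w in weights.items())

Because the weights sum to 1 and each fused score lies on 0-10, the weighted sum also lands on 0-10, which matches the "Total (0-10)" column in the comparison table.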