# FinanceEval / app.py
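"""FinanceEval: Gradio app for hybrid (LLM judge + NLP) evaluation of finance conversations.

The app isolates the model's utterances from a pasted transcript, scores them on six
metrics (trust, accuracy, explainability, client-first, risk safety, clarity) with one
or both judge providers (OpenAI, Anthropic), fuses the LLM judge scores with NLP
subscores per metric, and produces per-model tables, a model comparison, and
downloadable CSV/JSON artifacts bundled as a ZIP.
"""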
import os, json, tempfile, zipfile
import gradio as gr
import pandas as pd
from datetime import datetime, timezone
from core.providers import get_provider, ProviderKind
from core.preprocess import normalize_conversation, extract_model_utterances
from core.evaluators import evaluate_all_metrics
from core.fusion import weighted_total
# -----------------------------
# Defaults
# -----------------------------
DEFAULT_METRIC_WEIGHTS = {
"trust": 0.20,
"accuracy": 0.25,
"explain": 0.15,
"client_first": 0.15,
"risk_safety": 0.15,
"clarity": 0.10,
}
JUDGE_ALPHA = {
# α = LLM share in fusion per metric (from spec)
"trust": 0.70,
"accuracy": 0.65,
"explain": 0.50,
"client_first": 0.70,
"risk_safety": 0.60,
"clarity": 0.70,
}
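# Fusion sketch (assumption -- core.evaluators / core.fusion are not shown in this file):
# each metric's judge score (1-5) is rescaled to 0-10 and blended with the NLP subscore
# (0-1, scaled to 0-10) using the metric's alpha as the LLM share, roughly
#   fused_0_10 ~= alpha * judge_scaled_0_10 + (1 - alpha) * nlp_subscore * 10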
# -----------------------------
# Core runner
# -----------------------------
def run_eval(conversation: str,
use_openai: bool,
use_anthropic: bool,
w_trust: float, w_accuracy: float, w_explain: float,
w_client: float, w_risk: float, w_clarity: float,
model_openai: str = "gpt-4o",
model_anthropic: str = "claude-3-5-sonnet-20240620"):
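    """Evaluate a pasted finance conversation with the selected judge model(s).

    Returns (stacked per-model results DataFrame, comparison DataFrame, average
    DataFrame or empty DataFrame, path to a ZIP of CSV/JSON artifacts, token-usage
    text). On invalid input, the first four values are None and the last carries
    the error message.
    """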
if not conversation or conversation.strip() == "":
return None, None, None, None, "Please paste a conversation to evaluate."
# Normalize metric weights
user_weights = {
"trust": w_trust,
"accuracy": w_accuracy,
"explain": w_explain,
"client_first": w_client,
"risk_safety": w_risk,
"clarity": w_clarity,
}
s = sum(user_weights.values()) or 1.0
for k in user_weights:
user_weights[k] = user_weights[k] / s
# Preprocess conversation
norm = normalize_conversation(conversation)
# Try to isolate model utterances (LLM fallback inside if ambiguous)
    model_only = extract_model_utterances(
        norm,
        prefer_llm_provider=(model_openai if use_openai else (model_anthropic if use_anthropic else None)),
    )
providers = []
if use_openai:
providers.append(get_provider(ProviderKind.OPENAI, model_openai))
if use_anthropic:
providers.append(get_provider(ProviderKind.ANTHROPIC, model_anthropic))
if not providers:
return None, None, None, None, "Select at least one model provider."
all_tables = []
compare_rows = []
token_usage_blocks = []
json_blobs = {}
for p in providers:
metrics_out, usage, raw_json = evaluate_all_metrics(provider=p,
conversation_text=model_only,
alpha_map=JUDGE_ALPHA)
# Build table
rows = []
for m, payload in metrics_out.items():
rows.append({
"Metric": m,
"LLM Score (1-5)": payload.get("judge_score", None),
"NLP Subscore (0-1)": round(payload.get("nlp_subscore", 0.0), 3),
"Fused (0-10)": round(payload.get("fused_0_10", 0.0), 2),
"Comment": payload.get("comment", "")
})
df = pd.DataFrame(rows)
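        # weighted_total (from core.fusion) is assumed to return the weighted sum of
        # the fused per-metric scores using the normalized user weights.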
total = weighted_total({k: v.get("fused_0_10", 0.0) for k, v in metrics_out.items()}, user_weights)
compare_rows.append({"Model": p.label, **{r["Metric"]: r["Fused (0-10)"] for _, r in df.iterrows()}, "Total (0-10)": round(total, 2)})
# Token usage
usage_text = f"{p.label}: prompt_tokens={usage.get('prompt',0)}, completion_tokens={usage.get('completion',0)}, total={usage.get('total',0)}"
token_usage_blocks.append(usage_text)
# Persist JSON blob per model
json_blobs[p.label] = raw_json
all_tables.append((p.label, df, round(total, 2)))
# Comparison table
compare_df = pd.DataFrame(compare_rows)
    # If more than one model was run, compute an average row
avg_df = None
if len(providers) > 1:
# Average across numeric columns only
num_cols = [c for c in compare_df.columns if c != "Model"]
avg_row = {"Model": "Average"}
for c in num_cols:
avg_row[c] = round(compare_df[c].mean(), 2)
avg_df = pd.DataFrame([avg_row])
# Build downloadable CSV and ZIP
ts = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
out_dir = tempfile.mkdtemp(prefix="financeeval_")
# Write per-model CSVs
csv_paths = []
for label, df, total in all_tables:
pth = os.path.join(out_dir, f"results_{label}_{ts}.csv")
df2 = df.copy()
        df2.loc[len(df2)] = {
            "Metric": "TOTAL",
            "LLM Score (1-5)": "-",
            "NLP Subscore (0-1)": "-",
            "Fused (0-10)": total,
            "Comment": "",
        }
df2.to_csv(pth, index=False)
csv_paths.append(pth)
# Comparison CSV
comp_path = os.path.join(out_dir, f"comparison_{ts}.csv")
compare_df.to_csv(comp_path, index=False)
# JSON outputs
json_path = os.path.join(out_dir, f"judgments_{ts}.json")
with open(json_path, 'w') as f:
json.dump(json_blobs, f, indent=2)
# Make ZIP
zip_path = os.path.join(out_dir, f"financeeval_{ts}.zip")
with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zf:
for pth in csv_paths + [comp_path, json_path]:
zf.write(pth, arcname=os.path.basename(pth))
# Return artifacts
merged_tables = []
for label, df, total in all_tables:
        merged_tables.append(pd.DataFrame({
            "Metric": [f"— {label} —"],
            "LLM Score (1-5)": [""],
            "NLP Subscore (0-1)": [""],
            "Fused (0-10)": [""],
            "Comment": [""],
        }))
merged_tables.append(df)
merged_df = pd.concat(merged_tables, ignore_index=True)
usage_text_all = "\n".join(token_usage_blocks)
return merged_df, compare_df, (avg_df if avg_df is not None else pd.DataFrame()), zip_path, usage_text_all
# -----------------------------
# UI
# -----------------------------
with gr.Blocks(title="FinanceEval – Hybrid Judge (Gradio)") as demo:
gr.Markdown("""
# 🔎 FinanceEval – Hybrid Evaluation (Gradio / HF Spaces)
Paste a finance conversation. Choose one or both judge models (OpenAI GPT‑4o, Claude 3.5 Sonnet). Adjust metric weights. Click **Evaluate**.
""")
with gr.Row():
conversation = gr.Textbox(label="Conversation", lines=16, placeholder="Paste the full transcript here...")
with gr.Accordion("Model Selection", open=True):
with gr.Row():
use_openai = gr.Checkbox(value=True, label="Use OpenAI GPT‑4o")
use_anthropic = gr.Checkbox(value=False, label="Use Claude 3.5 Sonnet")
with gr.Row():
model_openai = gr.Textbox(value="gpt-4o", label="OpenAI model name")
model_anthropic = gr.Textbox(value="claude-3-5-sonnet-20240620", label="Anthropic model name")
gr.Markdown("**Secrets**: Set `OPENAI_API_KEY` and / or `ANTHROPIC_API_KEY` in your Space settings.")
with gr.Accordion("Metric Weights (affect only the TOTAL)", open=True):
with gr.Row():
w_trust = gr.Slider(0, 1, value=DEFAULT_METRIC_WEIGHTS["trust"], step=0.01, label="Trust")
w_accuracy = gr.Slider(0, 1, value=DEFAULT_METRIC_WEIGHTS["accuracy"], step=0.01, label="Accuracy")
w_explain = gr.Slider(0, 1, value=DEFAULT_METRIC_WEIGHTS["explain"], step=0.01, label="Explainability")
with gr.Row():
w_client = gr.Slider(0, 1, value=DEFAULT_METRIC_WEIGHTS["client_first"], step=0.01, label="Client‑First")
w_risk = gr.Slider(0, 1, value=DEFAULT_METRIC_WEIGHTS["risk_safety"], step=0.01, label="Risk Safety")
w_clarity = gr.Slider(0, 1, value=DEFAULT_METRIC_WEIGHTS["clarity"], step=0.01, label="Clarity")
gr.Markdown("Weights are normalized to sum to 1 before computing the TOTAL.")
run_btn = gr.Button("Evaluate")
with gr.Tab("Per‑Model Results"):
table_out = gr.Dataframe(label="Metric Scores & Comments (stacked per model)")
with gr.Tab("Comparison"):
compare_out = gr.Dataframe(label="Model Comparison (per metric + TOTAL)")
avg_out = gr.Dataframe(label="Average (if multiple models)")
with gr.Tab("Downloads & Usage"):
zip_file = gr.File(label="Download ZIP (CSVs + JSON)")
usage_text = gr.Textbox(label="Token Usage", lines=4)
run_btn.click(
fn=run_eval,
inputs=[conversation, use_openai, use_anthropic,
w_trust, w_accuracy, w_explain, w_client, w_risk, w_clarity,
model_openai, model_anthropic],
outputs=[table_out, compare_out, avg_out, zip_file, usage_text]
)
if __name__ == "__main__":
demo.launch()