# FinanceEval / app.py
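"""FinanceEval: Gradio app for hybrid (LLM judge + NLP) evaluation of finance conversations.

The app isolates the model's utterances from a pasted transcript, scores them on six
metrics (trust, accuracy, explainability, client-first, risk safety, clarity) with one
or both judge providers (OpenAI, Anthropic), fuses the LLM judge scores with NLP
subscores per metric, and produces per-model tables, a model comparison, and
downloadable CSV/JSON artifacts bundled as a ZIP.
"""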
import os, json, tempfile, zipfile
import gradio as gr
import pandas as pd
from datetime import datetime, timezone
from core.providers import get_provider, ProviderKind
from core.preprocess import normalize_conversation, extract_model_utterances
from core.evaluators import evaluate_all_metrics
from core.fusion import weighted_total
# -----------------------------
# Defaults
# -----------------------------
DEFAULT_METRIC_WEIGHTS = {
"trust": 0.20,
"accuracy": 0.25,
"explain": 0.15,
"client_first": 0.15,
"risk_safety": 0.15,
"clarity": 0.10,
}
JUDGE_ALPHA = {
# α = LLM share in fusion per metric (from spec)
"trust": 0.70,
"accuracy": 0.65,
"explain": 0.50,
"client_first": 0.70,
"risk_safety": 0.60,
"clarity": 0.70,
}
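# Fusion sketch (assumption -- core.evaluators / core.fusion are not shown in this file):
# each metric's judge score (1-5) is rescaled to 0-10 and blended with the NLP subscore
# (0-1, scaled to 0-10) using the metric's alpha as the LLM share, roughly
#   fused_0_10 ~= alpha * judge_scaled_0_10 + (1 - alpha) * nlp_subscore * 10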
# -----------------------------
# Core runner
# -----------------------------
def run_eval(conversation: str,
use_openai: bool,
use_anthropic: bool,
w_trust: float, w_accuracy: float, w_explain: float,
w_client: float, w_risk: float, w_clarity: float,
model_openai: str = "gpt-4o",
model_anthropic: str = "claude-3-5-sonnet-20240620"):
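    """Evaluate a pasted finance conversation with the selected judge model(s).

    Returns (stacked per-model results DataFrame, comparison DataFrame, average
    DataFrame or empty DataFrame, path to a ZIP of CSV/JSON artifacts, token-usage
    text). On invalid input, the first four values are None and the last carries
    the error message.
    """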
if not conversation or conversation.strip() == "":
return None, None, None, None, "Please paste a conversation to evaluate."
# Normalize metric weights
user_weights = {
"trust": w_trust,
"accuracy": w_accuracy,
"explain": w_explain,
"client_first": w_client,
"risk_safety": w_risk,
"clarity": w_clarity,
}
s = sum(user_weights.values()) or 1.0
for k in user_weights:
user_weights[k] = user_weights[k] / s
# Preprocess conversation
norm = normalize_conversation(conversation)
# Try to isolate model utterances (LLM fallback inside if ambiguous)
    model_only = extract_model_utterances(
        norm,
        prefer_llm_provider=(model_openai if use_openai else (model_anthropic if use_anthropic else None)),
    )
providers = []
if use_openai:
providers.append(get_provider(ProviderKind.OPENAI, model_openai))
if use_anthropic:
providers.append(get_provider(ProviderKind.ANTHROPIC, model_anthropic))
if not providers:
return None, None, None, None, "Select at least one model provider."
all_tables = []
compare_rows = []
token_usage_blocks = []
json_blobs = {}
for p in providers:
metrics_out, usage, raw_json = evaluate_all_metrics(provider=p,
conversation_text=model_only,
alpha_map=JUDGE_ALPHA)
# Build table
rows = []
for m, payload in metrics_out.items():
rows.append({
"Metric": m,
"LLM Score (1-5)": payload.get("judge_score", None),
"NLP Subscore (0-1)": round(payload.get("nlp_subscore", 0.0), 3),
"Fused (0-10)": round(payload.get("fused_0_10", 0.0), 2),
"Comment": payload.get("comment", "")
})
df = pd.DataFrame(rows)
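        # weighted_total (from core.fusion) is assumed to return the weighted sum of
        # the fused per-metric scores using the normalized user weights.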
total = weighted_total({k: v.get("fused_0_10", 0.0) for k, v in metrics_out.items()}, user_weights)
compare_rows.append({"Model": p.label, **{r["Metric"]: r["Fused (0-10)"] for _, r in df.iterrows()}, "Total (0-10)": round(total, 2)})
# Token usage
usage_text = f"{p.label}: prompt_tokens={usage.get('prompt',0)}, completion_tokens={usage.get('completion',0)}, total={usage.get('total',0)}"
token_usage_blocks.append(usage_text)
# Persist JSON blob per model
json_blobs[p.label] = raw_json
all_tables.append((p.label, df, round(total, 2)))
# Comparison table
compare_df = pd.DataFrame(compare_rows)
    # If more than one model was run, compute an average row
avg_df = None
if len(providers) > 1:
# Average across numeric columns only
num_cols = [c for c in compare_df.columns if c != "Model"]
avg_row = {"Model": "Average"}
for c in num_cols:
avg_row[c] = round(compare_df[c].mean(), 2)
avg_df = pd.DataFrame([avg_row])
# Build downloadable CSV and ZIP
ts = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
out_dir = tempfile.mkdtemp(prefix="financeeval_")
# Write per-model CSVs
csv_paths = []
for label, df, total in all_tables:
pth = os.path.join(out_dir, f"results_{label}_{ts}.csv")
df2 = df.copy()
        df2.loc[len(df2)] = {
            "Metric": "TOTAL",
            "LLM Score (1-5)": "-",
            "NLP Subscore (0-1)": "-",
            "Fused (0-10)": total,
            "Comment": "",
        }
df2.to_csv(pth, index=False)
csv_paths.append(pth)
# Comparison CSV
comp_path = os.path.join(out_dir, f"comparison_{ts}.csv")
compare_df.to_csv(comp_path, index=False)
# JSON outputs
json_path = os.path.join(out_dir, f"judgments_{ts}.json")
with open(json_path, 'w') as f:
json.dump(json_blobs, f, indent=2)
# Make ZIP
zip_path = os.path.join(out_dir, f"financeeval_{ts}.zip")
with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zf:
for pth in csv_paths + [comp_path, json_path]:
zf.write(pth, arcname=os.path.basename(pth))
# Return artifacts
merged_tables = []
for label, df, total in all_tables:
        merged_tables.append(pd.DataFrame({
            "Metric": [f"— {label} —"],
            "LLM Score (1-5)": [""],
            "NLP Subscore (0-1)": [""],
            "Fused (0-10)": [""],
            "Comment": [""],
        }))
merged_tables.append(df)
merged_df = pd.concat(merged_tables, ignore_index=True)
usage_text_all = "\n".join(token_usage_blocks)
return merged_df, compare_df, (avg_df if avg_df is not None else pd.DataFrame()), zip_path, usage_text_all
# -----------------------------
# UI
# -----------------------------
with gr.Blocks(title="FinanceEval – Hybrid Judge (Gradio)") as demo:
gr.Markdown("""
# 🔎 FinanceEval – Hybrid Evaluation (Gradio / HF Spaces)
Paste a finance conversation. Choose one or both judge models (OpenAI GPT‑4o, Claude 3.5 Sonnet). Adjust metric weights. Click **Evaluate**.
""")
with gr.Row():
conversation = gr.Textbox(label="Conversation", lines=16, placeholder="Paste the full transcript here...")
with gr.Accordion("Model Selection", open=True):
with gr.Row():
use_openai = gr.Checkbox(value=True, label="Use OpenAI GPT‑4o")
use_anthropic = gr.Checkbox(value=False, label="Use Claude 3.5 Sonnet")
with gr.Row():
model_openai = gr.Textbox(value="gpt-4o", label="OpenAI model name")
model_anthropic = gr.Textbox(value="claude-3-5-sonnet-20240620", label="Anthropic model name")
gr.Markdown("**Secrets**: Set `OPENAI_API_KEY` and / or `ANTHROPIC_API_KEY` in your Space settings.")
with gr.Accordion("Metric Weights (affect only the TOTAL)", open=True):
with gr.Row():
w_trust = gr.Slider(0, 1, value=DEFAULT_METRIC_WEIGHTS["trust"], step=0.01, label="Trust")
w_accuracy = gr.Slider(0, 1, value=DEFAULT_METRIC_WEIGHTS["accuracy"], step=0.01, label="Accuracy")
w_explain = gr.Slider(0, 1, value=DEFAULT_METRIC_WEIGHTS["explain"], step=0.01, label="Explainability")
with gr.Row():
w_client = gr.Slider(0, 1, value=DEFAULT_METRIC_WEIGHTS["client_first"], step=0.01, label="Client‑First")
w_risk = gr.Slider(0, 1, value=DEFAULT_METRIC_WEIGHTS["risk_safety"], step=0.01, label="Risk Safety")
w_clarity = gr.Slider(0, 1, value=DEFAULT_METRIC_WEIGHTS["clarity"], step=0.01, label="Clarity")
gr.Markdown("Weights are normalized to sum to 1 before computing the TOTAL.")
run_btn = gr.Button("Evaluate")
with gr.Tab("Per‑Model Results"):
table_out = gr.Dataframe(label="Metric Scores & Comments (stacked per model)")
with gr.Tab("Comparison"):
compare_out = gr.Dataframe(label="Model Comparison (per metric + TOTAL)")
avg_out = gr.Dataframe(label="Average (if multiple models)")
with gr.Tab("Downloads & Usage"):
zip_file = gr.File(label="Download ZIP (CSVs + JSON)")
usage_text = gr.Textbox(label="Token Usage", lines=4)
run_btn.click(
fn=run_eval,
inputs=[conversation, use_openai, use_anthropic,
w_trust, w_accuracy, w_explain, w_client, w_risk, w_clarity,
model_openai, model_anthropic],
outputs=[table_out, compare_out, avg_out, zip_file, usage_text]
)
if __name__ == "__main__":
demo.launch()