Spaces:
				
			
			
	
			
			
		Sleeping
		
	
	
	
			
			
	
	
	
	
		
		
		Sleeping
		
	Create app.py
Browse files
    	
        app.py
    ADDED
    
    | @@ -0,0 +1,210 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            import os, io, json, time, tempfile, zipfile
         | 
| 2 | 
            +
            import gradio as gr
         | 
| 3 | 
            +
            import pandas as pd
         | 
| 4 | 
            +
            from datetime import datetime, timezone
         | 
| 5 | 
            +
             | 
| 6 | 
            +
            from core.providers import get_provider, ProviderKind
         | 
| 7 | 
            +
            from core.preprocess import normalize_conversation, extract_model_utterances
         | 
| 8 | 
            +
            from core.evaluators import evaluate_all_metrics
         | 
| 9 | 
            +
            from core.fusion import weighted_total
         | 
| 10 | 
            +
             | 
| 11 | 
            +
            # -----------------------------
         | 
| 12 | 
            +
            # Defaults
         | 
| 13 | 
            +
            # -----------------------------
         | 
| 14 | 
            +
            # Initial value of each metric-weight slider in the UI; the six shares add up to 1.00.
            DEFAULT_METRIC_WEIGHTS = dict(
                trust=0.20,
                accuracy=0.25,
                explain=0.15,
                client_first=0.15,
                risk_safety=0.15,
                clarity=0.10,
            )
         | 
| 22 | 
            +
             | 
| 23 | 
            +
            # α = LLM share in fusion per metric (from spec).
            # Passed unchanged to evaluate_all_metrics() as its `alpha_map` argument.
            JUDGE_ALPHA = dict(
                trust=0.70,
                accuracy=0.65,
                explain=0.50,
                client_first=0.70,
                risk_safety=0.60,
                clarity=0.70,
            )
         | 
| 32 | 
            +
             | 
| 33 | 
            +
            # -----------------------------
         | 
| 34 | 
            +
            # Core runner
         | 
| 35 | 
            +
            # -----------------------------
         | 
| 36 | 
            +
             | 
| 37 | 
            +
            def _safe_filename_part(label) -> str:
                """Return ``label`` reduced to characters that are safe inside a file name.

                Alphanumerics and ``-``, ``_``, ``.`` are kept; every other character
                (path separators included) becomes ``_``. Leading/trailing dots are
                stripped so the result can never be ``.`` or ``..``; an empty result
                falls back to ``"model"``.
                """
                cleaned = "".join(
                    ch if (ch.isalnum() or ch in "-_.") else "_" for ch in str(label)
                )
                return cleaned.strip(".") or "model"


            def run_eval(conversation: str,
                         use_openai: bool,
                         use_anthropic: bool,
                         w_trust: float, w_accuracy: float, w_explain: float,
                         w_client: float, w_risk: float, w_clarity: float,
                         model_openai: str = "gpt-4o",
                         model_anthropic: str = "claude-3-5-sonnet-20240620"):
                """Score a conversation with each selected judge model and package the results.

                Args:
                    conversation: Raw transcript text pasted by the user.
                    use_openai: Whether to run the OpenAI judge.
                    use_anthropic: Whether to run the Anthropic judge.
                    w_trust, w_accuracy, w_explain, w_client, w_risk, w_clarity:
                        Per-metric weights for the TOTAL. They are divided by their sum
                        before use; if they sum to 0 they are left as zeros.
                    model_openai: Model name handed to the OpenAI provider.
                    model_anthropic: Model name handed to the Anthropic provider.

                Returns:
                    A 5-tuple in the order the UI outputs are wired:
                    ``(merged_df, compare_df, avg_df, zip_path, usage_text)``.

                    * ``merged_df``: per-metric rows of every model stacked, each group
                      preceded by a ``— label —`` separator row.
                    * ``compare_df``: one row per model with its fused per-metric scores
                      and weighted total.
                    * ``avg_df``: a single "Average" row when more than one model ran,
                      otherwise an empty DataFrame.
                    * ``zip_path``: path of a ZIP (inside a fresh temp directory) holding
                      the per-model CSVs, the comparison CSV and the raw-judgment JSON.
                    * ``usage_text``: one token-usage line per model.

                    When input validation fails the tuple is
                    ``(None, None, None, None, <message>)``.
                """
                if not conversation or not conversation.strip():
                    return None, None, None, None, "Please paste a conversation to evaluate."

                # Validate the provider selection up front so that no preprocessing
                # (which may itself fall back to an LLM) runs for a request that
                # cannot be evaluated anyway.
                if not (use_openai or use_anthropic):
                    return None, None, None, None, "Select at least one model provider."

                # Normalize metric weights (an all-zero selection divides by 1.0 instead of 0).
                user_weights = {
                    "trust": w_trust,
                    "accuracy": w_accuracy,
                    "explain": w_explain,
                    "client_first": w_client,
                    "risk_safety": w_risk,
                    "clarity": w_clarity,
                }
                weight_sum = sum(user_weights.values()) or 1.0
                user_weights = {name: w / weight_sum for name, w in user_weights.items()}

                # Preprocess conversation and try to isolate the model utterances
                # (LLM fallback inside if ambiguous). OpenAI is preferred when both
                # providers are selected; at least one is selected at this point.
                norm = normalize_conversation(conversation)
                preferred_model = model_openai if use_openai else model_anthropic
                model_only = extract_model_utterances(norm, prefer_llm_provider=preferred_model)

                providers = []
                if use_openai:
                    providers.append(get_provider(ProviderKind.OPENAI, model_openai))
                if use_anthropic:
                    providers.append(get_provider(ProviderKind.ANTHROPIC, model_anthropic))

                # Fixed column order; also keeps the frame well-formed if a judge
                # returns no metrics at all.
                result_columns = ["Metric", "LLM Score (1-5)", "NLP Subscore (0-1)",
                                  "Fused (0-10)", "Comment"]

                all_tables = []          # (label, per-metric DataFrame, rounded total)
                compare_rows = []        # one dict per model for the comparison table
                token_usage_blocks = []  # one human-readable usage line per model
                json_blobs = {}          # raw judge output keyed by model label

                for p in providers:
                    metrics_out, usage, raw_json = evaluate_all_metrics(
                        provider=p,
                        conversation_text=model_only,
                        alpha_map=JUDGE_ALPHA,
                    )

                    # Build table
                    rows = [
                        {
                            "Metric": m,
                            "LLM Score (1-5)": payload.get("judge_score", None),
                            "NLP Subscore (0-1)": round(payload.get("nlp_subscore", 0.0), 3),
                            "Fused (0-10)": round(payload.get("fused_0_10", 0.0), 2),
                            "Comment": payload.get("comment", ""),
                        }
                        for m, payload in metrics_out.items()
                    ]
                    df = pd.DataFrame(rows, columns=result_columns)

                    # The total is computed from the unrounded fused scores.
                    fused_scores = {k: v.get("fused_0_10", 0.0) for k, v in metrics_out.items()}
                    total = round(weighted_total(fused_scores, user_weights), 2)

                    compare_rows.append({
                        "Model": p.label,
                        **{r["Metric"]: r["Fused (0-10)"] for r in rows},
                        "Total (0-10)": total,
                    })

                    # Token usage
                    usage_text = f"{p.label}: prompt_tokens={usage.get('prompt',0)}, completion_tokens={usage.get('completion',0)}, total={usage.get('total',0)}"
                    token_usage_blocks.append(usage_text)

                    # Persist JSON blob per model
                    json_blobs[p.label] = raw_json

                    all_tables.append((p.label, df, total))

                # Comparison table
                compare_df = pd.DataFrame(compare_rows)

                # With more than one model, add a row averaging every numeric column.
                avg_df = pd.DataFrame()
                if len(providers) > 1:
                    avg_row = {"Model": "Average"}
                    for col in compare_df.columns:
                        if col != "Model":
                            avg_row[col] = round(compare_df[col].mean(), 2)
                    avg_df = pd.DataFrame([avg_row])

                # Build downloadable CSV and ZIP
                ts = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
                out_dir = tempfile.mkdtemp(prefix="financeeval_")

                # Write per-model CSVs. The label is sanitized because it may carry
                # characters such as "/" that are not valid inside a file name
                # (presumably it is derived from the user-editable model name).
                csv_paths = []
                for label, df, total in all_tables:
                    pth = os.path.join(out_dir, f"results_{_safe_filename_part(label)}_{ts}.csv")
                    df2 = df.copy()
                    df2.loc[len(df2)] = {"Metric": "TOTAL", "LLM Score (1-5)": "-", "NLP Subscore (0-1)": "-", "Fused (0-10)": total, "Comment": ""}
                    df2.to_csv(pth, index=False)
                    csv_paths.append(pth)

                # Comparison CSV
                comp_path = os.path.join(out_dir, f"comparison_{ts}.csv")
                compare_df.to_csv(comp_path, index=False)

                # JSON outputs (explicit encoding so the result does not depend on the locale)
                json_path = os.path.join(out_dir, f"judgments_{ts}.json")
                with open(json_path, "w", encoding="utf-8") as f:
                    json.dump(json_blobs, f, indent=2)

                # Make ZIP (flat archive: only base names are stored)
                zip_path = os.path.join(out_dir, f"financeeval_{ts}.zip")
                with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
                    for pth in csv_paths + [comp_path, json_path]:
                        zf.write(pth, arcname=os.path.basename(pth))

                # Stack the per-model tables, each preceded by a separator row naming the model.
                merged_tables = []
                for label, df, _total in all_tables:
                    merged_tables.append(pd.DataFrame({"Metric": [f"— {label} —"], "LLM Score (1-5)": [""], "NLP Subscore (0-1)": [""], "Fused (0-10)": [""], "Comment": [""]}))
                    merged_tables.append(df)
                merged_df = pd.concat(merged_tables, ignore_index=True)

                usage_text_all = "\n".join(token_usage_blocks)

                return merged_df, compare_df, avg_df, zip_path, usage_text_all
         | 
| 157 | 
            +
             | 
| 158 | 
            +
            # -----------------------------
         | 
| 159 | 
            +
            # UI
         | 
| 160 | 
            +
            # -----------------------------
         | 
| 161 | 
            +
            # The layout is declared at module level, so `demo` exists as soon as the
            # file is imported; components appear in the order they are created.
            with gr.Blocks(title="FinanceEval – Hybrid Judge (Gradio)") as demo:
                gr.Markdown("""
                # 🔎 FinanceEval – Hybrid Evaluation (Gradio / HF Spaces)
                Paste a finance conversation. Choose one or both judge models (OpenAI GPT‑4o, Claude 3.5 Sonnet). Adjust metric weights. Click **Evaluate**.
                """)

                # --- Input: the transcript to evaluate ---
                with gr.Row():
                    conversation = gr.Textbox(
                        label="Conversation",
                        placeholder="Paste the full transcript here...",
                        lines=16,
                    )

                # --- Which judge models to run, and under which model names ---
                with gr.Accordion("Model Selection", open=True):
                    with gr.Row():
                        use_openai = gr.Checkbox(label="Use OpenAI GPT‑4o", value=True)
                        use_anthropic = gr.Checkbox(label="Use Claude 3.5 Sonnet", value=False)
                    with gr.Row():
                        model_openai = gr.Textbox(label="OpenAI model name", value="gpt-4o")
                        model_anthropic = gr.Textbox(
                            label="Anthropic model name",
                            value="claude-3-5-sonnet-20240620",
                        )
                    gr.Markdown("**Secrets**: Set `OPENAI_API_KEY` and / or `ANTHROPIC_API_KEY` in your Space settings.")

                # --- One slider per metric, each starting at its default weight ---
                with gr.Accordion("Metric Weights (affect only the TOTAL)", open=True):
                    with gr.Row():
                        w_trust = gr.Slider(
                            minimum=0, maximum=1, step=0.01,
                            value=DEFAULT_METRIC_WEIGHTS["trust"],
                            label="Trust",
                        )
                        w_accuracy = gr.Slider(
                            minimum=0, maximum=1, step=0.01,
                            value=DEFAULT_METRIC_WEIGHTS["accuracy"],
                            label="Accuracy",
                        )
                        w_explain = gr.Slider(
                            minimum=0, maximum=1, step=0.01,
                            value=DEFAULT_METRIC_WEIGHTS["explain"],
                            label="Explainability",
                        )
                    with gr.Row():
                        w_client = gr.Slider(
                            minimum=0, maximum=1, step=0.01,
                            value=DEFAULT_METRIC_WEIGHTS["client_first"],
                            label="Client‑First",
                        )
                        w_risk = gr.Slider(
                            minimum=0, maximum=1, step=0.01,
                            value=DEFAULT_METRIC_WEIGHTS["risk_safety"],
                            label="Risk Safety",
                        )
                        w_clarity = gr.Slider(
                            minimum=0, maximum=1, step=0.01,
                            value=DEFAULT_METRIC_WEIGHTS["clarity"],
                            label="Clarity",
                        )
                    gr.Markdown("Weights are normalized to sum to 1 before computing the TOTAL.")

                run_btn = gr.Button("Evaluate")

                # --- Output tabs ---
                with gr.Tab("Per‑Model Results"):
                    table_out = gr.Dataframe(label="Metric Scores & Comments (stacked per model)")
                with gr.Tab("Comparison"):
                    compare_out = gr.Dataframe(label="Model Comparison (per metric + TOTAL)")
                    avg_out = gr.Dataframe(label="Average (if multiple models)")
                with gr.Tab("Downloads & Usage"):
                    zip_file = gr.File(label="Download ZIP (CSVs + JSON)")
                    usage_text = gr.Textbox(label="Token Usage", lines=4)

                # `inputs` follows run_eval's positional parameter order and `outputs`
                # follows the order of the 5-tuple it returns.
                run_btn.click(
                    fn=run_eval,
                    inputs=[
                        conversation,
                        use_openai,
                        use_anthropic,
                        w_trust,
                        w_accuracy,
                        w_explain,
                        w_client,
                        w_risk,
                        w_clarity,
                        model_openai,
                        model_anthropic,
                    ],
                    outputs=[
                        table_out,
                        compare_out,
                        avg_out,
                        zip_file,
                        usage_text,
                    ],
                )

            # Start the server only when the file is executed as a script.
            if __name__ == "__main__":
                demo.launch()
         | 

