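"""ECHOscore: a Gradio app that scores a prompt/response pair on subjective, OVAL, and DeepEval dimensions."""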
import gradio as gr
import pandas as pd
import plotly.graph_objects as go
import tempfile
from config import DIMS
from OVAL import oval_scores
from DeepEval import deepeval_scores
def make_explanation(system: str, dimension: str, score: float) -> str:
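    """Return a templated explanation for a low automated score on one dimension."""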
templates = {
        # Five OVAL extended dimensions (output-side)
"Structural Clarity": f"{system} scored Structural Clarity at {score}: The text structure may be unclear; consider adding headings or breaking into paragraphs.",
"Reasoning Quality": f"{system} scored Reasoning Quality at {score}: Argument support is weak; consider adding logical reasoning or evidence.",
"Factuality": f"{system} scored Factuality at {score}: Information may be inaccurate; please fact-check the facts.",
"Depth of Analysis": f"{system} scored Depth of Analysis at {score}: Analysis seems shallow; add more insights or examples.",
"Topic Coverage": f"{system} scored Topic Coverage at {score}: Key aspects may be missing; ensure you cover the full scope.",
        # Five DeepEval extended dimensions (output-side)
"Fluency": f"{system} scored Fluency at {score}: Expression may be disfluent; consider smoothing sentence transitions.",
"Prompt Relevance": f"{system} scored Prompt Relevance at {score}: The response may stray from the prompt; ensure alignment.",
"Conciseness": f"{system} scored Conciseness at {score}: The response may be verbose; consider trimming redundant parts.",
"Readability": f"{system} scored Readability at {score}: The text is hard to read; consider simpler wording or shorter sentences.",
"Engagement": f"{system} scored Engagement at {score}: The response lacks engagement; add examples or a conversational tone.",
}
return templates.get(dimension, f"{system} scored {dimension} at {score}: Low score detected; please review this aspect.")
def evaluate(
prompt_text: str,
output_text: str,
    # Five subjective prompt scores
    s1: float, s2: float, s3: float, s4: float, s5: float,
    # Explanations required when a subjective score is low
    e1: str, e2: str, e3: str, e4: str, e5: str,
    # Optional LLM-as-a-judge inputs
    judge_llm: str,
    ja1: float, ja2: float, ja3: float, ja4: float, ja5: float,
    judge_remark: str,
    # Free-form notes (slang / technical terms)
    remark: str
):
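    """Validate explanations, compute all scores, and build tables, radar chart, and CSV export."""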
    # 1) Require an explanation whenever a subjective prompt score is below 3
for score, exp, label in [
(s1, e1, "Clarity"),
(s2, e2, "Scope Definition"),
(s3, e3, "Intent Alignment"),
(s4, e4, "Bias / Induction"),
(s5, e5, "Efficiency"),
]:
if score < 3 and not exp.strip():
raise gr.Error(f"{label} score < 3: please provide an explanation.")
    # 2) Assemble the three score series, each aligned with the 15 DIMS entries
    subj = [s1, s2, s3, s4, s5] + [None] * 10
    oval = oval_scores(output_text)                     # 15-element list
    deep = deepeval_scores(prompt_text, output_text)    # 15-element list
    # 3) Auto-generate explanations for low automated scores
auto_expls = []
for system, scores, idxs in [
("OVAL", oval, range(5,10)),
("DeepEval", deep, range(10,15))
]:
for i in idxs:
sc = scores[i]
if sc is not None and sc < 3:
auto_expls.append(make_explanation(system, DIMS[i], sc))
auto_text = "\n".join(auto_expls) or "All automated scores ≥ 3; no issues detected."
    # 4) Build the full DataFrame and export it as CSV (including the judge columns)
full_df = pd.DataFrame({
"Dimension": DIMS,
"Subjective (Prompt)": subj,
"OVAL (Output)": oval,
"DeepEval (Output)": deep,
"Judge LLM": [judge_llm] * len(DIMS),
"Sensory Accuracy": [ja1] * len(DIMS),
"Emotional Engagement": [ja2] * len(DIMS),
"Flow & Naturalness": [ja3] * len(DIMS),
"Imagery Completeness": [ja4] * len(DIMS),
"Simplicity & Accessibility": [ja5] * len(DIMS),
"Judge Remarks": [judge_remark] * len(DIMS),
"Notes (Slang/Tech Terms)": [remark] * len(DIMS),
})
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".csv")
    tmp.close()  # close the handle so pandas can safely reopen the path
    full_df.to_csv(tmp.name, index=False)
    # 5) Slice out the three per-system sub-tables
subj_df = full_df.iloc[0:5][["Dimension","Subjective (Prompt)"]]
oval_df = full_df.iloc[5:10][["Dimension","OVAL (Output)"]]
deep_df = full_df.iloc[10:15][["Dimension","DeepEval (Output)"]]
    # 6) Radar chart of the per-dimension maximum across the three score series
max_scores = [
max([v for v in vals if v is not None]) if any(v is not None for v in vals) else 0
for vals in zip(subj, oval, deep)
]
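    # Repeat the first point so the radar polygon closes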
closed_dims = DIMS + [DIMS[0]]
r = max_scores + [max_scores[0]]
fig = go.Figure(go.Scatterpolar(r=r, theta=closed_dims, fill='toself'))
fig.update_layout(
polar=dict(radialaxis=dict(visible=True, range=[0,5])),
showlegend=False,
title="Final (Max) Scores Radar"
)
return (
subj_df,
oval_df,
deep_df,
fig,
tmp.name,
remark,
        # Subjective low-score explanations
        e1, e2, e3, e4, e5,
        # Auto-generated explanations
        auto_text,
        # Judge outputs
judge_llm,
ja1, ja2, ja3, ja4, ja5,
judge_remark
)
def toggle_explain(v):
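    """Show the explanation textbox only while its paired score is below 3."""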
return gr.update(visible=(v<3))
css = """
#submit-btn {
background-color: orange !important;
color: white !important;
border: none !important;
}
#submit-btn:hover {
background-color: darkorange !important;
}
"""
with gr.Blocks(css=css) as iface:
gr.Markdown("# ECHOscore – Prompt vs Output Evaluation")
prompt_in = gr.Textbox(lines=2, label="Input (Prompt)")
output_in = gr.Textbox(lines=4, label="Output (Model Response)")
with gr.Row():
s1 = gr.Slider(0,5,0,step=0.1, label="Prompt – Clarity")
s2 = gr.Slider(0,5,0,step=0.1, label="Prompt – Scope Definition")
s3 = gr.Slider(0,5,0,step=0.1, label="Prompt – Intent Alignment")
s4 = gr.Slider(0,5,0,step=0.1, label="Prompt – Bias / Induction")
s5 = gr.Slider(0,5,0,step=0.1, label="Prompt – Efficiency")
e1 = gr.Textbox(lines=2, label="Explain Clarity (<3)", visible=False)
e2 = gr.Textbox(lines=2, label="Explain Scope Definition (<3)", visible=False)
e3 = gr.Textbox(lines=2, label="Explain Intent Alignment (<3)", visible=False)
e4 = gr.Textbox(lines=2, label="Explain Bias / Induction (<3)", visible=False)
e5 = gr.Textbox(lines=2, label="Explain Efficiency (<3)", visible=False)
remark = gr.Textbox(lines=2, label="Internet slang & technical terms notes (optional)")
    # LLM-as-a-judge section (entirely optional)
    judge_llm = gr.Textbox(lines=1, label="LLM-as-a-Judge (optional: name of the judge model)")
    ja1 = gr.Number(label="Sensory Accuracy", value=0, precision=1, minimum=0, maximum=5, step=0.1)
    ja2 = gr.Number(label="Emotional Engagement", value=0, precision=1, minimum=0, maximum=5, step=0.1)
    ja3 = gr.Number(label="Flow & Naturalness", value=0, precision=1, minimum=0, maximum=5, step=0.1)
    ja4 = gr.Number(label="Imagery Completeness", value=0, precision=1, minimum=0, maximum=5, step=0.1)
    ja5 = gr.Number(label="Simplicity & Accessibility", value=0, precision=1, minimum=0, maximum=5, step=0.1)
judge_remark = gr.Textbox(lines=2, label="Judge Remarks (optional)")
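    # Reveal each explanation box as soon as its slider drops below 3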
s1.change(toggle_explain, s1, e1)
s2.change(toggle_explain, s2, e2)
s3.change(toggle_explain, s3, e3)
s4.change(toggle_explain, s4, e4)
s5.change(toggle_explain, s5, e5)
submit = gr.Button("Submit", elem_id="submit-btn")
with gr.Row():
subj_tbl = gr.Dataframe(label="Prompt Subjective Scores")
oval_tbl = gr.Dataframe(label="OVAL Automated Scores")
deep_tbl = gr.Dataframe(label="DeepEval Automated Scores")
radar = gr.Plot(label="Final Radar Chart")
csv_out = gr.File(label="Export CSV")
notes_out = gr.Textbox(label="Notes (Slang/Tech Terms)")
exp1_out = gr.Textbox(label="Clarity Explanation")
exp2_out = gr.Textbox(label="Scope Definition Explanation")
exp3_out = gr.Textbox(label="Intent Alignment Explanation")
exp4_out = gr.Textbox(label="Bias/Induction Explanation")
exp5_out = gr.Textbox(label="Efficiency Explanation")
auto_out = gr.Textbox(label="Automatic Explanation")
judge_llm_out = gr.Textbox(label="Judge LLM")
ja1_out = gr.Number(label="Sensory Accuracy")
ja2_out = gr.Number(label="Emotional Engagement")
ja3_out = gr.Number(label="Flow & Naturalness")
ja4_out = gr.Number(label="Imagery Completeness")
ja5_out = gr.Number(label="Simplicity & Accessibility")
judge_remarks_out = gr.Textbox(label="Judge Remarks")
submit.click(
evaluate,
[
prompt_in, output_in,
s1, s2, s3, s4, s5,
e1, e2, e3, e4, e5,
judge_llm, ja1, ja2, ja3, ja4, ja5,
judge_remark, remark
],
[
subj_tbl, oval_tbl, deep_tbl,
radar, csv_out, notes_out,
exp1_out, exp2_out, exp3_out, exp4_out, exp5_out,
auto_out,
judge_llm_out, ja1_out, ja2_out, ja3_out, ja4_out, ja5_out,
judge_remarks_out
]
)
if __name__ == "__main__":
    iface.launch()