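"""ECHOscore – a Gradio app for side-by-side prompt vs output evaluation.

Three score sources share one 15-dimension axis (config.DIMS, assumed here to
be ordered: subjective prompt dimensions at indices 0-4, OVAL at 5-9,
DeepEval at 10-14):
  * 5 subjective prompt scores entered via sliders,
  * 5 automated OVAL scores over the model output,
  * 5 automated DeepEval scores over the prompt/output pair,
plus an optional LLM-as-a-judge block. Results are rendered as three score
tables, a radar chart of per-dimension maxima, and a downloadable CSV.
"""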
import gradio as gr
import pandas as pd
import plotly.graph_objects as go
import tempfile
from config import DIMS
from OVAL import oval_scores
from DeepEval import deepeval_scores
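
# OVAL and DeepEval are external scorer modules. Their inferred contract (an
# assumption read off the calls in evaluate() below, not a documented API):
# each returns a 15-element list aligned with DIMS, with None at dimensions
# it does not score. Hypothetical stand-ins for offline testing might be:
#
#     def oval_scores(output_text):
#         return [None] * 5 + [4.0, 3.5, 2.5, 3.0, 4.5] + [None] * 5
#
#     def deepeval_scores(prompt_text, output_text):
#         return [None] * 10 + [4.0, 3.0, 2.8, 3.5, 4.2]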

def make_explanation(system: str, dimension: str, score: float) -> str:
    """Return a templated low-score explanation for a scorer/dimension pair."""
    templates = {
        # OVAL's 5 extended dimensions
        "Structural Clarity":  f"{system} scored Structural Clarity at {score}: The text structure may be unclear; consider adding headings or breaking into paragraphs.",
        "Reasoning Quality":   f"{system} scored Reasoning Quality at {score}: Argument support is weak; consider adding logical reasoning or evidence.",
        "Factuality":          f"{system} scored Factuality at {score}: Information may be inaccurate; please fact-check the facts.",
        "Depth of Analysis":   f"{system} scored Depth of Analysis at {score}: Analysis seems shallow; add more insights or examples.",
        "Topic Coverage":      f"{system} scored Topic Coverage at {score}: Key aspects may be missing; ensure you cover the full scope.",
        # DeepEval's 5 extended dimensions
        "Fluency":             f"{system} scored Fluency at {score}: Expression may be disfluent; consider smoothing sentence transitions.",
        "Prompt Relevance":    f"{system} scored Prompt Relevance at {score}: The response may stray from the prompt; ensure alignment.",
        "Conciseness":         f"{system} scored Conciseness at {score}: The response may be verbose; consider trimming redundant parts.",
        "Readability":         f"{system} scored Readability at {score}: The text is hard to read; consider simpler wording or shorter sentences.",
        "Engagement":          f"{system} scored Engagement at {score}: The response lacks engagement; add examples or a conversational tone.",
    }
    return templates.get(dimension, f"{system} scored {dimension} at {score}: Low score detected; please review this aspect.")
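
# Example (illustrative values):
#   make_explanation("OVAL", "Factuality", 2.5)
#   -> "OVAL scored Factuality at 2.5: Information may be inaccurate; please verify the stated facts."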

def evaluate(
    prompt_text: str,
    output_text: str,
    # 5 subjective prompt dimensions
    s1: float, s2: float, s3: float, s4: float, s5: float,
    # explanations for low subjective prompt scores
    e1: str, e2: str, e3: str, e4: str, e5: str,
    # LLM-as-a-judge fields
    judge_llm: str,
    ja1: float, ja2: float, ja3: float, ja4: float, ja5: float,
    judge_remark: str,
    # extra notes (slang / technical terms)
    remark: str
):
    # 1) Validation: any subjective prompt score below 3 must come with an explanation
    for score, exp, label in [
        (s1, e1, "Clarity"),
        (s2, e2, "Scope Definition"),
        (s3, e3, "Intent Alignment"),
        (s4, e4, "Bias / Induction"),
        (s5, e5, "Efficiency"),
    ]:
        if score < 3 and not exp.strip():
            raise gr.Error(f"{label} score < 3: please provide an explanation.")

    # 2) Assemble the three score vectors, each aligned to the 15 entries of DIMS
    subj = [s1, s2, s3, s4, s5] + [None]*10
    oval = oval_scores(output_text)                   # 15-element list; OVAL's dimensions sit at indices 5-9
    deep = deepeval_scores(prompt_text, output_text)  # 15-element list; DeepEval's at indices 10-14

    # 3) Auto-generate explanations for low automated scores
    auto_expls = []
    for system, scores, idxs in [
        ("OVAL",     oval, range(5,10)),
        ("DeepEval", deep, range(10,15))
    ]:
        for i in idxs:
            sc = scores[i]
            if sc is not None and sc < 3:
                auto_expls.append(make_explanation(system, DIMS[i], sc))
    auto_text = "\n".join(auto_expls) or "All automated scores ≥ 3; no issues detected."

    # 4) Build the full DataFrame and export it as CSV (including the judge columns)
    full_df = pd.DataFrame({
        "Dimension":                   DIMS,
        "Subjective (Prompt)":         subj,
        "OVAL (Output)":               oval,
        "DeepEval (Output)":           deep,
        "Judge LLM":                   [judge_llm] * len(DIMS),
        "Sensory Accuracy":            [ja1] * len(DIMS),
        "Emotional Engagement":        [ja2] * len(DIMS),
        "Flow & Naturalness":          [ja3] * len(DIMS),
        "Imagery Completeness":        [ja4] * len(DIMS),
        "Simplicity & Accessibility":  [ja5] * len(DIMS),
        "Judge Remarks":               [judge_remark] * len(DIMS),
        "Notes (Slang/Tech Terms)":    [remark] * len(DIMS),
    })
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".csv")
    tmp.close()  # release the handle so pandas can reopen the path on any platform
    full_df.to_csv(tmp.name, index=False)

    # 5) Extract the per-scorer sub-tables
    subj_df = full_df.iloc[0:5][["Dimension","Subjective (Prompt)"]]
    oval_df = full_df.iloc[5:10][["Dimension","OVAL (Output)"]]
    deep_df = full_df.iloc[10:15][["Dimension","DeepEval (Output)"]]

    # 6) Build the radar chart from the per-dimension maximum of the three score sources
    max_scores = [
        max([v for v in vals if v is not None]) if any(v is not None for v in vals) else 0
        for vals in zip(subj, oval, deep)
    ]
    closed_dims = DIMS + [DIMS[0]]
    r = max_scores + [max_scores[0]]
    fig = go.Figure(go.Scatterpolar(r=r, theta=closed_dims, fill='toself'))
    fig.update_layout(
        polar=dict(radialaxis=dict(visible=True, range=[0,5])),
        showlegend=False,
        title="Final (Max) Scores Radar"
    )

    return (
        subj_df,
        oval_df,
        deep_df,
        fig,
        tmp.name,
        remark,
        # low-score explanations for the prompt dimensions
        e1, e2, e3, e4, e5,
        # auto-generated explanations
        auto_text,
        # judge outputs
        judge_llm,
        ja1, ja2, ja3, ja4, ja5,
        judge_remark
    )

def toggle_explain(v):
    """Show the explanation textbox only while its score is below 3."""
    return gr.update(visible=(v < 3))

css = """
#submit-btn {
  background-color: orange !important;
  color: white !important;
  border: none !important;
}
#submit-btn:hover {
  background-color: darkorange !important;
}
"""

with gr.Blocks(css=css) as iface:
    gr.Markdown("# ECHOscore – Prompt vs Output Evaluation")

    prompt_in = gr.Textbox(lines=2, label="Input (Prompt)")
    output_in = gr.Textbox(lines=4, label="Output (Model Response)")

    with gr.Row():
        s1 = gr.Slider(0,5,0,step=0.1, label="Prompt – Clarity")
        s2 = gr.Slider(0,5,0,step=0.1, label="Prompt – Scope Definition")
        s3 = gr.Slider(0,5,0,step=0.1, label="Prompt – Intent Alignment")
        s4 = gr.Slider(0,5,0,step=0.1, label="Prompt – Bias / Induction")
        s5 = gr.Slider(0,5,0,step=0.1, label="Prompt – Efficiency")

    e1 = gr.Textbox(lines=2, label="Explain Clarity (<3)", visible=False)
    e2 = gr.Textbox(lines=2, label="Explain Scope Definition (<3)", visible=False)
    e3 = gr.Textbox(lines=2, label="Explain Intent Alignment (<3)", visible=False)
    e4 = gr.Textbox(lines=2, label="Explain Bias / Induction (<3)", visible=False)
    e5 = gr.Textbox(lines=2, label="Explain Efficiency (<3)", visible=False)
    
    remark = gr.Textbox(lines=2, label="Notes on internet slang & technical terms (optional)")
    
    # LLM-as-a-judge block (entirely optional)
    judge_llm    = gr.Textbox(lines=1, label="LLM-as-a-Judge (optional: name of the judge LLM)")
    ja1          = gr.Number(label="Sensory Accuracy",            value=0, precision=1, step=0.1)
    ja2          = gr.Number(label="Emotional Engagement",        value=0, precision=1, step=0.1)
    ja3          = gr.Number(label="Flow & Naturalness",          value=0, precision=1, step=0.1)
    ja4          = gr.Number(label="Imagery Completeness",        value=0, precision=1, step=0.1)
    ja5          = gr.Number(label="Simplicity & Accessibility",  value=0, precision=1, step=0.1)
    judge_remark = gr.Textbox(lines=2, label="Judge Remarks (optional)")


    s1.change(toggle_explain, s1, e1)
    s2.change(toggle_explain, s2, e2)
    s3.change(toggle_explain, s3, e3)
    s4.change(toggle_explain, s4, e4)
    s5.change(toggle_explain, s5, e5)

    submit = gr.Button("Submit", elem_id="submit-btn")
    with gr.Row():
        subj_tbl = gr.Dataframe(label="Prompt Subjective Scores")
        oval_tbl = gr.Dataframe(label="OVAL Automated Scores")
        deep_tbl = gr.Dataframe(label="DeepEval Automated Scores")

    radar          = gr.Plot(label="Final Radar Chart")
    csv_out        = gr.File(label="Export CSV")
    notes_out      = gr.Textbox(label="Notes (Slang/Tech Terms)")
    exp1_out       = gr.Textbox(label="Clarity Explanation")
    exp2_out       = gr.Textbox(label="Scope Definition Explanation")
    exp3_out       = gr.Textbox(label="Intent Alignment Explanation")
    exp4_out       = gr.Textbox(label="Bias/Induction Explanation")
    exp5_out       = gr.Textbox(label="Efficiency Explanation")
    auto_out       = gr.Textbox(label="Automatic Explanation")
    judge_llm_out  = gr.Textbox(label="Judge LLM")
    ja1_out        = gr.Number(label="Sensory Accuracy")
    ja2_out        = gr.Number(label="Emotional Engagement")
    ja3_out        = gr.Number(label="Flow & Naturalness")
    ja4_out        = gr.Number(label="Imagery Completeness")
    ja5_out        = gr.Number(label="Simplicity & Accessibility")
    judge_remarks_out = gr.Textbox(label="Judge Remarks")

    submit.click(
        evaluate,
        [
            prompt_in, output_in,
            s1, s2, s3, s4, s5,
            e1, e2, e3, e4, e5,
            judge_llm, ja1, ja2, ja3, ja4, ja5,
            judge_remark, remark
        ],
        [
            subj_tbl, oval_tbl, deep_tbl,
            radar, csv_out, notes_out,
            exp1_out, exp2_out, exp3_out, exp4_out, exp5_out,
            auto_out,
            judge_llm_out, ja1_out, ja2_out, ja3_out, ja4_out, ja5_out,
            judge_remarks_out
        ]
    )

if __name__ == "__main__":
    iface.launch()