# ECHOscore / app.py
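"""Gradio app for ECHOscore: side-by-side prompt/output evaluation.

The interface combines manual prompt ratings, OVAL and DeepEval automated
output scores, and an optional LLM-as-a-judge panel, then returns score
tables, a radar chart of per-dimension maxima, and a CSV export.
"""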
import gradio as gr
import pandas as pd
import plotly.graph_objects as go
import tempfile

from config import DIMS
from OVAL import oval_scores
from DeepEval import deepeval_scores
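# OVAL.oval_scores and DeepEval.deepeval_scores come from sibling modules that are
# not shown here. From the way their results are consumed below, both are assumed to
# return 15-element lists aligned with DIMS (which is assumed to hold the 15 dimension
# labels: 5 prompt-subjective, 5 OVAL, 5 DeepEval), with None in slots they do not score.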
def make_explanation(system: str, dimension: str, score: float) -> str:
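    """Return a templated explanation for a low automated score on one dimension."""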
    templates = {
        # OVAL's five extended dimensions
        "Structural Clarity": f"{system} scored Structural Clarity at {score}: The text structure may be unclear; consider adding headings or breaking into paragraphs.",
        "Reasoning Quality": f"{system} scored Reasoning Quality at {score}: Argument support is weak; consider adding logical reasoning or evidence.",
        "Factuality": f"{system} scored Factuality at {score}: Information may be inaccurate; please verify the claims.",
        "Depth of Analysis": f"{system} scored Depth of Analysis at {score}: Analysis seems shallow; add more insights or examples.",
        "Topic Coverage": f"{system} scored Topic Coverage at {score}: Key aspects may be missing; ensure you cover the full scope.",
        # DeepEval's five extended dimensions
        "Fluency": f"{system} scored Fluency at {score}: Expression may be disfluent; consider smoothing sentence transitions.",
        "Prompt Relevance": f"{system} scored Prompt Relevance at {score}: The response may stray from the prompt; ensure alignment.",
        "Conciseness": f"{system} scored Conciseness at {score}: The response may be verbose; consider trimming redundant parts.",
        "Readability": f"{system} scored Readability at {score}: The text is hard to read; consider simpler wording or shorter sentences.",
        "Engagement": f"{system} scored Engagement at {score}: The response lacks engagement; add examples or a conversational tone.",
    }
    return templates.get(dimension, f"{system} scored {dimension} at {score}: Low score detected; please review this aspect.")

def evaluate(
    prompt_text: str,
    output_text: str,
    # five subjective prompt dimensions
    s1: float, s2: float, s3: float, s4: float, s5: float,
    # explanations for the subjective prompt scores
    e1: str, e2: str, e3: str, e4: str, e5: str,
    # judge module
    judge_llm: str,
    ja1: float, ja2: float, ja3: float, ja4: float, ja5: float,
    judge_remark: str,
    # additional notes
    remark: str
):
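    """Run one full ECHOscore evaluation.

    Validates that low subjective prompt scores carry an explanation, collects
    OVAL and DeepEval scores for the output, auto-explains any automated score
    below 3, and builds the score tables, CSV export, and radar chart of
    per-dimension maxima that populate the UI.
    """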
    # 1) low subjective prompt scores must come with an explanation
    for score, exp, label in [
        (s1, e1, "Clarity"),
        (s2, e2, "Scope Definition"),
        (s3, e3, "Intent Alignment"),
        (s4, e4, "Bias / Induction"),
        (s5, e5, "Efficiency"),
    ]:
        if score < 3 and not exp.strip():
            raise gr.Error(f"{label} score < 3: please provide an explanation.")

    # 2) assemble the three score series
    subj = [s1, s2, s3, s4, s5] + [None] * 10
    oval = oval_scores(output_text)                    # 15-element list
    deep = deepeval_scores(prompt_text, output_text)   # 15-element list

    # 3) auto-generated explanations for low automated scores
    auto_expls = []
    for system, scores, idxs in [
        ("OVAL", oval, range(5, 10)),
        ("DeepEval", deep, range(10, 15))
    ]:
        for i in idxs:
            sc = scores[i]
            if sc is not None and sc < 3:
                auto_expls.append(make_explanation(system, DIMS[i], sc))
    auto_text = "\n".join(auto_expls) or "All automated scores ≥ 3; no issues detected."
    # 4) build the full DataFrame and export it as CSV (judge info repeated on every row)
    full_df = pd.DataFrame({
        "Dimension": DIMS,
        "Subjective (Prompt)": subj,
        "OVAL (Output)": oval,
        "DeepEval (Output)": deep,
        "Judge LLM": [judge_llm] * len(DIMS),
        "Sensory Accuracy": [ja1] * len(DIMS),
        "Emotional Engagement": [ja2] * len(DIMS),
        "Flow & Naturalness": [ja3] * len(DIMS),
        "Imagery Completeness": [ja4] * len(DIMS),
        "Simplicity & Accessibility": [ja5] * len(DIMS),
        "Judge Remarks": [judge_remark] * len(DIMS),
        "Notes (Slang/Tech Terms)": [remark] * len(DIMS),
    })
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".csv")
    tmp.close()  # close the handle so to_csv can reopen the path safely
    full_df.to_csv(tmp.name, index=False)

    # 5) slice out the per-system sub-tables
    subj_df = full_df.iloc[0:5][["Dimension", "Subjective (Prompt)"]]
    oval_df = full_df.iloc[5:10][["Dimension", "OVAL (Output)"]]
    deep_df = full_df.iloc[10:15][["Dimension", "DeepEval (Output)"]]
    # 6) radar chart of the per-dimension maximum across the three score series
    max_scores = [
        max([v for v in vals if v is not None]) if any(v is not None for v in vals) else 0
        for vals in zip(subj, oval, deep)
    ]
    closed_dims = DIMS + [DIMS[0]]
    r = max_scores + [max_scores[0]]
    fig = go.Figure(go.Scatterpolar(r=r, theta=closed_dims, fill='toself'))
    fig.update_layout(
        polar=dict(radialaxis=dict(visible=True, range=[0, 5])),
        showlegend=False,
        title="Final (Max) Scores Radar"
    )

    return (
        subj_df,
        oval_df,
        deep_df,
        fig,
        tmp.name,
        remark,
        # explanations for low prompt scores
        e1, e2, e3, e4, e5,
        # auto-generated explanations
        auto_text,
        # judge outputs
        judge_llm,
        ja1, ja2, ja3, ja4, ja5,
        judge_remark
    )

def toggle_explain(v):
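    """Show the paired explanation textbox only when the slider value drops below 3."""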
    return gr.update(visible=(v < 3))

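# inline CSS for the orange submit button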
css = """
#submit-btn {
background-color: orange !important;
color: white !important;
border: none !important;
}
#submit-btn:hover {
background-color: darkorange !important;
}
"""
with gr.Blocks(css=css) as iface:
    gr.Markdown("# ECHOscore – Prompt vs Output Evaluation")
    prompt_in = gr.Textbox(lines=2, label="Input (Prompt)")
    output_in = gr.Textbox(lines=4, label="Output (Model Response)")
    with gr.Row():
        s1 = gr.Slider(0, 5, 0, step=0.1, label="Prompt – Clarity")
        s2 = gr.Slider(0, 5, 0, step=0.1, label="Prompt – Scope Definition")
        s3 = gr.Slider(0, 5, 0, step=0.1, label="Prompt – Intent Alignment")
        s4 = gr.Slider(0, 5, 0, step=0.1, label="Prompt – Bias / Induction")
        s5 = gr.Slider(0, 5, 0, step=0.1, label="Prompt – Efficiency")
    e1 = gr.Textbox(lines=2, label="Explain Clarity (<3)", visible=False)
    e2 = gr.Textbox(lines=2, label="Explain Scope Definition (<3)", visible=False)
    e3 = gr.Textbox(lines=2, label="Explain Intent Alignment (<3)", visible=False)
    e4 = gr.Textbox(lines=2, label="Explain Bias / Induction (<3)", visible=False)
    e5 = gr.Textbox(lines=2, label="Explain Efficiency (<3)", visible=False)
    remark = gr.Textbox(lines=2, label="Notes on internet slang & technical terms (optional)")
    # LLM-as-a-judge module (entirely optional)
    judge_llm = gr.Textbox(lines=1, label="LLM-as-a-Judge (optional: name of the judge LLM)")
    ja1 = gr.Number(label="Sensory Accuracy", value=0, precision=1, step=0.1)
    ja2 = gr.Number(label="Emotional Engagement", value=0, precision=1, step=0.1)
    ja3 = gr.Number(label="Flow & Naturalness", value=0, precision=1, step=0.1)
    ja4 = gr.Number(label="Imagery Completeness", value=0, precision=1, step=0.1)
    ja5 = gr.Number(label="Simplicity & Accessibility", value=0, precision=1, step=0.1)
    judge_remark = gr.Textbox(lines=2, label="Judge Remarks (optional)")
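    # reveal the matching explanation box whenever a prompt score drops below 3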
    s1.change(toggle_explain, s1, e1)
    s2.change(toggle_explain, s2, e2)
    s3.change(toggle_explain, s3, e3)
    s4.change(toggle_explain, s4, e4)
    s5.change(toggle_explain, s5, e5)

    submit = gr.Button("Submit", elem_id="submit-btn")
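    # output components, in the same order as the tuple returned by evaluate()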
    with gr.Row():
        subj_tbl = gr.Dataframe(label="Prompt Subjective Scores")
        oval_tbl = gr.Dataframe(label="OVAL Automated Scores")
        deep_tbl = gr.Dataframe(label="DeepEval Automated Scores")
    radar = gr.Plot(label="Final Radar Chart")
    csv_out = gr.File(label="Export CSV")
    notes_out = gr.Textbox(label="Notes (Slang/Tech Terms)")
    exp1_out = gr.Textbox(label="Clarity Explanation")
    exp2_out = gr.Textbox(label="Scope Definition Explanation")
    exp3_out = gr.Textbox(label="Intent Alignment Explanation")
    exp4_out = gr.Textbox(label="Bias/Induction Explanation")
    exp5_out = gr.Textbox(label="Efficiency Explanation")
    auto_out = gr.Textbox(label="Automatic Explanation")
    judge_llm_out = gr.Textbox(label="Judge LLM")
    ja1_out = gr.Number(label="Sensory Accuracy")
    ja2_out = gr.Number(label="Emotional Engagement")
    ja3_out = gr.Number(label="Flow & Naturalness")
    ja4_out = gr.Number(label="Imagery Completeness")
    ja5_out = gr.Number(label="Simplicity & Accessibility")
    judge_remarks_out = gr.Textbox(label="Judge Remarks")

    submit.click(
        evaluate,
        [
            prompt_in, output_in,
            s1, s2, s3, s4, s5,
            e1, e2, e3, e4, e5,
            judge_llm, ja1, ja2, ja3, ja4, ja5,
            judge_remark, remark
        ],
        [
            subj_tbl, oval_tbl, deep_tbl,
            radar, csv_out, notes_out,
            exp1_out, exp2_out, exp3_out, exp4_out, exp5_out,
            auto_out,
            judge_llm_out, ja1_out, ja2_out, ja3_out, ja4_out, ja5_out,
            judge_remarks_out
        ]
    )
if __name__ == "__main__":
    iface.launch()