Spaces:

Ayanami0730
/

DeepResearch-Leaderboard

Running

App Files Files Community

DeepResearch-Leaderboard / tabs /data_viewer_side_by_side_tab.py

Ayanami0730

Add DeepResearch Bench application with LFS support

927e909 10 days ago

raw

history blame contribute delete

11.1 kB

	#!/usr/bin/env python3
	# -*- coding: utf-8 -*-
	"""
	Data-Viewer Side-by-Side tab
	"""

	import gradio as gr
	import pandas as pd
	import json, random
	from pathlib import Path
	import re

	# ---------- Paths ----------
	# BASE_DIR is the parent of the directory that contains this file
	# (two levels up from the resolved path of this module).
	BASE_DIR = Path(__file__).resolve().parent.parent
	# JSON Lines file read by load_data_viewer_data(); one JSON record per line.
	DATA_VIEWER_FILE = BASE_DIR / "data" / "data_viewer.jsonl"

	# ---------- Utilities (shared with, or reusable by, data_viewer_tab.py) ----------
	def load_data_viewer_data() -> pd.DataFrame:
	    """Read ``DATA_VIEWER_FILE`` (JSON Lines) into a DataFrame.

	    Lines that are not valid JSON are skipped. If the file is missing, yields
	    no records, or lacks any of the required columns, an empty DataFrame that
	    only carries the required column names is returned instead.

	    Returns:
	        The loaded records with the ``id`` column cast to ``str``, or an
	        empty frame with the required columns.
	    """
	    required_columns = [
	        "model_name", "id", "prompt", "article", "overall_score",
	        "comprehensiveness_score", "insight_score",
	        "instruction_following_score", "readability_score",
	    ]

	    rows = []
	    if DATA_VIEWER_FILE.exists():
	        raw_text = DATA_VIEWER_FILE.read_text(encoding="utf-8")
	        for raw_line in raw_text.splitlines():
	            try:
	                parsed = json.loads(raw_line)
	            except json.JSONDecodeError:
	                # Best effort: malformed (or blank) lines are ignored.
	                continue
	            rows.append(parsed)

	    frame = pd.DataFrame(rows)
	    if frame.empty or not all(col in frame.columns for col in required_columns):
	        return pd.DataFrame(columns=required_columns)

	    # IDs are compared against strings parsed from the dropdown label.
	    frame["id"] = frame["id"].astype(str)
	    return frame

	def make_user_task_markdown(item_id, prompt):
	    """Return the Markdown for the "User Task" card.

	    Args:
	        item_id: Task identifier, interpolated as-is.
	        prompt: Task description, interpolated as-is.
	    """
	    task_markdown = f"""### User Task 🎯

	Task ID: {item_id}

	Description: {prompt}"""
	    return task_markdown

	def make_article_markdown(article: str) -> str:
	    """Return the Markdown for the "Generated Article" card.

	    For a non-empty string the text is normalised so it renders as separate
	    paragraphs and bullets:

	    1. runs of two or more newlines collapse to exactly one blank line;
	    2. pipe tables are swapped for placeholders so later steps cannot touch
	       them, and are restored verbatim at the end;
	    3. inline ``* **Label**:`` bullets are moved onto their own lines;
	    4. a blank line is inserted between adjacent non-empty lines, unless the
	       first is a heading or bullet or the second is a bullet.

	    Any other value is interpolated as-is (``None`` becomes an empty string).

	    NOTE(review): the regex literals in the reviewed copy of this file had
	    lost their ``*`` characters (the bullet patterns did not even compile).
	    The patterns below are a reconstruction; confirm them against the
	    upstream file, in particular the replacement of the second bullet rule.
	    """
	    if article and isinstance(article, str):
	        processed_article = re.sub(r'\n{2,}', '\n\n', article)

	        # Protect tables: a row starting at '|', an optional separator row
	        # and any following rows, each including its trailing newline.
	        table_pattern = r'(\|[^\n]*\n(?:[\|\s\-:]+\n)?(?:\|[^\n]*\n)*)'
	        tables = []

	        def stash_table(match):
	            tables.append(match.group(1))
	            return f'__TABLE_PLACEHOLDER_{len(tables)-1}__'

	        processed_article = re.sub(table_pattern, stash_table, processed_article)

	        # "* **Label**:" that does not directly follow a newline starts a new
	        # paragraph.
	        processed_article = re.sub(
	            r'(?<!\n)\*\s*\*\*([^*]+?)\*\*:', r'\n\n* **\1**:', processed_article
	        )
	        # Two consecutive labelled bullets are separated by one newline.
	        processed_article = re.sub(
	            r'\*\s*\*\*([^*]+?)\*\*:\s*([^*]*?)\s*\*\s*\*\*',
	            r'* **\1**: \2\n* **',
	            processed_article,
	        )
	        # A bullet glued to a "[12...]" citation starts a new paragraph.
	        # NOTE(review): the citation text itself is consumed by this rule;
	        # confirm that dropping it is intended.
	        processed_article = re.sub(
	            r'(?<!\n)\[\d+[^\]]*\]\*\s*\*\*', r'\n\n* **', processed_article
	        )

	        lines = processed_article.split('\n')
	        last_index = len(lines) - 1
	        spaced_lines = []
	        for index, line in enumerate(lines):
	            spaced_lines.append(line)
	            if index == last_index:
	                continue
	            current = line.strip()
	            following = lines[index + 1].strip()
	            # Separate plain adjacent lines so Markdown treats them as
	            # distinct paragraphs; bullets and headings are left alone.
	            if (current and following
	                    and not current.startswith(('*', '#'))
	                    and not following.startswith('*')):
	                spaced_lines.append('')
	        processed_article = '\n'.join(spaced_lines)

	        for index, table in enumerate(tables):
	            processed_article = processed_article.replace(
	                f'__TABLE_PLACEHOLDER_{index}__', table
	            )
	    else:
	        processed_article = article if article is not None else ""
	    return f"""### Generated Article 📖

	{processed_article}"""

	def make_scores_html(overall, comprehensiveness, insight, instruction, readability):
	    """Return an HTML card showing the five scores side by side.

	    Each argument is rendered as-is under its title. A missing score, meaning
	    ``None`` or a float NaN (which is what pandas yields for an absent
	    value), is shown as ``N/A``.
	    """
	    def display_value(score):
	        # NaN is the only float that is not equal to itself.
	        if score is None or (isinstance(score, float) and score != score):
	            return "N/A"
	        return score

	    scores_data = [
	        ("Overall<br>Score", overall),
	        ("Comprehen-<br>siveness", comprehensiveness),
	        ("Insight<br>Score", insight),
	        ("Instruction<br>Following", instruction),
	        ("Readability<br>Score", readability),
	    ]
	    html_items_str = "".join(
	        f"""
	<div style="text-align: center; padding: 10px 3px; flex-grow: 1; flex-basis: 19%; min-width: 0;">
	<h4 style="margin: 0 0 5px 0; font-size: 1em; color: #4a4a4a; font-weight: 600; line-height: 1.2;">{title}</h4>
	<p style="margin: 0; font-size: 1.1em; font-weight: bold; color: #333;">{display_value(score)}</p>
	</div>
	"""
	        for title, score in scores_data
	    )
	    return f"""
	<div style="background:#fff; border:1px solid #e0e0e0; border-radius:8px; padding: 15px 10px; margin:18px 0; box-shadow:0 2px 4px rgba(0,0,0,.06);">
	<div style="display: flex; justify-content: space-around; align-items: stretch;">
	{html_items_str}
	</div>
	</div>"""

	# ---------- Build the tab ----------
	def create_data_viewer_side_by_side_tab():
	    """Build the "Side-by-Side Viewer" Gradio tab.

	    The tab shows one task selector, the task description, and two columns
	    (model A / model B), each with a model selector, the generated article
	    and its score card. If no data can be loaded, only a warning message is
	    rendered. Changing any of the three dropdowns refreshes all five
	    display components.
	    """
	    with gr.Tab("⚔️Side-by-Side Viewer"):
	        gr.HTML(
	            """<style>
	.card{background:#fff;border:1px solid #e0e0e0;border-radius:8px;padding:22px 24px;margin:18px 0;box-shadow:0 2px 4px rgba(0,0,0,.06);}
	.scrollable-sm{max-height:180px;overflow-y:auto;} /* 稍微减小任务区高度 */
	.scrollable-lg{max-height:550px;overflow-y:auto;} /* 调整文章区高度 */
	.card p{color:#424242 !important;line-height:1.75;margin:0 0 14px 0;text-align:justify;}
	.card ul,.card ol{margin:12px 0 12px 24px;color:#424242 !important;}
	.card li{margin:4px 0;color:#424242 !important;}
	.card blockquote{border-left:4px solid #3498db;margin:18px 0;padding:14px 18px;background:#f8f9fa;font-style:italic;color:#555 !important;}
	.card pre{background:#f8f8f8;color:#333 !important;padding:18px;border-radius:6px;overflow-x:auto;border:1px solid #e0e0e0;}
	.card strong,.card b{font-weight:700 !important;}
	.card::-webkit-scrollbar{width:10px}
	.card::-webkit-scrollbar-track{background:#f5f5f5;border-radius:5px}
	.card::-webkit-scrollbar-thumb{background:#c0c0c0;border-radius:5px}
	.card::-webkit-scrollbar-thumb:hover{background:#a0a0a0}
	</style>"""
	        )

	        df = load_data_viewer_data()
	        if df.empty:
	            gr.Markdown("## ⚠️ 没有可用数据 \n请确认 `data/data_viewer.jsonl` 存在且字段齐全(包括所有分数)。")
	            return

	        all_models = sorted(df["model_name"].unique())

	        # One dropdown entry per distinct (id, prompt), ordered by numeric id.
	        unique_tasks = df[["id", "prompt"]].drop_duplicates()
	        unique_tasks = unique_tasks.assign(id_num=unique_tasks["id"].astype(int))
	        tasks_df = unique_tasks.sort_values("id_num")
	        task_choices = []
	        for task_id, task_prompt in zip(tasks_df["id"], tasks_df["prompt"]):
	            preview = task_prompt[:60] + ('…' if len(task_prompt) > 60 else '')
	            task_choices.append(f"{task_id}. {preview}")

	        # Random initial selection; model B differs from model A when possible.
	        init_task = random.choice(task_choices) if task_choices else None
	        init_model_a = random.choice(all_models) if all_models else None
	        if len(all_models) > 1:
	            init_model_b = random.choice([m for m in all_models if m != init_model_a])
	        else:
	            init_model_b = None
	        if init_model_b is None and len(all_models) > 0:
	            # Fallback for the single-model case.
	            init_model_b = all_models[0]

	        # --- UI components ---
	        with gr.Row():
	            task_dd = gr.Dropdown(label="Select Task", choices=task_choices, value=init_task, interactive=True)

	        # Single shared display of the task description.
	        user_task_display_md = gr.Markdown(elem_classes=["card", "scrollable-sm"])

	        with gr.Row():
	            with gr.Column(scale=1):
	                model_a_dd = gr.Dropdown(label="Select Model A", choices=all_models, value=init_model_a, interactive=True)
	                article_a_md = gr.Markdown(elem_classes=["card", "scrollable-lg"])
	                scores_a_html = gr.HTML()
	            with gr.Column(scale=1):
	                model_b_dd = gr.Dropdown(label="Select Model B", choices=all_models, value=init_model_b, interactive=True)
	                article_b_md = gr.Markdown(elem_classes=["card", "scrollable-lg"])
	                scores_b_html = gr.HTML()

	        # --- Callbacks ---
	        def render_model_side(model_name, item_id, missing_message):
	            """Return (article markdown, scores html) for one model column."""
	            if model_name:
	                match = df[(df["model_name"] == model_name) & (df["id"] == item_id)]
	                if not match.empty:
	                    article_md = make_article_markdown(match["article"].iloc[0])
	                    scores_html = make_scores_html(
	                        match["overall_score"].iloc[0],
	                        match["comprehensiveness_score"].iloc[0],
	                        match["insight_score"].iloc[0],
	                        match["instruction_following_score"].iloc[0],
	                        match["readability_score"].iloc[0],
	                    )
	                    return article_md, scores_html
	            return (
	                make_article_markdown(missing_message),
	                make_scores_html(None, None, None, None, None),
	            )

	        def fetch_side_by_side_data(selected_task_display, model_a_name, model_b_name):
	            """Return the five display values for the current selection."""
	            if not selected_task_display:
	                blank_article = make_article_markdown("")
	                blank_scores = make_scores_html(None, None, None, None, None)
	                task_md = make_user_task_markdown("--", "请选择一个任务。")
	                return task_md, blank_article, blank_scores, blank_article, blank_scores

	            # Dropdown labels look like "<id>. <prompt preview>".
	            item_id_str = selected_task_display.split(".", 1)[0].strip()
	            task_entry = df[df["id"] == item_id_str]
	            if task_entry.empty:
	                prompt_text = "任务描述未找到。"
	            else:
	                prompt_text = task_entry["prompt"].iloc[0]
	            user_task_md_content = make_user_task_markdown(item_id_str, prompt_text)

	            article_a, scores_a = render_model_side(model_a_name, item_id_str, "模型A未选择或数据未找到")
	            article_b, scores_b = render_model_side(model_b_name, item_id_str, "模型B未选择或数据未找到")
	            return user_task_md_content, article_a, scores_a, article_b, scores_b

	        # --- Initial load and event binding ---
	        display_components = [user_task_display_md, article_a_md, scores_a_html, article_b_md, scores_b_html]
	        selectors = [task_dd, model_a_dd, model_b_dd]

	        if init_task:
	            initial_values = fetch_side_by_side_data(init_task, init_model_a, init_model_b)
	        else:
	            initial_values = (
	                make_user_task_markdown("--", "请选择一个任务进行比较。"),
	                make_article_markdown(""),
	                make_scores_html(None, None, None, None, None),
	                make_article_markdown(""),
	                make_scores_html(None, None, None, None, None),
	            )
	        for component, initial_value in zip(display_components, initial_values):
	            component.value = initial_value

	        # Every selector refreshes the whole view with the same callback.
	        for selector in selectors:
	            selector.change(
	                fetch_side_by_side_data,
	                inputs=list(selectors),
	                outputs=list(display_components),
	            )