Spaces:

huggingface-KREW
/

Ko-AgentBench

Running on CPU Upgrade

Ko-AgentBench / visualization.py

Harheem Kim

init

e27700b 9 days ago

7.41 kB

	from utils import get_chart_colors
	import matplotlib
	import matplotlib.pyplot as plt
	import numpy as np
	import plotly.graph_objects as go


	def setup_matplotlib():
	    """Switch matplotlib to the "Agg" backend and close all pyplot figures."""
	    backend_name = "Agg"
	    matplotlib.use(backend_name)
	    # Discard every figure pyplot is still tracking.
	    plt.close("all")


	def get_performance_chart(df, category_name="Overall"):
	    """Build a horizontal bar chart of per-model category scores.

	    Args:
	        df: DataFrame providing the "Model", "Model Type" and
	            "Category Score" columns. Each "Model Type" value is used as a
	            key into the palette returned by ``get_chart_colors()``.
	        category_name: Text appended to the chart title.

	    Returns:
	        The matplotlib figure. It has already been closed in pyplot
	        (``plt.close(fig)``) by the time it is returned, so pyplot no longer
	        tracks it.
	    """
	    # Close anything pyplot is still tracking before drawing a new chart.
	    plt.close("all")
	    colors = get_chart_colors()
	    score_column = "Category Score"
	    # Ascending sort: barh draws upward, so the best score ends up on top.
	    df_sorted = df.sort_values(score_column, ascending=True)
	    positions = np.arange(len(df_sorted))

	    # Grow the figure with the number of rows, but never below 8 inches.
	    height = max(8, len(df_sorted) * 0.8)
	    fig, ax = plt.subplots(figsize=(16, height))
	    # NOTE: this mutates the process-wide rcParams, not just this figure.
	    plt.rcParams.update({"font.size": 12})

	    try:
	        fig.patch.set_facecolor(colors["background"])
	        ax.set_facecolor(colors["background"])

	        ax.barh(
	            positions,
	            df_sorted[score_column],
	            height=0.4,
	            capstyle="round",
	            color=[colors[t] for t in df_sorted["Model Type"]],
	        )

	        ax.set_title(
	            f"Model Performance - {category_name}",
	            pad=20,
	            fontsize=20,
	            fontweight="bold",
	            color=colors["text"],
	        )
	        ax.set_xlabel(
	            "Average Score (Tool Selection Quality)",
	            fontsize=14,
	            fontweight="bold",
	            labelpad=10,
	            color=colors["text"],
	        )
	        ax.set_xlim(0.0, 1.0)

	        ax.set_yticks(positions)
	        ax.set_yticklabels(
	            df_sorted["Model"], fontsize=12, fontweight="bold", color=colors["text"]
	        )

	        # Use the figure's own method rather than pyplot's implicit
	        # "current figure", so the call cannot land on another figure.
	        fig.subplots_adjust(left=0.35)

	        # Print each score just to the right of the end of its bar.
	        for i, v in enumerate(df_sorted[score_column]):
	            ax.text(
	                v + 0.01,
	                i,
	                f"{v:.3f}",
	                va="center",
	                fontsize=12,
	                fontweight="bold",
	                color=colors["text"],
	            )

	        ax.grid(True, axis="x", linestyle="--", alpha=0.2, color=colors["grid"])
	        ax.spines[["top", "right"]].set_visible(False)
	        ax.spines[["bottom", "left"]].set_color(colors["grid"])
	        ax.tick_params(colors=colors["text"])

	        # Proxy rectangles: the bars carry no labels, so the legend needs
	        # one explicit handle per model type.
	        legend_elements = [
	            plt.Rectangle((0, 0), 1, 1, facecolor=colors[label], label=label)
	            for label in ("Private", "Open source")
	        ]
	        ax.legend(
	            handles=legend_elements,
	            title="Model Type",
	            loc="lower right",
	            fontsize=12,
	            title_fontsize=14,
	            facecolor=colors["background"],
	            labelcolor=colors["text"],
	        )

	        fig.tight_layout()
	        return fig
	    finally:
	        # Always release the figure from pyplot, on success or on error.
	        plt.close(fig)

	def create_radar_plot(df, model_names):
	    """Build a Plotly radar chart comparing models across dataset columns.

	    Args:
	        df: DataFrame with a "Model" column. Every column from positional
	            index 7 onward, except "IO Cost", becomes one radar axis.
	        model_names: Names of the models to draw. A name with no matching
	            row in ``df`` is skipped instead of raising ``IndexError``.

	    Returns:
	        A ``plotly.graph_objects.Figure`` with one ``Scatterpolar`` trace per
	        model found. The layout is applied even when no trace is added.
	    """
	    datasets = [col for col in df.columns[7:] if col != "IO Cost"]
	    fig = go.Figure()

	    fill_colors = ["rgba(99, 102, 241, 0.3)", "rgba(34, 197, 94, 0.3)"]
	    line_colors = ["#4F46E5", "#16A34A"]

	    # Repeat the first axis at the end so each polygon closes. Slicing
	    # keeps this safe when there are no dataset columns, and the list is
	    # the same for every model, so build it once.
	    closed_axes = datasets + datasets[:1]

	    for idx, model_name in enumerate(model_names):
	        matches = df[df["Model"] == model_name]
	        if matches.empty:
	            # Requested model is not in the table: nothing to draw for it.
	            continue
	        model_data = matches.iloc[0]

	        values = [model_data[m] for m in datasets]
	        values += values[:1]

	        fig.add_trace(
	            go.Scatterpolar(
	                r=values,
	                theta=closed_axes,
	                fill="toself",
	                # Cycle through the palettes when there are more models
	                # than colours.
	                fillcolor=fill_colors[idx % len(fill_colors)],
	                line=dict(color=line_colors[idx % len(line_colors)], width=2),
	                name=model_name,
	                text=[f"{val:.3f}" for val in values],
	                textposition="middle right",
	                mode="lines+markers+text",
	            )
	        )

	    fig.update_layout(
	        polar=dict(
	            radialaxis=dict(
	                visible=True, range=[0, 1], showline=False, tickfont=dict(size=12)
	            ),
	            angularaxis=dict(
	                tickfont=dict(size=13, family="Arial"),
	                rotation=90,
	                direction="clockwise",
	            ),
	            domain=dict(x=[0.1, 0.9], y=[0.1, 0.9]),
	        ),
	        showlegend=True,
	        legend=dict(
	            orientation="h",
	            yanchor="bottom",
	            y=-0.15,
	            xanchor="center",
	            x=0.5,
	            font=dict(size=14),
	        ),
	        title=dict(
	            text="Model Comparison",
	            x=0.5,
	            y=0.95,
	            font=dict(size=24, family="Arial", color="#1F2937"),
	        ),
	        paper_bgcolor="white",
	        plot_bgcolor="white",
	        height=800,
	        width=900,
	        margin=dict(t=30, b=50, l=10, r=10),
	        autosize=True,
	    )

	    return fig


	def get_performance_cost_chart(df, category_name="Overall"):
	    """Build a scatter chart of model score against I/O cost.

	    Args:
	        df: DataFrame providing the "Model", "Model Type", "IO Cost" and
	            "Category Score" columns. Each "Model Type" value is used as a
	            key into the palette returned by ``get_chart_colors()``.
	        category_name: Text appended to the chart title.

	    Returns:
	        The matplotlib figure. It has already been closed in pyplot
	        (``plt.close(fig)``) by the time it is returned, matching
	        ``get_performance_chart``, so repeated calls do not accumulate
	        figures in pyplot.

	    Note:
	        The axes are fixed: x is logarithmic from 0.08 to 1000 and y runs
	        from 60 to 100, with scores plotted as ``score * 100``. Rows outside
	        those limits fall outside the visible area.
	    """
	    colors = get_chart_colors()
	    fig, ax = plt.subplots(figsize=(12, 8), dpi=300)

	    try:
	        fig.patch.set_facecolor(colors["background"])
	        ax.set_facecolor(colors["background"])
	        ax.grid(True, linestyle="--", alpha=0.15, which="both", color=colors["grid"])

	        score_column = "Category Score"

	        # Shared label-box style; it does not depend on the row.
	        bbox_props = dict(
	            boxstyle="round,pad=0.3", fc=colors["background"], ec="none", alpha=0.8
	        )

	        for _, row in df.iterrows():
	            color = colors[row["Model Type"]]
	            # Models scoring above 0.85 get a slightly larger marker.
	            size = 100 if row[score_column] > 0.85 else 80
	            edge_color = (
	                colors["Private"]
	                if row["Model Type"] == "Private"
	                else colors["Open source"]
	            )
	            point = (row["IO Cost"], row[score_column] * 100)

	            ax.scatter(
	                point[0],
	                point[1],
	                c=color,
	                s=size,
	                alpha=0.9,
	                edgecolor=edge_color,
	                linewidth=1,
	                zorder=5,
	            )

	            ax.annotate(
	                f"{row['Model']}\n(${row['IO Cost']:.2f})",
	                point,
	                xytext=(5, 5),
	                textcoords="offset points",
	                fontsize=8,
	                fontweight="bold",
	                color=colors["text"],
	                bbox=bbox_props,
	                zorder=6,
	            )

	        ax.set_xscale("log")
	        ax.set_xlim(0.08, 1000)
	        ax.set_ylim(60, 100)

	        ax.set_xlabel(
	            "I/O Cost per Million Tokens ($)",
	            fontsize=10,
	            fontweight="bold",
	            labelpad=10,
	            color=colors["text"],
	        )
	        ax.set_ylabel(
	            "Model Performance Score",
	            fontsize=10,
	            fontweight="bold",
	            labelpad=10,
	            color=colors["text"],
	        )

	        # Empty scatters serve only as legend handles. Draw them on ``ax``
	        # directly rather than through pyplot's implicit current axes.
	        legend_elements = [
	            ax.scatter([], [], c=colors[label], label=label, s=80)
	            for label in ["Private", "Open source"]
	        ]
	        ax.legend(
	            handles=legend_elements,
	            loc="upper right",
	            frameon=True,
	            facecolor=colors["background"],
	            edgecolor="none",
	            fontsize=9,
	            labelcolor=colors["text"],
	        )

	        ax.set_title(
	            f"Performance vs. Cost - {category_name}",
	            fontsize=14,
	            pad=15,
	            fontweight="bold",
	            color=colors["text"],
	        )

	        # Shaded horizontal bands for 85-100, 75-85 and 60-75.
	        band_lows = [85, 75, 60]
	        band_highs = [100, 85, 75]
	        for y1, y2, band_color in zip(
	            band_lows, band_highs, colors["performance_bands"]
	        ):
	            ax.axhspan(y1, y2, alpha=0.2, color=band_color, zorder=1)

	        ax.tick_params(axis="both", which="major", labelsize=9, colors=colors["text"])
	        ax.tick_params(axis="both", which="minor", labelsize=8, colors=colors["text"])
	        ax.xaxis.set_minor_locator(
	            plt.LogLocator(base=10.0, subs=np.arange(2, 10) * 0.1)
	        )

	        for spine in ax.spines.values():
	            spine.set_color(colors["grid"])

	        fig.tight_layout()
	        return fig
	    finally:
	        # Always release the figure from pyplot, on success or on error;
	        # otherwise every call leaves one more open figure behind.
	        plt.close(fig)