# polish_eq-bench / app2.py
import re

import gradio as gr
import numpy
import pandas as pd

from src.display.css_html_js import custom_css
from src.about import (
    INTRODUCTION_TEXT,
    TITLE,
)
demo = gr.Blocks(css=custom_css)
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
    # Load the leaderboard by hand instead of pd.read_csv: the trailing
    # "Error" column may itself contain commas, so each row is split into
    # at most 14 fields and the rest of the line is kept intact.
    leaderboard_rows = []
    with open("benchmark_results.csv", "r") as f:
        header = [h.strip() for h in f.readline().strip().split(",")]
        for line in f:
            leaderboard_rows.append(line.strip().split(",", 13))
    leaderboard_df = pd.DataFrame(leaderboard_rows, columns=header)
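    # A minimal sketch of the assumed benchmark_results.csv layout (column
    # names taken from the code below, example values hypothetical). "Error"
    # comes last, so the split(",", 13) above leaves any commas inside the
    # error message untouched:
    #
    #   Model Path,Benchmark Version,Benchmark Score,Num Questions Parseable,...,Error
    #   org/model-7b,eq-bench_v2_pl,58.31,171.0,...,
    #   org/model-2b,eq-bench_pl,FAILED,FAILED,...,60.0 questions were parseable, too few for a score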
    # Keep only the Polish EQ-Bench runs. The two conditions are combined
    # with the element-wise | operator; plain `or` raises "ValueError: The
    # truth value of a Series is ambiguous".
    leaderboard_df = leaderboard_df[
        (leaderboard_df["Benchmark Version"] == "eq-bench_v2_pl")
        | (leaderboard_df["Benchmark Version"] == "eq-bench_pl")
    ]
    # Keep only the columns shown on the leaderboard.
    leaderboard_df = leaderboard_df[
        ["Model Path", "Benchmark Score", "Num Questions Parseable", "Error"]
    ]
    def parse_parseable(x):
        """Recover the parseable-question count from the Error message when
        the "Num Questions Parseable" field itself reads FAILED."""
        if x["Num Questions Parseable"] == "FAILED":
            m = re.match(r"(\d+)\.0 questions were parseable", x["Error"])
            return m.group(1) if m else numpy.nan
        return x["Num Questions Parseable"]

    leaderboard_df["Num Questions Parseable"] = leaderboard_df[
        ["Num Questions Parseable", "Error"]
    ].apply(parse_parseable, axis=1)
    # Turn FAILED scores into NaN so both columns can be cast to float.
    leaderboard_df["Benchmark Score"] = leaderboard_df["Benchmark Score"].replace("FAILED", numpy.nan)
    leaderboard_df["Benchmark Score"] = leaderboard_df["Benchmark Score"].astype(float)
    leaderboard_df["Num Questions Parseable"] = leaderboard_df["Num Questions Parseable"].astype(float)
    # Floor negative scores at 0.
    leaderboard_df.loc[leaderboard_df["Benchmark Score"] < 0, "Benchmark Score"] = 0
    # Sort best-first by score, breaking ties on parseable-question count.
    leaderboard_df = leaderboard_df.sort_values(
        by=["Benchmark Score", "Num Questions Parseable"], ascending=[False, False]
    )
    # Color-code cells and fix the display precision of the numeric columns.
    leaderboard_df_styled = leaderboard_df.style.background_gradient(cmap="RdYlGn")
    leaderboard_df_styled = leaderboard_df_styled.format({
        "Benchmark Score": "{:.2f}",
        "Num Questions Parseable": "{:.0f}",
    })
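    # Note: Styler.format only changes how values are rendered; the underlying
    # columns stay float, so the sort above and the background gradient still
    # operate on the raw numbers.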
    leaderboard_table = gr.components.Dataframe(
        value=leaderboard_df_styled,
        elem_id="leaderboard-table",
        interactive=False,
        visible=True,
    )
demo.queue(default_concurrency_limit=40).launch()