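"""Gradio app for exploring Turkish MMLU benchmark results: a model
leaderboard, per-model responses, section-level results, and side-by-side
model comparison."""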
import gradio as gr
import pandas as pd
import matplotlib.pyplot as plt
# Load datasets
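# NOTE: hf:// paths stream parquet files directly from the Hugging Face Hub;
# this requires the huggingface_hub and fsspec packages alongside pandas.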
leaderboard_data = pd.read_parquet(
"hf://datasets/alibayram/yapay_zeka_turkce_mmlu_liderlik_tablosu/data/train-00000-of-00001.parquet"
)
model_responses_data = pd.read_parquet(
"hf://datasets/alibayram/yapay_zeka_turkce_mmlu_model_cevaplari/data/train-00000-of-00001.parquet"
)
section_results_data = pd.read_parquet(
"hf://datasets/alibayram/yapay_zeka_turkce_mmlu_bolum_sonuclari/data/train-00000-of-00001.parquet"
)
# Leaderboard Tab
def get_leaderboard(sort_by="Accuracy"):
    # Sort model names alphabetically; sort numeric columns highest-first.
    ascending = sort_by == "Model Name"
    return leaderboard_data.sort_values(by=sort_by, ascending=ascending)
# Model Responses Tab
def search_model_responses(query, model):
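    # Case-insensitive literal substring match over the selected model's questions.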
    filtered = model_responses_data[
        (model_responses_data["model"] == model)
        & model_responses_data["question"].str.contains(
            query, case=False, regex=False, na=False
        )
    ]
return filtered
# Section Results Tab
def plot_section_results():
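    # Bar chart of mean accuracy per exam section. Assumes the `accuracy`
    # column is on a 0-100 scale, matching the y-axis label below.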
fig, ax = plt.subplots(figsize=(10, 6))
section_results_data.groupby("section")["accuracy"].mean().plot(kind="bar", ax=ax)
ax.set_title("Section-Wise Performance")
ax.set_ylabel("Accuracy (%)")
ax.set_xlabel("Section")
return fig
# Model Comparison Tab
def compare_models(models):
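    # Leaderboard rows restricted to the selected models, shown side by side.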
comparison = leaderboard_data[leaderboard_data["model"].isin(models)]
return comparison
# Gradio Interface
with gr.Blocks() as app:
gr.Markdown("# πŸ† Turkish MMLU Leaderboard")
gr.Markdown("Explore the performance of AI models on Turkish MMLU benchmarks.")
with gr.Tab("Leaderboard"):
sort_by = gr.Dropdown(
["Accuracy", "Runtime", "Model Name"],
label="Sort By",
value="Accuracy"
)
        leaderboard_table = gr.DataFrame(value=get_leaderboard())
sort_by.change(get_leaderboard, inputs=sort_by, outputs=leaderboard_table)
with gr.Tab("Model Responses"):
        model_dropdown = gr.Dropdown(
            choices=leaderboard_data["model"].unique().tolist(),
            label="Select Model",
        )
query_input = gr.Textbox(label="Search Query")
responses_output = gr.DataFrame()
        # Refresh the table when either the query or the selected model changes;
        # the original only listened to the query box.
        for trigger in (query_input.change, model_dropdown.change):
            trigger(
                search_model_responses,
                inputs=[query_input, model_dropdown],
                outputs=responses_output,
            )
with gr.Tab("Section Results"):
gr.Markdown("### Section-Wise Results")
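        # Passing the function as the component value makes Gradio call it on
        # each page load, so the chart reflects the current dataset.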
gr.Plot(plot_section_results)
with gr.Tab("Model Comparison"):
        model_select = gr.CheckboxGroup(
            choices=leaderboard_data["model"].unique().tolist(),
            label="Select Models",
        )
comparison_table = gr.DataFrame()
model_select.change(compare_models, inputs=model_select, outputs=comparison_table)
app.launch()