import gradio as gr
import matplotlib.pyplot as plt
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import snapshot_download

# Dataset paths
LEADERBOARD_PATH = "hf://datasets/alibayram/yapay_zeka_turkce_mmlu_liderlik_tablosu/data/train-00000-of-00001.parquet"
RESPONSES_PATH = "hf://datasets/alibayram/yapay_zeka_turkce_mmlu_model_cevaplari/data/train-00000-of-00001.parquet"
SECTION_RESULTS_PATH = "hf://datasets/alibayram/yapay_zeka_turkce_mmlu_bolum_sonuclari/data/train-00000-of-00001.parquet"

# Load datasets
try:
    leaderboard_data = pd.read_parquet(LEADERBOARD_PATH)
    model_responses_data = pd.read_parquet(RESPONSES_PATH)
    section_results_data = pd.read_parquet(SECTION_RESULTS_PATH)
except Exception as e:
    print(f"Error loading datasets: {e}")
    raise


# Helper functions
def filter_leaderboard(family=None, quantization_level=None):
    """Return the leaderboard, optionally narrowed by family and/or quantization level."""
    df = leaderboard_data.copy()
    if family:
        df = df[df["family"] == family]
    if quantization_level:
        df = df[df["quantization_level"] == quantization_level]
    return df


def search_responses(query, model):
    """Find questions whose section name contains `query` and show the given model's answers."""
    # Plain-text, case-insensitive match; na=False keeps missing section names from raising
    filtered = model_responses_data[
        model_responses_data["bolum"].str.contains(query, case=False, regex=False, na=False)
    ]
    selected_columns = ["bolum", "soru", "cevap", model + "_cevap"]
    return filtered[selected_columns]


def plot_section_results():
    """Bar chart of mean accuracy per section, averaged over all models."""
    fig, ax = plt.subplots(figsize=(10, 6))
    avg_scores = section_results_data.mean(numeric_only=True)
    avg_scores.plot(kind="bar", ax=ax)
    ax.set_title("Average Section-Wise Performance")
    ax.set_ylabel("Accuracy (%)")
    ax.set_xlabel("Sections")
    return fig  # Return the figure object for gr.Plot


def add_new_model(model_name, base_model, revision, precision, weight_type, model_type):
    # Simulated model submission logic
    return f"Model '{model_name}' submitted successfully!"
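
# A minimal sketch (assumption, not part of the original app): the scheduled
# snapshot_download at the bottom of this file only refreshes the on-disk
# cache, so the in-memory DataFrames would go stale. Re-reading the parquet
# files is one way to propagate updates; `refresh_datasets` is a hypothetical
# helper and could be registered as a second scheduler job.
def refresh_datasets():
    """Reload the three datasets so the UI serves fresh data."""
    global leaderboard_data, model_responses_data, section_results_data
    leaderboard_data = pd.read_parquet(LEADERBOARD_PATH)
    model_responses_data = pd.read_parquet(RESPONSES_PATH)
    section_results_data = pd.read_parquet(SECTION_RESULTS_PATH)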

# Gradio app structure
with gr.Blocks(css=".container { max-width: 1200px; margin: auto; }") as app:
    gr.HTML("<h1 style='text-align: center;'>🏆 Turkish MMLU Leaderboard</h1>")
    gr.Markdown("Explore, evaluate, and compare AI model performance.")

    with gr.Tabs() as tabs:
        # Leaderboard Tab
        with gr.TabItem("Leaderboard"):
            family_filter = gr.Dropdown(
                choices=leaderboard_data["family"].unique().tolist(),
                label="Filter by Family",
                multiselect=False,
            )
            quantization_filter = gr.Dropdown(
                choices=leaderboard_data["quantization_level"].unique().tolist(),
                label="Filter by Quantization Level",
            )
            leaderboard_table = gr.DataFrame(leaderboard_data)
            gr.Button("Apply Filters").click(
                filter_leaderboard,
                inputs=[family_filter, quantization_filter],
                outputs=leaderboard_table,
            )

        # Model Responses Tab
        with gr.TabItem("Model Responses"):
            model_dropdown = gr.Dropdown(
                choices=leaderboard_data["model"].unique().tolist(),
                label="Select Model",
            )
            query_input = gr.Textbox(label="Search Query")
            responses_table = gr.DataFrame()
            gr.Button("Search").click(
                search_responses,
                inputs=[query_input, model_dropdown],
                outputs=responses_table,
            )

        # Section Results Tab
        with gr.TabItem("Section Results"):
            gr.Plot(plot_section_results)  # Passing the callable re-renders the plot on page load
            gr.DataFrame(section_results_data)

        # Submit Model Tab
        with gr.TabItem("Submit Model"):
            gr.Markdown("### Submit Your Model for Evaluation")
            model_name = gr.Textbox(label="Model Name")
            base_model = gr.Textbox(label="Base Model")
            revision = gr.Textbox(label="Revision", placeholder="main")
            precision = gr.Dropdown(
                choices=["float16", "int8", "bfloat16", "float32"],
                label="Precision",
                value="float16",
            )
            weight_type = gr.Dropdown(
                choices=["Original", "Delta", "Adapter"],
                label="Weight Type",
                value="Original",
            )
            model_type = gr.Dropdown(
                choices=["Transformer", "RNN", "GPT", "Other"],
                label="Model Type",
                value="Transformer",
            )
            submit_button = gr.Button("Submit")
            submission_output = gr.Markdown()
            submit_button.click(
                add_new_model,
                inputs=[model_name, base_model, revision, precision, weight_type, model_type],
                outputs=submission_output,
            )

# Refresh the local dataset cache every 30 minutes. Note: repo_id must be a
# full dataset id; the original passed only the user namespace "alibayram".
scheduler = BackgroundScheduler()
scheduler.add_job(
    lambda: snapshot_download(
        repo_id="alibayram/yapay_zeka_turkce_mmlu_liderlik_tablosu",
        repo_type="dataset",
        local_dir="cache",
    ),
    "interval",
    seconds=1800,
)
scheduler.start()

# Launch app
app.queue(default_concurrency_limit=40).launch()