Spaces:

alibayram
/

turkish_mmlu_leaderboard

Running

App Files Files Community

alibayram committed on Nov 16, 2024

Commit

1c73b10

1 Parent(s): a7fa922

Add configuration and data management for Gradio app, implement filtering, response search, and section results plotting functionalities

Browse files

Files changed (5) hide show

app.py +117 -102
app_e.py +115 -0
config.py +33 -0
data_manager.py +59 -0
utils.py +67 -0

app.py CHANGED Viewed

@@ -1,115 +1,130 @@
 import gradio as gr
 from apscheduler.schedulers.background import BackgroundScheduler
-from huggingface_hub import snapshot_download
-import pandas as pd
-import matplotlib.pyplot as plt
-# Dataset paths
-LEADERBOARD_PATH = "hf://datasets/alibayram/yapay_zeka_turkce_mmlu_liderlik_tablosu/data/train-00000-of-00001.parquet"
-RESPONSES_PATH = "hf://datasets/alibayram/yapay_zeka_turkce_mmlu_model_cevaplari/data/train-00000-of-00001.parquet"
-SECTION_RESULTS_PATH = "hf://datasets/alibayram/yapay_zeka_turkce_mmlu_bolum_sonuclari/data/train-00000-of-00001.parquet"
-# Load datasets
-try:
-    leaderboard_data = pd.read_parquet(LEADERBOARD_PATH)
-    model_responses_data = pd.read_parquet(RESPONSES_PATH)
-    section_results_data = pd.read_parquet(SECTION_RESULTS_PATH)
-except Exception as e:
-    print(f"Error loading datasets: {e}")
-    raise
-# Helper functions
-def filter_leaderboard(family=None, quantization_level=None):
-    df = leaderboard_data.copy()
-    if family:
-        df = df[df["family"] == family]
-    if quantization_level:
-        df = df[df["quantization_level"] == quantization_level]
-    return df
-def search_responses(query, model):
-    filtered = model_responses_data[model_responses_data["bolum"].str.contains(query, case=False)]
-    selected_columns = ["bolum", "soru", "cevap", model + "_cevap"]
-    return filtered[selected_columns]
-def plot_section_results():
-    fig, ax = plt.subplots(figsize=(10, 6))
-    avg_scores = section_results_data.mean(numeric_only=True)
-    avg_scores.plot(kind="bar", ax=ax)
-    ax.set_title("Average Section-Wise Performance")
-    ax.set_ylabel("Accuracy (%)")
-    ax.set_xlabel("Sections")
-    return fig  # Return the figure object
-def add_new_model(model_name, base_model, revision, precision, weight_type, model_type):
-    # Simulated model submission logic
-    return f"Model '{model_name}' submitted successfully!"
-# Gradio app structure
-with gr.Blocks(css=".container { max-width: 1200px; margin: auto; }") as app:
-    gr.HTML("<h1>🏆 Turkish MMLU Leaderboard</h1>")
-    gr.Markdown("Explore, evaluate, and compare AI model performance.")
-    with gr.Tabs() as tabs:
-        # Leaderboard Tab
-        with gr.TabItem("Leaderboard"):
-            family_filter = gr.Dropdown(
-                choices=leaderboard_data["family"].unique().tolist(), label="Filter by Family", multiselect=False
-            )
-            quantization_filter = gr.Dropdown(
-                choices=leaderboard_data["quantization_level"].unique().tolist(), label="Filter by Quantization Level"
-            )
-            leaderboard_table = gr.DataFrame(leaderboard_data)
-            gr.Button("Apply Filters").click(
-                filter_leaderboard, inputs=[family_filter, quantization_filter], outputs=leaderboard_table
-            )
-        # Model Responses Tab
-        with gr.TabItem("Model Responses"):
-            model_dropdown = gr.Dropdown(
-                choices=leaderboard_data["model"].unique().tolist(), label="Select Model"
-            )
-            query_input = gr.Textbox(label="Search Query")
-            responses_table = gr.DataFrame()
-            gr.Button("Search").click(
-                search_responses, inputs=[query_input, model_dropdown], outputs=responses_table
-            )
-        # Section Results Tab
-        with gr.TabItem("Section Results"):
-            gr.Plot(plot_section_results)
-            gr.DataFrame(section_results_data)
-        # Submit Model Tab
-        with gr.TabItem("Submit Model"):
-            gr.Markdown("### Submit Your Model for Evaluation")
-            model_name = gr.Textbox(label="Model Name")
-            base_model = gr.Textbox(label="Base Model")
-            revision = gr.Textbox(label="Revision", placeholder="main")
-            precision = gr.Dropdown(
-                choices=["float16", "int8", "bfloat16", "float32"], label="Precision", value="float16"
-            )
-            weight_type = gr.Dropdown(
-                choices=["Original", "Delta", "Adapter"], label="Weight Type", value="Original"
-            )
-            model_type = gr.Dropdown(
-                choices=["Transformer", "RNN", "GPT", "Other"], label="Model Type", value="Transformer"
-            )
-            submit_button = gr.Button("Submit")
-            submission_output = gr.Markdown()
-            submit_button.click(
-                add_new_model,
-                inputs=[model_name, base_model, revision, precision, weight_type, model_type],
-                outputs=submission_output,
-            )
-# Scheduler for refreshing datasets
-scheduler = BackgroundScheduler()
-scheduler.add_job(
-    lambda: snapshot_download(repo_id="alibayram", repo_type="dataset", local_dir="cache"),
-    "interval", seconds=1800
-)
-scheduler.start()
-# Launch app
-app.queue(default_concurrency_limit=40).launch()

 import gradio as gr
 from apscheduler.schedulers.background import BackgroundScheduler
+from typing import Optional
+import logging
+from config import CONFIG
+from data_manager import data_manager
+from utils import filter_leaderboard, search_responses, plot_section_results, validate_model_submission
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+def create_app() -> gr.Blocks:
+    """Create and configure the Gradio application.
+
+    Builds a four-tab ``gr.Blocks`` layout (leaderboard, model responses,
+    section results, model submission) and wires each button to its
+    handler imported from ``utils``.
+
+    Returns:
+        The assembled ``gr.Blocks`` object; launching it is left to the caller.
+    """
+    with gr.Blocks(css=CONFIG["ui"].css, theme=CONFIG["ui"].theme) as app:
+        gr.HTML(f"<h1>{CONFIG['ui'].title}</h1>")
+        gr.Markdown(CONFIG["ui"].description)
+        with gr.Tabs() as tabs:
+            # Leaderboard Tab
+            with gr.TabItem("📊 Leaderboard"):
+                with gr.Row():
+                    # Dropdown choices are computed once, while the layout is
+                    # built, from the leaderboard frame held by data_manager.
+                    family_filter = gr.Dropdown(
+                        choices=data_manager.leaderboard_data["family"].unique().tolist(),
+                        label="Filter by Family",
+                        multiselect=False
+                    )
+                    quantization_filter = gr.Dropdown(
+                        choices=data_manager.leaderboard_data["quantization_level"].unique().tolist(),
+                        label="Filter by Quantization Level"
+                    )
+                filter_btn = gr.Button("Apply Filters", variant="primary")
+                # Read-only table; its content is replaced by the frame that
+                # filter_leaderboard returns when the button is clicked.
+                leaderboard_table = gr.DataFrame(
+                    value=data_manager.leaderboard_data,
+                    interactive=False
+                )
+                filter_btn.click(
+                    filter_leaderboard,
+                    inputs=[family_filter, quantization_filter],
+                    outputs=leaderboard_table
+                )
+            # Model Responses Tab
+            with gr.TabItem("🔍 Model Responses"):
+                with gr.Row():
+                    model_dropdown = gr.Dropdown(
+                        choices=data_manager.leaderboard_data["model"].unique().tolist(),
+                        label="Select Model"
+                    )
+                    query_input = gr.Textbox(
+                        label="Search Query",
+                        placeholder="Enter search terms..."
+                    )
+                search_btn = gr.Button("Search", variant="primary")
+                # Starts empty; filled with the frame returned by search_responses.
+                responses_table = gr.DataFrame()
+                # Note the argument order: the query comes first, the model second.
+                search_btn.click(
+                    search_responses,
+                    inputs=[query_input, model_dropdown],
+                    outputs=responses_table
+                )
+            # Section Results Tab
+            with gr.TabItem("📈 Section Results"):
+                # The function object itself is passed as the value, not a
+                # rendered figure.
+                # NOTE(review): presumably Gradio calls it when the page loads - confirm.
+                gr.Plot(value=plot_section_results)
+                gr.DataFrame(value=data_manager.section_results_data)
+            # Submit Model Tab
+            with gr.TabItem("➕ Submit Model"):
+                gr.Markdown("### Submit Your Model for Evaluation")
+                with gr.Group():
+                    model_name = gr.Textbox(label="Model Name", placeholder="Enter unique model name")
+                    base_model = gr.Textbox(label="Base Model", placeholder="Enter base model name")
+                    revision = gr.Textbox(label="Revision", value="main")
+                    with gr.Row():
+                        # Option lists come from the model configuration.
+                        precision = gr.Dropdown(
+                            choices=CONFIG["model"].precision_options,
+                            label="Precision",
+                            value="float16"
+                        )
+                        weight_type = gr.Dropdown(
+                            choices=CONFIG["model"].weight_types,
+                            label="Weight Type",
+                            value="Original"
+                        )
+                        model_type = gr.Dropdown(
+                            choices=CONFIG["model"].model_types,
+                            label="Model Type",
+                            value="Transformer"
+                        )
+                submit_btn = gr.Button("Submit Model", variant="primary")
+                submission_output = gr.Markdown()
+                # Validates the form values and returns a status message; nothing
+                # in this handler stores or forwards the submission.
+                def handle_submission(*args):
+                    is_valid, message = validate_model_submission(*args)
+                    if not is_valid:
+                        return f"❌ {message}"
+                    return "✅ Model submitted successfully!"
+                # The inputs are passed positionally, in the order
+                # validate_model_submission declares its parameters.
+                submit_btn.click(
+                    handle_submission,
+                    inputs=[model_name, base_model, revision, precision, weight_type, model_type],
+                    outputs=submission_output
+                )
+    return app
+def main():
+    # Initialize scheduler for data refresh
+    scheduler = BackgroundScheduler()
+    scheduler.add_job(
+        data_manager.refresh_datasets,
+        "interval",
+        seconds=CONFIG["dataset"].refresh_interval
+    )
+    scheduler.start()
+    # Create and launch app
+    app = create_app()
+    app.queue(default_concurrency_limit=40).launch()
+if __name__ == "__main__":
+    main()

app_e.py ADDED Viewed

	@@ -0,0 +1,115 @@

+import gradio as gr
+from apscheduler.schedulers.background import BackgroundScheduler
+from huggingface_hub import snapshot_download
+import pandas as pd
+import matplotlib.pyplot as plt
+# Dataset paths
+# Each constant is an hf:// URL pointing at a single parquet shard of a
+# dataset under the "alibayram" namespace.
+LEADERBOARD_PATH = "hf://datasets/alibayram/yapay_zeka_turkce_mmlu_liderlik_tablosu/data/train-00000-of-00001.parquet"
+RESPONSES_PATH = "hf://datasets/alibayram/yapay_zeka_turkce_mmlu_model_cevaplari/data/train-00000-of-00001.parquet"
+SECTION_RESULTS_PATH = "hf://datasets/alibayram/yapay_zeka_turkce_mmlu_bolum_sonuclari/data/train-00000-of-00001.parquet"
+# Load datasets
+# All three frames are read once, at import time. A failure is printed and
+# then re-raised, so the script does not continue without its data.
+try:
+    leaderboard_data = pd.read_parquet(LEADERBOARD_PATH)
+    model_responses_data = pd.read_parquet(RESPONSES_PATH)
+    section_results_data = pd.read_parquet(SECTION_RESULTS_PATH)
+except Exception as e:
+    print(f"Error loading datasets: {e}")
+    raise
+# Helper functions
+def filter_leaderboard(family=None, quantization_level=None):
+    df = leaderboard_data.copy()
+    if family:
+        df = df[df["family"] == family]
+    if quantization_level:
+        df = df[df["quantization_level"] == quantization_level]
+    return df
+def search_responses(query, model):
+    filtered = model_responses_data[model_responses_data["bolum"].str.contains(query, case=False)]
+    selected_columns = ["bolum", "soru", "cevap", model + "_cevap"]
+    return filtered[selected_columns]
+def plot_section_results():
+    fig, ax = plt.subplots(figsize=(10, 6))
+    avg_scores = section_results_data.mean(numeric_only=True)
+    avg_scores.plot(kind="bar", ax=ax)
+    ax.set_title("Average Section-Wise Performance")
+    ax.set_ylabel("Accuracy (%)")
+    ax.set_xlabel("Sections")
+    return fig  # Return the figure object
+def add_new_model(model_name, base_model, revision, precision, weight_type, model_type):
+    """Return a confirmation message for a submitted model.
+
+    This is a stub: only ``model_name`` is used, to build the message. The
+    other arguments are accepted but ignored, and nothing is stored.
+    """
+    # Simulated model submission logic
+    return f"Model '{model_name}' submitted successfully!"
+# Gradio app structure
+# The layout is built at import time from the frames loaded above.
+with gr.Blocks(css=".container { max-width: 1200px; margin: auto; }") as app:
+    gr.HTML("<h1>🏆 Turkish MMLU Leaderboard</h1>")
+    gr.Markdown("Explore, evaluate, and compare AI model performance.")
+    with gr.Tabs() as tabs:
+        # Leaderboard Tab
+        with gr.TabItem("Leaderboard"):
+            # Dropdown choices are the distinct values present in the
+            # leaderboard frame when the layout is built.
+            family_filter = gr.Dropdown(
+                choices=leaderboard_data["family"].unique().tolist(), label="Filter by Family", multiselect=False
+            )
+            quantization_filter = gr.Dropdown(
+                choices=leaderboard_data["quantization_level"].unique().tolist(), label="Filter by Quantization Level"
+            )
+            leaderboard_table = gr.DataFrame(leaderboard_data)
+            # Clicking replaces the table with filter_leaderboard's result.
+            gr.Button("Apply Filters").click(
+                filter_leaderboard, inputs=[family_filter, quantization_filter], outputs=leaderboard_table
+            )
+        # Model Responses Tab
+        with gr.TabItem("Model Responses"):
+            model_dropdown = gr.Dropdown(
+                choices=leaderboard_data["model"].unique().tolist(), label="Select Model"
+            )
+            query_input = gr.Textbox(label="Search Query")
+            # Starts empty; filled by search_responses.
+            responses_table = gr.DataFrame()
+            # Argument order: the query first, then the selected model.
+            gr.Button("Search").click(
+                search_responses, inputs=[query_input, model_dropdown], outputs=responses_table
+            )
+        # Section Results Tab
+        with gr.TabItem("Section Results"):
+            # The function itself is passed, not a rendered figure.
+            gr.Plot(plot_section_results)
+            gr.DataFrame(section_results_data)
+        # Submit Model Tab
+        with gr.TabItem("Submit Model"):
+            gr.Markdown("### Submit Your Model for Evaluation")
+            model_name = gr.Textbox(label="Model Name")
+            base_model = gr.Textbox(label="Base Model")
+            # "main" is only placeholder text here, not an initial value.
+            revision = gr.Textbox(label="Revision", placeholder="main")
+            precision = gr.Dropdown(
+                choices=["float16", "int8", "bfloat16", "float32"], label="Precision", value="float16"
+            )
+            weight_type = gr.Dropdown(
+                choices=["Original", "Delta", "Adapter"], label="Weight Type", value="Original"
+            )
+            model_type = gr.Dropdown(
+                choices=["Transformer", "RNN", "GPT", "Other"], label="Model Type", value="Transformer"
+            )
+            submit_button = gr.Button("Submit")
+            submission_output = gr.Markdown()
+            # Inputs are passed positionally in add_new_model's parameter order.
+            submit_button.click(
+                add_new_model,
+                inputs=[model_name, base_model, revision, precision, weight_type, model_type],
+                outputs=submission_output,
+            )
+# Scheduler for refreshing datasets
+# Runs snapshot_download into the local "cache" directory every 1800 seconds.
+# NOTE(review): the job only downloads; nothing here re-reads the frames
+# loaded at import time, and repo_id is just an account name, so this does
+# not appear to refresh what the UI shows - confirm.
+scheduler = BackgroundScheduler()
+scheduler.add_job(
+    lambda: snapshot_download(repo_id="alibayram", repo_type="dataset", local_dir="cache"),
+    "interval", seconds=1800
+)
+scheduler.start()
+# Launch app
+app.queue(default_concurrency_limit=40).launch()

config.py ADDED Viewed

	@@ -0,0 +1,33 @@

+from dataclasses import dataclass
+from typing import Dict, List
+@dataclass
+class DatasetConfig:
+    """Dataset locations and refresh settings."""
+    # hf:// URLs, each pointing at a single parquet shard.
+    leaderboard_path: str = "hf://datasets/alibayram/yapay_zeka_turkce_mmlu_liderlik_tablosu/data/train-00000-of-00001.parquet"
+    responses_path: str = "hf://datasets/alibayram/yapay_zeka_turkce_mmlu_model_cevaplari/data/train-00000-of-00001.parquet"
+    section_results_path: str = "hf://datasets/alibayram/yapay_zeka_turkce_mmlu_bolum_sonuclari/data/train-00000-of-00001.parquet"
+    # Local directory name used for downloaded data.
+    cache_dir: str = "cache"
+    # Refresh period in seconds.
+    refresh_interval: int = 1800  # 30 minutes
+@dataclass
+class UIConfig:
+    """Static text and styling for the user interface."""
+    # Page heading and the introductory line shown under it.
+    title: str = "🏆 Turkish MMLU Leaderboard"
+    description: str = "Explore, evaluate, and compare AI model performance."
+    # Theme name.
+    theme: str = "default"
+    # Custom CSS: page container width/padding, button minimum width, and
+    # rounded corners for boxes.
+    css: str = """
+    .container { max-width: 1200px; margin: auto; padding: 20px; }
+    .gr-button { min-width: 150px; }
+    .gr-box { border-radius: 8px; }
+    """
+@dataclass
+class ModelConfig:
+    """Option lists offered for a model submission."""
+    # The defaults are tuples (a list default is rejected by @dataclass as
+    # mutable), so the annotations describe tuples as well.
+    precision_options: tuple[str, ...] = ("float16", "int8", "bfloat16", "float32")
+    weight_types: tuple[str, ...] = ("Original", "Delta", "Adapter")
+    model_types: tuple[str, ...] = ("Transformer", "RNN", "GPT", "Other")
+# Single shared configuration object: one default-constructed instance of
+# each config class, looked up by section name.
+CONFIG = {
+    "dataset": DatasetConfig(),
+    "ui": UIConfig(),
+    "model": ModelConfig(),
+}

data_manager.py ADDED Viewed

	@@ -0,0 +1,59 @@

+from typing import Optional, Dict
+import pandas as pd
+from functools import lru_cache
+from huggingface_hub import snapshot_download
+import logging
+from config import CONFIG
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+class DataManager:
+    def __init__(self):
+        self._leaderboard_data: Optional[pd.DataFrame] = None
+        self._responses_data: Optional[pd.DataFrame] = None
+        self._section_results_data: Optional[pd.DataFrame] = None
+    @lru_cache(maxsize=1)
+    def _load_dataset(self, path: str) -> pd.DataFrame:
+        """Load dataset with caching."""
+        try:
+            return pd.read_parquet(path)
+        except Exception as e:
+            logger.error(f"Error loading dataset from {path}: {e}")
+            raise RuntimeError(f"Failed to load dataset: {e}")
+    def refresh_datasets(self) -> None:
+        """Refresh all datasets from source."""
+        try:
+            snapshot_download(
+                repo_id="alibayram",
+                repo_type="dataset",
+                local_dir=CONFIG["dataset"].cache_dir
+            )
+            # Clear cache to force reload
+            self._load_dataset.cache_clear()
+            logger.info("Datasets refreshed successfully")
+        except Exception as e:
+            logger.error(f"Error refreshing datasets: {e}")
+    @property
+    def leaderboard_data(self) -> pd.DataFrame:
+        if self._leaderboard_data is None:
+            self._leaderboard_data = self._load_dataset(CONFIG["dataset"].leaderboard_path)
+        return self._leaderboard_data
+    @property
+    def responses_data(self) -> pd.DataFrame:
+        if self._responses_data is None:
+            self._responses_data = self._load_dataset(CONFIG["dataset"].responses_path)
+        return self._responses_data
+    @property
+    def section_results_data(self) -> pd.DataFrame:
+        if self._section_results_data is None:
+            self._section_results_data = self._load_dataset(CONFIG["dataset"].section_results_path)
+        return self._section_results_data
+# Global instance
+data_manager = DataManager()

utils.py ADDED Viewed

	@@ -0,0 +1,67 @@

+from typing import Optional, Dict
+import pandas as pd
+import matplotlib.pyplot as plt
+from data_manager import data_manager
+def filter_leaderboard(
+    family: Optional[str] = None,
+    quantization_level: Optional[str] = None
+) -> pd.DataFrame:
+    """Filter leaderboard data based on criteria."""
+    df = data_manager.leaderboard_data.copy()
+    if family:
+        df = df[df["family"] == family]
+    if quantization_level:
+        df = df[df["quantization_level"] == quantization_level]
+    return df.sort_values("score", ascending=False)
+def search_responses(query: str, model: str) -> pd.DataFrame:
+    """Search model responses based on query."""
+    if not query or not model:
+        return pd.DataFrame()
+    filtered = data_manager.responses_data[
+        data_manager.responses_data["bolum"].str.contains(query, case=False, na=False)
+    ]
+    selected_columns = ["bolum", "soru", "cevap", f"{model}_cevap"]
+    return filtered[selected_columns].dropna()
+def plot_section_results() -> plt.Figure:
+    """Generate section results plot."""
+    fig, ax = plt.subplots(figsize=(12, 6))
+    avg_scores = data_manager.section_results_data.mean(numeric_only=True)
+    bars = avg_scores.plot(kind="bar", ax=ax)
+    # Customize plot
+    ax.set_title("Average Section-Wise Performance", pad=20)
+    ax.set_ylabel("Accuracy (%)")
+    ax.set_xlabel("Sections")
+    plt.xticks(rotation=45, ha='right')
+    plt.tight_layout()
+    # Add value labels
+    for i, v in enumerate(avg_scores):
+        ax.text(i, v, f'{v:.1f}%', ha='center', va='bottom')
+    return fig
+def validate_model_submission(
+    model_name: str,
+    base_model: str,
+    revision: str,
+    precision: str,
+    weight_type: str,
+    model_type: str
+) -> tuple[bool, str]:
+    """Validate model submission parameters."""
+    if not all([model_name, base_model]):
+        return False, "Model name and base model are required."
+    if model_name in data_manager.leaderboard_data["model"].values:
+        return False, "Model name already exists."
+    return True, "Validation successful"