# turkish_mmlu_leaderboard / data_manager.py
# alibayram's picture
# Add configuration and data management for Gradio app, implement filtering, response search, and section results plotting functionalities
# 1c73b10
from typing import Optional, Dict
import pandas as pd
from functools import lru_cache
from huggingface_hub import snapshot_download
import logging
from config import CONFIG
# Configure the root logger at import time so INFO-level records are emitted.
# NOTE: basicConfig is a no-op if the root logger already has handlers.
logging.basicConfig(level=logging.INFO)
# Module-level logger named after this module, used by DataManager below.
logger = logging.getLogger(__name__)
class DataManager:
    """Lazily load and memoize the leaderboard parquet datasets.

    Each dataset is read from disk the first time its property is accessed
    and is then held on the instance. A successful ``refresh_datasets`` call
    drops the held frames so the next property access re-reads the refreshed
    files.
    """

    def __init__(self) -> None:
        # Per-instance memo slots; ``None`` means "not loaded yet".
        self._leaderboard_data: Optional[pd.DataFrame] = None
        self._responses_data: Optional[pd.DataFrame] = None
        self._section_results_data: Optional[pd.DataFrame] = None

    def _load_dataset(self, path: str) -> pd.DataFrame:
        """Read the parquet file at ``path`` into a DataFrame.

        Memoization is done by the public properties, which store the result
        on the instance; this method always reads from disk. (A method-level
        ``lru_cache`` would key on ``self`` and keep the instance alive.)

        Args:
            path: Location of the parquet file to read.

        Returns:
            The loaded DataFrame.

        Raises:
            RuntimeError: If the file cannot be read. The underlying
                exception is chained as ``__cause__``.
        """
        try:
            return pd.read_parquet(path)
        except Exception as e:
            logger.error(f"Error loading dataset from {path}: {e}")
            raise RuntimeError(f"Failed to load dataset: {e}") from e

    def _invalidate_cache(self) -> None:
        """Forget every memoized DataFrame so the next access reloads it."""
        self._leaderboard_data = None
        self._responses_data = None
        self._section_results_data = None

    def refresh_datasets(self) -> None:
        """Refresh all datasets from source.

        Downloads a fresh snapshot and, on success, drops the memoized
        frames so the properties reload from the updated files. This is
        best-effort: on failure the error is logged, nothing is raised, and
        previously loaded data keeps being served.
        """
        try:
            # NOTE(review): "alibayram" looks like a bare user name rather
            # than a "namespace/name" dataset id - confirm the intended repo.
            snapshot_download(
                repo_id="alibayram",
                repo_type="dataset",
                local_dir=CONFIG["dataset"].cache_dir,
            )
        except Exception as e:
            logger.error(f"Error refreshing datasets: {e}")
            return
        # Without this reset the properties would keep returning the frames
        # loaded before the refresh.
        self._invalidate_cache()
        logger.info("Datasets refreshed successfully")

    @property
    def leaderboard_data(self) -> pd.DataFrame:
        """Leaderboard table, loaded from ``leaderboard_path`` on first use."""
        if self._leaderboard_data is None:
            self._leaderboard_data = self._load_dataset(
                CONFIG["dataset"].leaderboard_path
            )
        return self._leaderboard_data

    @property
    def responses_data(self) -> pd.DataFrame:
        """Responses table, loaded from ``responses_path`` on first use."""
        if self._responses_data is None:
            self._responses_data = self._load_dataset(
                CONFIG["dataset"].responses_path
            )
        return self._responses_data

    @property
    def section_results_data(self) -> pd.DataFrame:
        """Section results table, loaded from ``section_results_path`` on first use."""
        if self._section_results_data is None:
            self._section_results_data = self._load_dataset(
                CONFIG["dataset"].section_results_path
            )
        return self._section_results_data
# Module-level shared instance, created at import time. Construction only
# initializes empty slots; datasets are read on first property access.
data_manager = DataManager()