|
from typing import Optional, Dict |
|
import pandas as pd |
|
from functools import lru_cache |
|
from huggingface_hub import snapshot_download |
|
import logging |
|
from config import CONFIG |
|
|
|
# Set up root logging at import time (INFO and above) and create the
# module-scoped logger used by everything below.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
|
|
|
class DataManager:
    """Lazily load and hold the leaderboard, responses and section-results data.

    Each dataset is read from the parquet path configured under
    ``CONFIG["dataset"]`` the first time its property is accessed, and is then
    kept on the instance until :meth:`refresh_datasets` completes successfully.
    """

    def __init__(self):
        # ``None`` means "not loaded yet"; the properties below populate these
        # on first access.
        self._leaderboard_data: Optional[pd.DataFrame] = None
        self._responses_data: Optional[pd.DataFrame] = None
        self._section_results_data: Optional[pd.DataFrame] = None

    # NOTE(review): ``lru_cache`` on an instance method keys on ``self`` and is
    # shared by every instance of the class (ruff B019). With ``maxsize=1`` it
    # only remembers the most recent path, so the per-instance attributes above
    # are what actually memoize each dataset. The decorator is kept so that
    # ``_load_dataset.cache_clear()`` remains available to existing callers.
    @lru_cache(maxsize=1)
    def _load_dataset(self, path: str) -> pd.DataFrame:
        """Read the parquet file at *path* into a DataFrame.

        Args:
            path: Location of the parquet file, passed to ``pd.read_parquet``.

        Returns:
            The DataFrame produced by ``pd.read_parquet``.

        Raises:
            RuntimeError: If reading fails for any reason. The underlying
                exception is logged and chained as ``__cause__``.
        """
        try:
            return pd.read_parquet(path)
        except Exception as e:
            logger.error(f"Error loading dataset from {path}: {e}")
            # Chain explicitly so the original traceback stays attached.
            raise RuntimeError(f"Failed to load dataset: {e}") from e

    def refresh_datasets(self) -> None:
        """Re-download the dataset snapshot and drop all cached frames.

        Best-effort: any failure is logged and swallowed, and in that case the
        frames already held in memory are left untouched.
        """
        try:
            # NOTE(review): ``repo_id`` has no "namespace/name" form here;
            # confirm this is the intended dataset repository.
            snapshot_download(
                repo_id="alibayram",
                repo_type="dataset",
                local_dir=CONFIG["dataset"].cache_dir,
            )

            self._load_dataset.cache_clear()
            # Clearing the lru_cache alone is not enough: the properties return
            # the frames stored on the instance, so those must be reset too or
            # callers keep seeing the pre-refresh data.
            self._leaderboard_data = None
            self._responses_data = None
            self._section_results_data = None
            logger.info("Datasets refreshed successfully")
        except Exception as e:
            logger.error(f"Error refreshing datasets: {e}")

    @property
    def leaderboard_data(self) -> pd.DataFrame:
        """Leaderboard frame, loaded from ``leaderboard_path`` on first access."""
        if self._leaderboard_data is None:
            self._leaderboard_data = self._load_dataset(
                CONFIG["dataset"].leaderboard_path
            )
        return self._leaderboard_data

    @property
    def responses_data(self) -> pd.DataFrame:
        """Responses frame, loaded from ``responses_path`` on first access."""
        if self._responses_data is None:
            self._responses_data = self._load_dataset(
                CONFIG["dataset"].responses_path
            )
        return self._responses_data

    @property
    def section_results_data(self) -> pd.DataFrame:
        """Section-results frame, loaded from ``section_results_path`` on first access."""
        if self._section_results_data is None:
            self._section_results_data = self._load_dataset(
                CONFIG["dataset"].section_results_path
            )
        return self._section_results_data
|
|
|
|
|
# Module-level shared instance, created at import time.
data_manager = DataManager()