# tox21_leaderboard/backend/data_loader.py
# Tschoui's picture
# :bug: Resolve issues with access token
# 66a9b43
"""
Data loading functionality for the Tox21 leaderboard.
Handles loading and processing results from HuggingFace datasets.
"""
import pandas as pd
from datasets import load_dataset, Dataset
from config.settings import RESULTS_DATASET, TEST_DATASET, HF_TOKEN
from config.tasks import TOX21_TASKS
def load_leaderboard_data() -> Dataset:
    """
    Fetch the leaderboard results from the HuggingFace Hub.

    Loads the dataset named by ``RESULTS_DATASET`` and returns its ``test``
    split, printing progress information (and the first entry, when the
    split is non-empty) along the way.

    Returns:
        The ``test`` split of the results dataset.

    Raises:
        ValueError: If the loaded dataset has no ``test`` split.
    """
    print(f"Loading dataset: {RESULTS_DATASET}")
    token_status = "Yes" if HF_TOKEN else "No"
    print(f"Using HF token: {token_status}")

    # No token is passed explicitly: it is already set globally via the
    # login performed in settings.
    all_splits = load_dataset(RESULTS_DATASET)
    print(f"Dataset loaded successfully. Keys: {all_splits.keys()}")

    # The test split is the one used for results; bail out early without it.
    if "test" not in all_splits:
        raise ValueError("Dataset does not contain a 'test' split.")

    test_split = all_splits["test"]
    entry_count = len(test_split)
    print(f"Test split has {entry_count} entries")

    # Show a sample row for debugging, but only when there is one to show.
    if entry_count > 0:
        first_entry = test_split[0]
        print(f"First entry keys: {first_entry.keys()}")
        print(f"First entry: {first_entry}")

    return test_split
def load_test_dataset() -> tuple[list[str], list[dict[str, float]]]:
    """
    Load the SMILES strings and per-task labels from the test dataset.

    Reads the ``test`` split of ``TEST_DATASET``.

    Returns:
        A ``(smiles, labels)`` tuple. ``smiles`` is the content of the
        ``smiles`` column as a list. ``labels`` holds one dict per sample,
        in dataset order, mapping each task key from ``TOX21_TASKS`` to
        that sample's value for the task.
    """
    # No token is passed explicitly: it is already set globally via the
    # login performed in settings.
    dset = load_dataset(TEST_DATASET, split="test")
    tasks = [t.key for t in TOX21_TASKS]
    smiles = list(dset["smiles"])
    # Iterate the dataset directly rather than copying it into a list first;
    # each sample is reduced to just the task columns.
    labels = [{task: sample[task] for task in tasks} for sample in dset]
    print("Loaded test dataset")
    return smiles, labels