|
|
|
|
|
|
|
import json
import os
import sys

from datasets import load_dataset
from dotenv import load_dotenv
from loguru import logger
|
|
|
|
|
# Load environment overrides from a local .env file, if one exists.
load_dotenv()

# Token for authenticating against the Hugging Face Hub; an empty or
# missing value still allows public datasets, so we only warn.
hf_token = os.environ.get("HF_TOKEN")
if not hf_token:
    logger.warning("HF_TOKEN not found in .env file. Access to private datasets may be limited.")
|
|
|
|
|
# Drop loguru's default stderr sink so we fully control output below.
logger.remove()

# Single shared format so the file sink and the console sink stay in sync
# (previously this literal was duplicated in both logger.add calls).
LOG_FORMAT = "{time:YYYY-MM-DD HH:mm:ss} | {level} | {message}"

# File sink: rotate at 10 MB, keep rotated files for one week.
logger.add(
    "logs/yourbench_dataset_exploration.log",
    level="INFO",
    rotation="10 MB",
    retention="1 week",
    format=LOG_FORMAT,
)

# Console sink mirrors the file sink's level and format.
logger.add(
    sys.stdout,
    level="INFO",
    format=LOG_FORMAT,
)

logger.info("Starting YouRBench dataset exploration")
|
|
|
try:
    # Dataset to inspect; a private dataset requires a valid hf_token.
    dataset_name = "yourbench/yourbench_test"
    logger.info(f"Loading dataset: {dataset_name}")
    dataset = load_dataset(dataset_name, token=hf_token)

    logger.info(f"Dataset structure: {dataset}")

    # Walk every split and summarize its size, schema, and a few rows.
    for split_name, split_dataset in dataset.items():
        logger.info(f"\n{'='*50}\nExploring split: {split_name}\n{'='*50}")
        logger.info(f"Number of examples: {len(split_dataset)}")
        logger.info(f"Features: {split_dataset.features}")

        # Show up to three sample rows (fewer if the split is smaller).
        num_samples = min(3, len(split_dataset))
        logger.info(f"\nShowing {num_samples} sample examples:")

        for i in range(num_samples):
            example = split_dataset[i]
            # default=str keeps the dump from raising TypeError on values
            # json can't serialize natively (dates, bytes, arrays, ...),
            # which previously aborted the whole exploration.
            example_json = json.dumps(example, indent=2, ensure_ascii=False, default=str)
            logger.info(f"\nExample {i}:\n{example_json}")

        if hasattr(split_dataset, 'column_names'):
            logger.info(f"\nColumn names: {split_dataset.column_names}")

            # For low-cardinality scalar columns, log the distinct values.
            for column in split_dataset.column_names:
                try:
                    if split_dataset.features[column].dtype in ['string', 'bool', 'int32', 'int64']:
                        unique_values = set(split_dataset[column])
                        if len(unique_values) < 20:
                            logger.info(f"Unique values in '{column}': {unique_values}")
                except Exception as e:
                    # Non-scalar features (Sequence, Image, ...) have no
                    # .dtype, and unhashable values can't go in a set —
                    # best-effort: skip the column with a warning.
                    logger.warning(f"Couldn't analyze column '{column}': {e}")

except Exception as e:
    # Top-level boundary handler: logger.exception records the full
    # traceback (logger.error only kept the message).
    logger.exception(f"Error exploring dataset: {e}")

logger.info("Dataset exploration completed")