| | """ |
| | BIG-Bench Hard Benchmark Module |
| | |
| | This module implements the BIGBenchHard benchmark evaluation framework. |
| | BIGBenchHard is a challenging subset of 23 tasks from the BIG-bench evaluation suite, |
| | designed to test reasoning capabilities of language models. |
| | """ |
| |
|
import os
import random
from typing import Any, List, Optional

import numpy as np
import torch

from .benchmark import Benchmark
from .measures import exact_match_score
from ..core.logging import logger
from ..core.module_utils import load_json
from ..utils.utils import download_file


# 17 multiple-choice task files.
MULTIPLE_CHOICE_TASKS = [
    'temporal_sequences', 'disambiguation_qa', 'date_understanding',
    'tracking_shuffled_objects_three_objects', 'penguins_in_a_table',
    'geometric_shapes', 'snarks', 'ruin_names',
    'tracking_shuffled_objects_seven_objects', 'tracking_shuffled_objects_five_objects',
    'logical_deduction_three_objects', 'hyperbaton',
    'logical_deduction_five_objects', 'logical_deduction_seven_objects',
    'movie_recommendation', 'salient_translation_error_detection',
    'reasoning_about_colored_objects',
]

# 10 free-form task files; 17 + 10 = 27 files covering the 23 BBH tasks
# (tracking_shuffled_objects and logical_deduction each come in three sizes).
FREE_FORM_TASKS = [
    'multistep_arithmetic_two', 'navigate', 'dyck_languages', 'word_sorting',
    'sports_understanding', 'boolean_expressions', 'object_counting',
    'formal_fallacies', 'causal_judgement', 'web_of_lies',
]

# Maps each task name to its raw data file in the upstream BBH repository.
ALL_TASKS = {task: f"{task}.json" for task in MULTIPLE_CHOICE_TASKS + FREE_FORM_TASKS}


def download_raw_bigbenchhard_data(task_name: str, save_folder: str):
    """
    Download raw BIGBenchHard data for a specific task.

    Args:
        task_name: The name of the task to download
        save_folder: Directory to save the downloaded data file

    Raises:
        AssertionError: If task_name is not a valid BIGBenchHard task
    """
    assert task_name in ALL_TASKS, \
        f"'{task_name}' is an invalid bigbenchhard task name. Available tasks: {list(ALL_TASKS.keys())}"
    file_name = ALL_TASKS[task_name]
    url = f"https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/main/bbh/{file_name}"
    logger.info(f"Downloading BIGBenchHard '{task_name}' data from: {url}")
    os.makedirs(save_folder, exist_ok=True)  # ensure the destination folder exists
    download_file(url=url, save_file=os.path.join(save_folder, file_name))
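

# A minimal usage sketch (the folder path is hypothetical and illustrative only):
#
#     download_raw_bigbenchhard_data("boolean_expressions", "/tmp/bbh")
#     # -> fetches boolean_expressions.json into /tmp/bbh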


def set_seed(seed: int):
    """
    Set random seeds for reproducibility across different libraries.

    Args:
        seed: The random seed value to use
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
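

# Reproducibility sketch: with the same seed, random.sample returns the same
# draws, so the dev/test split performed below is repeatable across runs.
#
#     set_seed(10); first = random.sample(range(100), 5)
#     set_seed(10); second = random.sample(range(100), 5)
#     assert first == second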


class BIGBenchHard(Benchmark):
    """
    Benchmark class for BIGBenchHard dataset evaluation.

    BIGBenchHard is a suite of 23 challenging tasks (27 task files) from the
    BIG-bench evaluation suite. Each task example has the following structure:
    {
        "input": str,  # The input question/problem
        "target": str  # The expected answer/output
    }

    The benchmark supports automatic data splitting for training/validation
    purposes and evaluates predictions using exact match scoring.
    """

    def __init__(self, task: str, path: Optional[str] = None, mode: str = "all",
                 dev_sample_num: int = 0, seed: int = 10, **kwargs):
        """
        Initialize the BIGBenchHard benchmark.

        Args:
            task: The specific BIGBenchHard task name
            path: Path to store the dataset. Defaults to ~/.evoagentx/data/bigbenchhard/{task}
            mode: Data loading mode. Defaults to "all"
            dev_sample_num: Number of samples to use for the dev set. If 0, all data goes to the test set
            seed: Random seed for reproducibility. Defaults to 10
            **kwargs: Additional parameters for customization

        Raises:
            ValueError: If task is not a valid BIGBenchHard task name
        """
        if task not in ALL_TASKS:
            raise ValueError(f"Unknown task '{task}'. Available tasks: {list(ALL_TASKS.keys())}")

        self.task = task
        self.file_name = ALL_TASKS[task]
        self.dev_sample_num = dev_sample_num
        self.seed = seed

        path = os.path.expanduser(path or f"~/.evoagentx/data/bigbenchhard/{task}")

        super().__init__(name=f"BIGBenchHard-{self.task}", path=path, mode=mode, **kwargs)

    def _load_data_from_file(self, file_name: str) -> Optional[List[dict]]:
        """
        Load data from a specific file, downloading the raw task data on first use.

        Args:
            file_name: Name of the file to load

        Returns:
            List of loaded examples, or None if no file name is given
        """
        if file_name is None:
            return None

        file_path = os.path.join(self.path, file_name)

        # Lazily download the raw task data if it is not cached locally yet.
        if not os.path.exists(file_path):
            download_raw_bigbenchhard_data(task_name=self.task, save_folder=self.path)

        logger.info(f"Loading BIGBenchHard data from {file_path}...")
        data = load_json(path=file_path, type="json")
        return data.get("examples", [])

    def _load_data(self):
        """
        Load and split data according to the mode and dev_sample_num settings.

        Data splitting logic:
        - If dev_sample_num > 0: randomly samples examples for the dev set; the rest go to the test set
        - If dev_sample_num == 0: all data goes to the test set for evaluation
        - No training data is provided (BIGBenchHard is designed for few-shot evaluation)
        """
        task_data = self._load_data_from_file(file_name=self.file_name)

        if task_data is None:
            logger.warning(f"No data loaded for task {self.task}")
            self._train_data = []
            self._dev_data = []
            self._test_data = []
            return

        self._train_data = []

        if self.dev_sample_num > 0 and len(task_data) > self.dev_sample_num:
            logger.info(f"Sampling {self.dev_sample_num} examples for dev set, rest for test set.")
            if self.seed is not None:
                set_seed(self.seed)
            # Sample indices rather than items: membership tests on the sampled
            # items themselves would drop every copy of a duplicated example
            # from the test split (and cost O(n^2) equality checks on dicts).
            dev_indices = random.sample(range(len(task_data)), self.dev_sample_num)
            dev_index_set = set(dev_indices)
            self._dev_data = [task_data[i] for i in dev_indices]
            self._test_data = [item for i, item in enumerate(task_data) if i not in dev_index_set]
        else:
            if self.dev_sample_num > 0:
                logger.warning(
                    f"dev_sample_num ({self.dev_sample_num}) >= total data size ({len(task_data)}). "
                    f"Using all data for dev set, none for test set."
                )
                self._dev_data = task_data
                self._test_data = []
            else:
                logger.info("dev_sample_num is 0, using all data for test set.")
                self._dev_data = []
                self._test_data = task_data
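
    # Split sketch (hypothetical sizes, for illustration only): a task file with
    # 250 examples and dev_sample_num=50 yields len(_dev_data) == 50 and
    # len(_test_data) == 200; with dev_sample_num=0, all 250 land in _test_data.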

    def get_input_keys(self) -> List[str]:
        """
        Return the input keys expected by the benchmark.

        Returns:
            List containing "input" as the key for the problem text
        """
        return ["input"]

    def _get_label(self, example: Any) -> Any:
        """
        Extract the ground-truth label from an example.

        Args:
            example: The benchmark example

        Returns:
            The target answer/label
        """
        return example["target"]

    def _get_id(self, example: Any) -> Any:
        """
        Extract the unique identifier from an example.

        BIGBenchHard examples don't have explicit IDs, so the input text is used
        as the identifier.

        Args:
            example: The benchmark example

        Returns:
            The input text as a unique identifier
        """
        return example.get("input", None)

    def evaluate(self, prediction: Any, label: Any) -> dict:
        """
        Score a prediction against the ground-truth label.

        Uses exact match scoring, with task-specific handling for certain tasks.

        Args:
            prediction: The predicted answer
            label: The ground-truth answer

        Returns:
            Dictionary containing the exact match score
        """
        if self.task == "dyck_languages":
            # Bracket sequences are compared with whitespace removed, so answers
            # that differ only in spacing still count as exact matches.
            em = prediction.replace(' ', '') == label.replace(' ', '')
            return {"em": em}
        else:
            # All other tasks use the shared exact-match measure.
            em = exact_match_score(prediction=prediction, ground_truth=label)
            return {"em": em}
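

# A minimal end-to-end usage sketch (hedged: assumes the base Benchmark class
# invokes _load_data() during construction, as the lazy download above implies,
# and that the upstream BBH repository is reachable on the first run).
if __name__ == "__main__":
    bench = BIGBenchHard(task="boolean_expressions", dev_sample_num=5, seed=10)
    sample = bench._test_data[0]
    gold = bench._get_label(sample)
    # Scoring the gold target against itself should report an exact match
    # (True or 1.0, depending on what exact_match_score returns).
    print(bench.evaluate(prediction=gold, label=gold))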