from dataclasses import dataclass
from typing import Any, ClassVar

from llmdataparser.base_parser import (
    DatasetDescription,
    EvaluationMetric,
    HuggingFaceDatasetParser,
    HuggingFaceParseEntry,
)


@dataclass(frozen=True, kw_only=True, slots=True)
class MBPPParseEntry(HuggingFaceParseEntry):
    """Custom entry class for MBPP, with fields specific to this dataset parser."""

    task_id: int
    test_list: list[str]
    test_setup_code: str
    challenge_test_list: list[str]
    source_file: str

    @classmethod
    def create(
        cls,
        question: str,
        answer: str,
        raw_question: str,
        task_id: int,
        test_list: list[str],
        test_setup_code: str,
        challenge_test_list: list[str],
        task_name: str,
        source_file: str,
    ) -> "MBPPParseEntry":
        if not isinstance(task_id, int):
            raise ValueError("Task ID must be an integer")
        return cls(
            question=question,
            answer=answer,
            raw_question=raw_question,
            raw_answer=answer,  # In MBPP, the code solution is the raw answer
            task_id=task_id,
            test_list=test_list,
            test_setup_code=test_setup_code,
            challenge_test_list=challenge_test_list,
            task_name=task_name,
            source_file=source_file,
        )
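
# Illustrative usage of the entry class (hypothetical values mirroring a raw
# MBPP-style record; not executed as part of this module):
#
#   entry = MBPPParseEntry.create(
#       question="Write a python function to add two numbers.",
#       answer="def add(a, b):\n    return a + b",
#       raw_question="Write a python function to add two numbers.",
#       task_id=1,
#       test_list=["assert add(1, 2) == 3"],
#       test_setup_code="",
#       challenge_test_list=[],
#       task_name="full",
#       source_file="",
#   )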


class MBPPDatasetParser(HuggingFaceDatasetParser[MBPPParseEntry]):
    """Parser for the MBPP (Mostly Basic Python Problems) dataset."""

    _data_source: ClassVar[str] = "google-research-datasets/mbpp"
    _default_task: ClassVar[str] = "full"  # Can be 'full' or 'sanitized'
    _task_names: ClassVar[list[str]] = ["full", "sanitized"]

    def process_entry(
        self, row: dict[str, Any], task_name: str | None = None, **kwargs: Any
    ) -> MBPPParseEntry:
        """Process a single MBPP entry."""
        raw_question = row.get("text", row.get("prompt"))
        answer = row["code"]
        task_id = row["task_id"]
        test_list = row["test_list"]
        test_setup_code = row.get("test_setup_code", "")
        challenge_test_list = row.get("challenge_test_list", [])
        question = str(raw_question)

        # Use task_name if provided, otherwise fall back to the parser's current task
        task = task_name or self._get_current_task(row)
        source_file = row.get("source_file", "")

        return MBPPParseEntry.create(
            question=question,
            answer=answer,
            raw_question=raw_question,
            task_id=task_id,
            test_list=test_list,
            test_setup_code=test_setup_code,
            challenge_test_list=challenge_test_list,
            task_name=task,
            source_file=source_file,
        )
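
    # For reference: rows in the 'full' configuration store the problem statement
    # under "text", while the 'sanitized' configuration uses "prompt" and adds a
    # "source_file" field; hence the row.get(...) fallbacks above.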

    def get_dataset_description(self) -> DatasetDescription:
        """Returns a description of the MBPP dataset."""
        return DatasetDescription.create(
            name="Mostly Basic Python Problems (MBPP)",
            purpose="A benchmark for evaluating code generation capabilities using entry-level Python programming problems",
            source="https://github.com/google-research/google-research/tree/master/mbpp",
            language="English and Python",
            category=["Programming"],
            format="Task descriptions in English with corresponding Python solutions and automated test cases",
            characteristics=(
                "Contains approximately 1,000 crowd-sourced Python programming problems "
                "designed for entry-level programmers. Problems cover programming fundamentals "
                "and standard library functionality. Each problem includes a task description, "
                "code solution, and 3 automated test cases. A subset of the data has been "
                "hand-verified by the authors."
            ),
            citation=(
                "@article{austin2021program,\n"
                " title={Program Synthesis with Large Language Models},\n"
                " author={Austin, Jacob and Odena, Augustus and Nye, Maxwell and Bosma, Maarten and Michalewski, Henryk and Dohan, David and Jiang, Ellen and Cai, Carrie and Terry, Michael and Le, Quoc and others},\n"
                " journal={arXiv preprint arXiv:2108.07732},\n"
                " year={2021}\n"
                "}"
            ),
            additional_info={
                "size": "~1,000 programming problems",
                "splits": "Available in full or sanitized versions",
                "test_coverage": "Each problem includes 3 automated test cases",
                "verification": "Subset of data has been hand-verified by authors",
            },
        )

    def get_evaluation_metrics(self) -> list[EvaluationMetric]:
        """Returns the recommended evaluation metrics for the MBPP dataset."""
        return [
            EvaluationMetric.create(
                name="pass@k",
                type="code_evaluation",
                description="Percentage of problems where at least one solution in k generations passes all test cases",
                implementation="custom_pass_at_k",
                primary=True,
            ),
            EvaluationMetric.create(
                name="test_case_success_rate",
                type="code_evaluation",
                description="Percentage of test cases passed across all problems",
                implementation="custom_test_success_rate",
                primary=False,
            ),
            EvaluationMetric.create(
                name="syntax_validity",
                type="code_evaluation",
                description="Verifies that generated code is syntactically valid Python",
                implementation="custom_syntax_check",
                primary=False,
            ),
            EvaluationMetric.create(
                name="code_similarity",
                type="similarity",
                description="Similarity between generated code and reference solution",
                implementation="evaluate.load('code_eval')",
                primary=False,
            ),
        ]
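
    # Note on "custom_pass_at_k": the usual reference formula is the unbiased
    # estimator from Chen et al. (2021), where n samples are generated per problem
    # and c of them pass every test in `test_list` (sketch only, not used here):
    #
    #     pass@k = 1 - comb(n - c, k) / comb(n, k)    # math.comb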


if __name__ == "__main__":
    # Example usage
    parser = MBPPDatasetParser()

    # Load the dataset
    parser.load()

    # Parse all splits
    parser.parse()

    # Get parsed data
    parsed_data = parser.get_parsed_data

    # Print example entry
    if parsed_data:
        example = parsed_data[0]
        print("\nExample parsed entry:")
        print(f"Task ID: {example.task_id}")
        print(f"Task: {example.raw_question}")
        print(f"Solution:\n{example.answer}")
        print(f"Test Cases:\n{example.test_list}")