from dataclasses import dataclass
from typing import Any, ClassVar

from llmdataparser.base_parser import (
DatasetDescription,
EvaluationMetric,
HuggingFaceDatasetParser,
HuggingFaceParseEntry,
)


@dataclass(frozen=True, kw_only=True, slots=True)
class MBPPParseEntry(HuggingFaceParseEntry):
"""Custom entry class for MBPP, with fields specific to this dataset parser."""
task_id: int
test_list: list[str]
test_setup_code: str
challenge_test_list: list[str]
source_file: str

    @classmethod
def create(
cls,
question: str,
answer: str,
raw_question: str,
task_id: int,
test_list: list[str],
test_setup_code: str,
challenge_test_list: list[str],
task_name: str,
source_file: str,
) -> "MBPPParseEntry":
if not isinstance(task_id, int):
raise ValueError("Task ID must be an integer")
return cls(
question=question,
answer=answer,
raw_question=raw_question,
raw_answer=answer, # In MBPP, the code solution is the raw answer
task_id=task_id,
test_list=test_list,
test_setup_code=test_setup_code,
challenge_test_list=challenge_test_list,
task_name=task_name,
source_file=source_file,
)


class MBPPDatasetParser(HuggingFaceDatasetParser[MBPPParseEntry]):
"""Parser for the MBPP (Mostly Basic Python Programming) dataset."""
_data_source: ClassVar[str] = "google-research-datasets/mbpp"
_default_task: ClassVar[str] = "full" # Can be 'full' or 'sanitized'
_task_names: ClassVar[list[str]] = ["full", "sanitized"]

    def process_entry(
self, row: dict[str, Any], task_name: str | None = None, **kwargs: Any
) -> MBPPParseEntry:
"""Process a single MBPP entry."""
        # The "full" config stores the prompt under "text"; "sanitized" uses "prompt"
        raw_question = row.get("text", row.get("prompt"))
        if raw_question is None:
            raise KeyError("MBPP row is missing both 'text' and 'prompt' fields")
answer = row["code"]
task_id = row["task_id"]
test_list = row["test_list"]
test_setup_code = row.get("test_setup_code", "")
challenge_test_list = row.get("challenge_test_list", [])
question = str(raw_question)
        # Use the caller-provided task_name, otherwise fall back to the parser's current task
        task = task_name or self._get_current_task(row)
source_file = row.get("source_file", "")
return MBPPParseEntry.create(
question=question,
answer=answer,
raw_question=raw_question,
task_id=task_id,
test_list=test_list,
test_setup_code=test_setup_code,
challenge_test_list=challenge_test_list,
task_name=task,
source_file=source_file,
)
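
    # For reference, a raw "full"-config row has roughly this shape (field names
    # are from the dataset card; the values shown are illustrative, not real
    # dataset content):
    #   {
    #       "task_id": 1,
    #       "text": "Write a function to ...",
    #       "code": "def solution(...): ...",
    #       "test_list": ["assert solution(...) == ...", ...],
    #       "test_setup_code": "",
    #       "challenge_test_list": [],
    #   }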
def get_dataset_description(self) -> DatasetDescription:
"""Returns a description of the MBPP dataset."""
return DatasetDescription.create(
name="Mostly Basic Python Problems (MBPP)",
purpose="A benchmark for evaluating code generation capabilities using entry-level Python programming problems",
source="https://github.com/google-research/google-research/tree/master/mbpp",
language="English and Python",
category=["Programming"],
format="Task descriptions in English with corresponding Python solutions and automated test cases",
characteristics=(
"Contains approximately 1,000 crowd-sourced Python programming problems "
"designed for entry-level programmers. Problems cover programming fundamentals "
"and standard library functionality. Each problem includes a task description, "
"code solution, and 3 automated test cases. A subset of the data has been "
"hand-verified by the authors."
),
citation=(
"@article{austin2021program,\n"
" title={Program Synthesis with Large Language Models},\n"
" author={Austin, Jacob and Odena, Augustus and Nye, Maxwell and Bosma, Maarten and Michalewski, Henryk and Dohan, David and Jiang, Ellen and Cai, Carrie and Terry, Michael and Le, Quoc and others},\n"
" journal={arXiv preprint arXiv:2108.07732},\n"
" year={2021}\n"
"}"
),
additional_info={
"size": "~1,000 programming problems",
"splits": "Available in full or sanitized versions",
"test_coverage": "Each problem includes 3 automated test cases",
"verification": "Subset of data has been hand-verified by authors",
},
)

    def get_evaluation_metrics(self) -> list[EvaluationMetric]:
"""Returns the recommended evaluation metrics for MBPP dataset."""
return [
EvaluationMetric.create(
name="pass@k",
type="code_evaluation",
description="Percentage of problems where at least one solution in k generations passes all test cases",
implementation="custom_pass_at_k",
primary=True,
),
EvaluationMetric.create(
name="test_case_success_rate",
type="code_evaluation",
description="Percentage of test cases passed across all problems",
implementation="custom_test_success_rate",
primary=False,
),
EvaluationMetric.create(
name="syntax_validity",
type="code_evaluation",
description="Verifies that generated code is syntactically valid Python",
implementation="custom_syntax_check",
primary=False,
),
EvaluationMetric.create(
name="code_similarity",
type="similarity",
                description="Textual similarity between generated code and the reference solution",
                implementation="custom_code_similarity",  # note: evaluate.load("code_eval") is execution-based (pass@k), not a similarity metric
primary=False,
),
]
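

def _pass_at_k(n: int, c: int, k: int) -> float:
    """Unbiased pass@k estimator (Chen et al., 2021, arXiv:2107.03374).

    A minimal sketch of what the ``custom_pass_at_k`` implementation named
    above could look like; the project's actual implementation may differ.
    Given ``n`` generated samples of which ``c`` pass all tests, returns an
    unbiased estimate of the probability that at least one of ``k`` randomly
    drawn samples is correct.
    """
    if k > n:
        raise ValueError("k cannot exceed the number of generated samples n")
    if n - c < k:
        return 1.0  # every size-k draw necessarily contains a passing sample
    # 1 - C(n - c, k) / C(n, k), expanded as a numerically stable running product
    result = 1.0
    for i in range(n - c + 1, n + 1):
        result *= 1.0 - k / i
    return 1.0 - result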


if __name__ == "__main__":
# Example usage
parser = MBPPDatasetParser()
# Load the dataset
parser.load()
# Parse all splits
parser.parse()
# Get parsed data
parsed_data = parser.get_parsed_data
# Print example entry
if parsed_data:
example = parsed_data[0]
print("\nExample parsed entry:")
print(f"Task ID: {example.task_id}")
print(f"Task: {example.raw_question}")
print(f"Solution:\n{example.answer}")
print(f"Test Cases:\n{example.test_list}")
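
        # A minimal sketch (not part of the parser API): check that the reference
        # solution is syntactically valid Python, mirroring the "syntax_validity"
        # metric recommended above.
        import ast

        try:
            ast.parse(example.answer)
            print("Reference solution parses as valid Python")
        except SyntaxError as exc:
            print(f"Syntax error in reference solution: {exc}")

        # Likewise, a rough per-entry sketch of "test_case_success_rate": run the
        # reference solution and its asserts in a scratch namespace. This is fine
        # for trusted reference code; model-generated code would need sandboxing.
        namespace: dict[str, Any] = {}
        exec(example.test_setup_code or "", namespace)
        exec(example.answer, namespace)
        passed = 0
        for test in example.test_list:
            try:
                exec(test, namespace)
                passed += 1
            except Exception:
                pass
        print(f"Passed {passed}/{len(example.test_list)} reference test cases")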