from dataclasses import dataclass
from typing import Any, ClassVar

from llmdataparser.base_parser import (
DatasetDescription,
EvaluationMetric,
HuggingFaceDatasetParser,
HuggingFaceParseEntry,
)


@dataclass(frozen=True, kw_only=True, slots=True)
class MBPPParseEntry(HuggingFaceParseEntry):
"""Custom entry class for MBPP, with fields specific to this dataset parser."""
task_id: int
test_list: list[str]
test_setup_code: str
challenge_test_list: list[str]
source_file: str

    @classmethod
def create(
cls,
question: str,
answer: str,
raw_question: str,
task_id: int,
test_list: list[str],
test_setup_code: str,
challenge_test_list: list[str],
task_name: str,
source_file: str,
) -> "MBPPParseEntry":
if not isinstance(task_id, int):
raise ValueError("Task ID must be an integer")
return cls(
question=question,
answer=answer,
raw_question=raw_question,
raw_answer=answer, # In MBPP, the code solution is the raw answer
task_id=task_id,
test_list=test_list,
test_setup_code=test_setup_code,
challenge_test_list=challenge_test_list,
task_name=task_name,
source_file=source_file,
)


class MBPPDatasetParser(HuggingFaceDatasetParser[MBPPParseEntry]):
"""Parser for the MBPP (Mostly Basic Python Programming) dataset."""
_data_source: ClassVar[str] = "google-research-datasets/mbpp"
_default_task: ClassVar[str] = "full" # Can be 'full' or 'sanitized'
_task_names: ClassVar[list[str]] = ["full", "sanitized"]

    def process_entry(
self, row: dict[str, Any], task_name: str | None = None, **kwargs: Any
) -> MBPPParseEntry:
"""Process a single MBPP entry."""
        # The "full" config stores the prompt under "text"; "sanitized" uses "prompt"
        raw_question = row.get("text", row.get("prompt"))
        if raw_question is None:
            raise KeyError("MBPP row is missing both 'text' and 'prompt' fields")
answer = row["code"]
task_id = row["task_id"]
test_list = row["test_list"]
test_setup_code = row.get("test_setup_code", "")
challenge_test_list = row.get("challenge_test_list", [])
question = str(raw_question)
        # Use the caller-provided task_name, otherwise fall back to the parser's current task
        task = task_name or self._get_current_task(row)
source_file = row.get("source_file", "")
return MBPPParseEntry.create(
question=question,
answer=answer,
raw_question=raw_question,
task_id=task_id,
test_list=test_list,
test_setup_code=test_setup_code,
challenge_test_list=challenge_test_list,
task_name=task,
source_file=source_file,
)
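
    # For reference, a raw "full"-config row has roughly this shape (field names
    # are from the dataset card; the values shown are illustrative, not real
    # dataset content):
    #   {
    #       "task_id": 1,
    #       "text": "Write a function to ...",
    #       "code": "def solution(...): ...",
    #       "test_list": ["assert solution(...) == ...", ...],
    #       "test_setup_code": "",
    #       "challenge_test_list": [],
    #   }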
def get_dataset_description(self) -> DatasetDescription:
"""Returns a description of the MBPP dataset."""
return DatasetDescription.create(
name="Mostly Basic Python Problems (MBPP)",
purpose="A benchmark for evaluating code generation capabilities using entry-level Python programming problems",
source="https://github.com/google-research/google-research/tree/master/mbpp",
language="English and Python",
category=["Programming"],
format="Task descriptions in English with corresponding Python solutions and automated test cases",
characteristics=(
"Contains approximately 1,000 crowd-sourced Python programming problems "
"designed for entry-level programmers. Problems cover programming fundamentals "
"and standard library functionality. Each problem includes a task description, "
"code solution, and 3 automated test cases. A subset of the data has been "
"hand-verified by the authors."
),
citation=(
"@article{austin2021program,\n"
" title={Program Synthesis with Large Language Models},\n"
" author={Austin, Jacob and Odena, Augustus and Nye, Maxwell and Bosma, Maarten and Michalewski, Henryk and Dohan, David and Jiang, Ellen and Cai, Carrie and Terry, Michael and Le, Quoc and others},\n"
" journal={arXiv preprint arXiv:2108.07732},\n"
" year={2021}\n"
"}"
),
additional_info={
"size": "~1,000 programming problems",
"splits": "Available in full or sanitized versions",
"test_coverage": "Each problem includes 3 automated test cases",
"verification": "Subset of data has been hand-verified by authors",
},
)

    def get_evaluation_metrics(self) -> list[EvaluationMetric]:
"""Returns the recommended evaluation metrics for MBPP dataset."""
return [
EvaluationMetric.create(
name="pass@k",
type="code_evaluation",
description="Percentage of problems where at least one solution in k generations passes all test cases",
implementation="custom_pass_at_k",
primary=True,
),
EvaluationMetric.create(
name="test_case_success_rate",
type="code_evaluation",
description="Percentage of test cases passed across all problems",
implementation="custom_test_success_rate",
primary=False,
),
EvaluationMetric.create(
name="syntax_validity",
type="code_evaluation",
description="Verifies that generated code is syntactically valid Python",
implementation="custom_syntax_check",
primary=False,
),
EvaluationMetric.create(
name="code_similarity",
type="similarity",
                description="Textual similarity between generated code and the reference solution",
                implementation="custom_code_similarity",  # note: evaluate.load("code_eval") is execution-based (pass@k), not a similarity metric
primary=False,
),
]
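

def _pass_at_k(n: int, c: int, k: int) -> float:
    """Unbiased pass@k estimator (Chen et al., 2021, arXiv:2107.03374).

    A minimal sketch of what the ``custom_pass_at_k`` implementation named
    above could look like; the project's actual implementation may differ.
    Given ``n`` generated samples of which ``c`` pass all tests, returns an
    unbiased estimate of the probability that at least one of ``k`` randomly
    drawn samples is correct.
    """
    if k > n:
        raise ValueError("k cannot exceed the number of generated samples n")
    if n - c < k:
        return 1.0  # every size-k draw necessarily contains a passing sample
    # 1 - C(n - c, k) / C(n, k), expanded as a numerically stable running product
    result = 1.0
    for i in range(n - c + 1, n + 1):
        result *= 1.0 - k / i
    return 1.0 - result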


if __name__ == "__main__":
# Example usage
parser = MBPPDatasetParser()
# Load the dataset
parser.load()
# Parse all splits
parser.parse()
# Get parsed data
parsed_data = parser.get_parsed_data
# Print example entry
if parsed_data:
example = parsed_data[0]
print("\nExample parsed entry:")
print(f"Task ID: {example.task_id}")
print(f"Task: {example.raw_question}")
print(f"Solution:\n{example.answer}")
print(f"Test Cases:\n{example.test_list}")
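
        # A minimal sketch (not part of the parser API): check that the reference
        # solution is syntactically valid Python, mirroring the "syntax_validity"
        # metric recommended above.
        import ast

        try:
            ast.parse(example.answer)
            print("Reference solution parses as valid Python")
        except SyntaxError as exc:
            print(f"Syntax error in reference solution: {exc}")

        # Likewise, a rough per-entry sketch of "test_case_success_rate": run the
        # reference solution and its asserts in a scratch namespace. This is fine
        # for trusted reference code; model-generated code would need sandboxing.
        namespace: dict[str, Any] = {}
        exec(example.test_setup_code or "", namespace)
        exec(example.answer, namespace)
        passed = 0
        for test in example.test_list:
            try:
                exec(test, namespace)
                passed += 1
            except Exception:
                pass
        print(f"Passed {passed}/{len(example.test_list)} reference test cases")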