from dataclasses import dataclass
from typing import Any, ClassVar

from llmdataparser.base_parser import (
    DatasetDescription,
    EvaluationMetric,
    HuggingFaceDatasetParser,
    HuggingFaceParseEntry,
)


@dataclass(frozen=True, kw_only=True, slots=True)
class MBPPParseEntry(HuggingFaceParseEntry):
    """Custom entry class for MBPP, with fields specific to this dataset parser."""

    task_id: int
    test_list: list[str]
    test_setup_code: str
    challenge_test_list: list[str]
    source_file: str

    @classmethod
    def create(
        cls,
        question: str,
        answer: str,
        raw_question: str,
        task_id: int,
        test_list: list[str],
        test_setup_code: str,
        challenge_test_list: list[str],
        task_name: str,
        source_file: str,
    ) -> "MBPPParseEntry":
        if not isinstance(task_id, int):
            raise ValueError("Task ID must be an integer")

        return cls(
            question=question,
            answer=answer,
            raw_question=raw_question,
            raw_answer=answer,  # In MBPP, the code solution is the raw answer
            task_id=task_id,
            test_list=test_list,
            test_setup_code=test_setup_code,
            challenge_test_list=challenge_test_list,
            task_name=task_name,
            source_file=source_file,
        )


class MBPPDatasetParser(HuggingFaceDatasetParser[MBPPParseEntry]):
    """Parser for the MBPP (Mostly Basic Python Programming) dataset."""

    _data_source: ClassVar[str] = "google-research-datasets/mbpp"
    _default_task: ClassVar[str] = "full"  # Can be 'full' or 'sanitized'
    _task_names: ClassVar[list[str]] = ["full", "sanitized"]

    def process_entry(
        self, row: dict[str, Any], task_name: str | None = None, **kwargs: Any
    ) -> MBPPParseEntry:
        """Process a single MBPP entry."""
        # The "full" config stores the problem statement under "text", while
        # "sanitized" uses "prompt"; fall back to "" so a missing key does not
        # turn into the literal string "None" below.
        raw_question = row.get("text", row.get("prompt", ""))
        answer = row["code"]
        task_id = row["task_id"]
        test_list = row["test_list"]
        test_setup_code = row.get("test_setup_code", "")
        challenge_test_list = row.get("challenge_test_list", [])

        question = str(raw_question)

        # Prefer an explicit task_name; otherwise resolve the current task
        task = task_name or self._get_current_task(row)
        source_file = row.get("source_file", "")

        return MBPPParseEntry.create(
            question=question,
            answer=answer,
            raw_question=raw_question,
            task_id=task_id,
            test_list=test_list,
            test_setup_code=test_setup_code,
            challenge_test_list=challenge_test_list,
            task_name=task,
            source_file=source_file,
        )

    def get_dataset_description(self) -> DatasetDescription:
        """Returns a description of the MBPP dataset."""
        return DatasetDescription.create(
            name="Mostly Basic Python Problems (MBPP)",
            purpose="A benchmark for evaluating code generation capabilities using entry-level Python programming problems",
            source="https://github.com/google-research/google-research/tree/master/mbpp",
            language="English and Python",
            category=["Programming"],
            format="Task descriptions in English with corresponding Python solutions and automated test cases",
            characteristics=(
                "Contains approximately 1,000 crowd-sourced Python programming problems "
                "designed for entry-level programmers. Problems cover programming fundamentals "
                "and standard library functionality. Each problem includes a task description, "
                "code solution, and 3 automated test cases. A subset of the data has been "
                "hand-verified by the authors."
            ),
            citation=(
                "@article{austin2021program,\n"
                "  title={Program Synthesis with Large Language Models},\n"
                "  author={Austin, Jacob and Odena, Augustus and Nye, Maxwell and Bosma, Maarten and Michalewski, Henryk and Dohan, David and Jiang, Ellen and Cai, Carrie and Terry, Michael and Le, Quoc and others},\n"
                "  journal={arXiv preprint arXiv:2108.07732},\n"
                "  year={2021}\n"
                "}"
            ),
            additional_info={
                "size": "974 problems in 'full'; 427 hand-verified problems in 'sanitized'",
                "splits": "Both configurations provide train/test/validation/prompt splits",
                "test_coverage": "Each problem includes 3 automated test cases",
                "verification": "The 'sanitized' subset has been hand-verified by the authors",
            },
        )

    def get_evaluation_metrics(self) -> list[EvaluationMetric]:
        """Returns the recommended evaluation metrics for MBPP dataset."""
        return [
            EvaluationMetric.create(
                name="pass@k",
                type="code_evaluation",
                description="Percentage of problems where at least one solution in k generations passes all test cases",
                implementation="custom_pass_at_k",
                primary=True,
            ),
            EvaluationMetric.create(
                name="test_case_success_rate",
                type="code_evaluation",
                description="Percentage of test cases passed across all problems",
                implementation="custom_test_success_rate",
                primary=False,
            ),
            EvaluationMetric.create(
                name="syntax_validity",
                type="code_evaluation",
                description="Verifies that generated code is syntactically valid Python",
                implementation="custom_syntax_check",
                primary=False,
            ),
            EvaluationMetric.create(
                name="code_similarity",
                type="similarity",
                # Note: evaluate's 'code_eval' is execution-based pass@k, not a
                # similarity measure, so a textual metric is referenced here.
                description="Surface-level similarity between generated code and the reference solution",
                implementation="evaluate.load('bleu')",
                primary=False,
            ),
        ]
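

# Illustrative helper, not part of the parser or llmdataparser API: the pass@k
# metric above is typically scored with the unbiased estimator from Chen et
# al. (2021, arXiv:2107.03374), pass@k = 1 - C(n - c, k) / C(n, k), where n
# samples are drawn per problem and c of them pass every test case.
def pass_at_k(n: int, c: int, k: int) -> float:
    """Numerically stable unbiased pass@k estimator (sketch)."""
    if n - c < k:
        return 1.0
    # Expand 1 - C(n - c, k) / C(n, k) as a running product so no large
    # factorials are ever materialized.
    result = 1.0
    for i in range(n - c + 1, n + 1):
        result *= 1.0 - k / i
    return 1.0 - result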


if __name__ == "__main__":
    # Example usage
    parser = MBPPDatasetParser()

    # Load the dataset
    parser.load()

    # Parse all splits
    parser.parse()

    # Get parsed data
    parsed_data = parser.get_parsed_data

    # Print example entry
    if parsed_data:
        example = parsed_data[0]
        print("\nExample parsed entry:")
        print(f"Task ID: {example.task_id}")
        print(f"Task: {example.raw_question}")
        print(f"Solution:\n{example.answer}")
        print(f"Test Cases:\n{example.test_list}")