File size: 6,160 Bytes
289c905
 
 
b73a2c7
 
 
 
 
 
289c905
 
 
 
 
 
 
 
 
 
 
 
 
0450c4e
289c905
 
 
 
 
 
 
 
 
0450c4e
289c905
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0450c4e
289c905
 
 
 
 
0450c4e
289c905
 
 
 
 
 
 
 
 
b73a2c7
 
 
 
 
 
 
 
a06316f
b73a2c7
 
 
 
 
 
 
 
 
27ff91e
 
 
 
 
 
 
 
b73a2c7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
289c905
 
 
 
 
 
 
 
 
 
 
 
0450c4e
289c905
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
from dataclasses import dataclass
from typing import Any, ClassVar, List

from llmdataparser.base_parser import (
    DatasetDescription,
    EvaluationMetric,
    HuggingFaceDatasetParser,
    HuggingFaceParseEntry,
)


@dataclass(frozen=True, kw_only=True, slots=True)
class IFEvalParseEntry(HuggingFaceParseEntry):
    """Custom entry class for IFEval, with fields specific to this dataset parser.

    Fields below are in addition to those inherited from HuggingFaceParseEntry
    (question, answer, raw_question, raw_answer, task_name).
    """

    # Unique identifier of the example within the IFEval dataset.
    key: int
    # Identifiers of the verifiable instructions attached to this prompt.
    instruction_id_list: list[str]
    # Per-instruction parameters (e.g. required word counts or keywords).
    kwargs: dict[str, Any]

    @classmethod
    def create(
        cls,
        question: str,
        answer: str,
        raw_question: str,
        raw_answer: str,
        key: int,
        instruction_id_list: list[str],
        kwargs: dict[str, Any],
        task_name: str,
    ) -> "IFEvalParseEntry":
        """Alternate constructor that forwards every field as a keyword argument."""
        return cls(
            question=question,
            answer=answer,
            raw_question=raw_question,
            raw_answer=raw_answer,
            key=key,
            instruction_id_list=instruction_id_list,
            kwargs=kwargs,
            task_name=task_name,
        )


class IFEvalDatasetParser(HuggingFaceDatasetParser[IFEvalParseEntry]):
    """Parser for the IFEval dataset."""

    _data_source: ClassVar[str] = "google/IFEval"
    _default_task: ClassVar[str] = "default"
    _task_names: ClassVar[list[str]] = ["default"]

    def process_entry(
        self, row: dict[str, Any], task_name: str | None = None, **kwargs: Any
    ) -> IFEvalParseEntry:
        """Convert one raw IFEval row into an IFEvalParseEntry."""
        # Resolve the task: the explicit argument wins, otherwise fall back
        # to whatever the base parser considers current for this row.
        resolved_task = task_name or self._get_current_task(row)

        # The dataset ships no reference answers, only verifiable
        # instructions, so both answer fields are left empty.
        prompt = row["prompt"]

        return IFEvalParseEntry.create(
            question=str(prompt),
            answer="",
            raw_question=prompt,
            raw_answer="",
            key=row["key"],
            instruction_id_list=row["instruction_id_list"],
            kwargs=row["kwargs"],
            task_name=resolved_task,
        )

    def get_dataset_description(self) -> DatasetDescription:
        """Returns description of the IFEval dataset."""
        characteristics = (
            "Collection of approximately 500 verifiable instructions designed to evaluate "
            "language models' instruction-following capabilities. Instructions include "
            "specific, measurable criteria like 'write in more than 400 words' or "
            "'mention the keyword AI at least 3 times' that can be verified through "
            "automated heuristics. Used as a core benchmark in the Open LLM Leaderboard "
            "for evaluating chat or instruction fine-tuned language models."
        )
        citation = """@misc{zhou2023instructionfollowingevaluationlargelanguage,
    title={Instruction-Following Evaluation for Large Language Models},
    author={Jeffrey Zhou and Tianjian Lu and Swaroop Mishra and Siddhartha Brahma and Sujoy Basu and Yi Luan and Denny Zhou and Le Hou},
    year={2023},
    eprint={2311.07911},
    archivePrefix={arXiv},
    primaryClass={cs.CL},
    url={https://arxiv.org/abs/2311.07911}
    }"""
        # NOTE(review): "Programming" looks like an odd category for an
        # instruction-following benchmark — confirm against the allowed
        # category values before changing it.
        return DatasetDescription.create(
            name="IFEval",
            purpose="Evaluate instruction following capabilities through verifiable instructions",
            source="Google Research",
            language="English (BCP-47 en)",
            format="Verifiable instruction prompts with automated evaluation criteria",
            category=["Programming"],
            characteristics=characteristics,
            citation=citation,
        )

    def get_evaluation_metrics(self) -> list[EvaluationMetric]:
        """Returns recommended evaluation metrics for IFEval."""
        # (name, implementation, primary, description) — all metrics are text-based.
        metric_specs: list[tuple[str, str, bool, str]] = [
            (
                "format_compliance",
                "custom_format_checker",
                True,
                "Verifies if the output follows specified formatting rules (e.g., highlighting, bullet points, sections)",
            ),
            (
                "length_constraints",
                "custom_length_validator",
                True,
                "Checks if the response meets word, sentence, or paragraph count requirements",
            ),
            (
                "punctuation_rules",
                "custom_punctuation_checker",
                True,
                "Validates adherence to punctuation constraints (e.g., no commas, specific endings)",
            ),
            (
                "keyword_usage",
                "custom_keyword_validator",
                False,
                "Verifies correct usage of required keywords or avoidance of forbidden words",
            ),
            (
                "structural_requirements",
                "custom_structure_validator",
                False,
                "Checks for specific structural elements like sections, paragraphs, or formatting patterns",
            ),
        ]
        return [
            EvaluationMetric.create(
                name=name,
                type="text",
                description=description,
                implementation=implementation,
                primary=primary,
            )
            for name, implementation, primary, description in metric_specs
        ]


if __name__ == "__main__":
    # Example usage: download the dataset, parse it, and show the first entry.
    ifeval_parser = IFEvalDatasetParser()
    ifeval_parser.load()
    ifeval_parser.parse()

    entries = ifeval_parser.get_parsed_data
    if entries:
        sample = entries[0]
        print("\nExample parsed entry:")
        print(f"Key: {sample.key}")
        print(f"Question: {sample.question}")
        print(f"Instruction IDs: {sample.instruction_id_list}")
        print(f"kwargs: {sample.kwargs}")