from dataclasses import dataclass
from typing import Any, ClassVar, List

from llmdataparser.base_parser import (
    DatasetDescription,
    EvaluationMetric,
    HuggingFaceDatasetParser,
    HuggingFaceParseEntry,
)


@dataclass(frozen=True, kw_only=True, slots=True)
class IFEvalParseEntry(HuggingFaceParseEntry):
    """Custom entry class for IFEval, with fields specific to this dataset parser."""

    key: int
    instruction_id_list: List[str]
    kwargs: dict[str, Any]
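
    # Illustrative field values (representative of IFEval rows, not copied from the
    # dataset): `key` identifies the prompt, `instruction_id_list` names its
    # verifiable instructions, and `kwargs` carries their parameters, e.g.
    #   key=1000
    #   instruction_id_list=["punctuation:no_comma", "length_constraints:number_words"]
    #   kwargs={"relation": "at least", "num_words": 300}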

    @classmethod
    def create(
        cls,
        question: str,
        answer: str,
        raw_question: str,
        raw_answer: str,
        key: int,
        instruction_id_list: List[str],
        kwargs: dict[str, Any],
        task_name: str,
    ) -> "IFEvalParseEntry":
        return cls(
            question=question,
            answer=answer,
            raw_question=raw_question,
            raw_answer=raw_answer,
            key=key,
            instruction_id_list=instruction_id_list,
            kwargs=kwargs,
            task_name=task_name,
        )


class IFEvalDatasetParser(HuggingFaceDatasetParser[IFEvalParseEntry]):
    """Parser for the IFEval dataset."""

    _data_source: ClassVar[str] = "google/IFEval"
    _default_task: ClassVar[str] = "default"
    _task_names: ClassVar[list[str]] = ["default"]

    def process_entry(
        self, row: dict[str, Any], task_name: str | None = None, **kwargs: Any
    ) -> IFEvalParseEntry:
        """Process a single IFEval entry."""
        # Extract fields from the row
        key = row["key"]
        raw_question = row["prompt"]  # The prompt is the raw question in this case
        instruction_id_list = row["instruction_id_list"]
        kwargs_data = row["kwargs"]

        # For IFEval, we don't have explicit answers in the dataset
        # We'll use empty strings as placeholders
        answer = ""
        raw_answer = ""
        question = str(raw_question)

        # Use task_name if provided, otherwise use default
        task = task_name or self._get_current_task(row)

        return IFEvalParseEntry.create(
            question=question,
            answer=answer,
            raw_question=raw_question,
            raw_answer=raw_answer,
            key=key,
            instruction_id_list=instruction_id_list,
            kwargs=kwargs_data,
            task_name=task,
        )
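
    # Illustrative sketch of what process_entry produces (representative values, not
    # copied from the dataset): a raw row such as
    #   {"key": 1000, "prompt": "Write a 300+ word review ... do not use any commas.",
    #    "instruction_id_list": ["punctuation:no_comma"], "kwargs": [...]}
    # maps to an IFEvalParseEntry whose question mirrors the prompt and whose answer
    # fields stay empty, since IFEval is scored against the instructions rather than
    # a gold answer.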

    def get_dataset_description(self) -> DatasetDescription:
        """Returns description of the IFEval dataset."""
        return DatasetDescription.create(
            name="IFEval",
            purpose="Evaluate instruction following capabilities through verifiable instructions",
            source="Google Research",
            language="English (BCP-47 en)",
            format="Verifiable instruction prompts with automated evaluation criteria",
            category=["Programming"],
            characteristics=(
                "Collection of approximately 500 verifiable instructions designed to evaluate "
                "language models' instruction-following capabilities. Instructions include "
                "specific, measurable criteria like 'write in more than 400 words' or "
                "'mention the keyword AI at least 3 times' that can be verified through "
                "automated heuristics. Used as a core benchmark in the Open LLM Leaderboard "
                "for evaluating chat or instruction fine-tuned language models."
            ),
            citation="""@misc{zhou2023instructionfollowingevaluationlargelanguage,
      title={Instruction-Following Evaluation for Large Language Models},
      author={Jeffrey Zhou and Tianjian Lu and Swaroop Mishra and Siddhartha Brahma and Sujoy Basu and Yi Luan and Denny Zhou and Le Hou},
      year={2023},
      eprint={2311.07911},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2311.07911}
}""",
        )

    def get_evaluation_metrics(self) -> list[EvaluationMetric]:
        """Returns recommended evaluation metrics for IFEval."""
        return [
            EvaluationMetric.create(
                name="format_compliance",
                type="text",
                description="Verifies if the output follows specified formatting rules (e.g., highlighting, bullet points, sections)",
                implementation="custom_format_checker",
                primary=True,
            ),
            EvaluationMetric.create(
                name="length_constraints",
                type="text",
                description="Checks if the response meets word, sentence, or paragraph count requirements",
                implementation="custom_length_validator",
                primary=True,
            ),
            EvaluationMetric.create(
                name="punctuation_rules",
                type="text",
                description="Validates adherence to punctuation constraints (e.g., no commas, specific endings)",
                implementation="custom_punctuation_checker",
                primary=True,
            ),
            EvaluationMetric.create(
                name="keyword_usage",
                type="text",
                description="Verifies correct usage of required keywords or avoidance of forbidden words",
                implementation="custom_keyword_validator",
                primary=False,
            ),
            EvaluationMetric.create(
                name="structural_requirements",
                type="text",
                description="Checks for specific structural elements like sections, paragraphs, or formatting patterns",
                implementation="custom_structure_validator",
                primary=False,
            ),
        ]
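

# Illustrative sketch only: the metric implementations referenced above
# ("custom_format_checker", "custom_length_validator", ...) live outside this file.
# The helpers below merely show the kind of automated heuristics IFEval relies on
# (a hypothetical word-count check and keyword-count check); they are not part of
# the parser's API.
def _example_word_count_check(response: str, min_words: int = 400) -> bool:
    """Return True if the response contains at least `min_words` whitespace-separated words."""
    return len(response.split()) >= min_words


def _example_keyword_count_check(response: str, keyword: str = "AI", min_count: int = 3) -> bool:
    """Return True if `keyword` appears at least `min_count` times in the response."""
    return response.count(keyword) >= min_count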


if __name__ == "__main__":
    # Example usage
    parser = IFEvalDatasetParser()
    parser.load()
    parser.parse()

    parsed_data = parser.get_parsed_data
    if parsed_data:
        example = parsed_data[0]
        print("\nExample parsed entry:")
        print(f"Key: {example.key}")
        print(f"Question: {example.question}")
        print(f"Instruction IDs: {example.instruction_id_list}")
        print(f"kwargs: {example.kwargs}")