File size: 6,160 Bytes
289c905
 
 
b73a2c7
 
 
 
 
 
289c905
 
 
 
 
 
 
 
 
 
 
 
 
0450c4e
289c905
 
 
 
 
 
 
 
 
0450c4e
289c905
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0450c4e
289c905
 
 
 
 
0450c4e
289c905
 
 
 
 
 
 
 
 
b73a2c7
 
 
 
 
 
 
 
a06316f
b73a2c7
 
 
 
 
 
 
 
 
27ff91e
 
 
 
 
 
 
 
b73a2c7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
289c905
 
 
 
 
 
 
 
 
 
 
 
0450c4e
289c905
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
from dataclasses import dataclass
from typing import Any, ClassVar, List

from llmdataparser.base_parser import (
    DatasetDescription,
    EvaluationMetric,
    HuggingFaceDatasetParser,
    HuggingFaceParseEntry,
)


@dataclass(frozen=True, kw_only=True, slots=True)
class IFEvalParseEntry(HuggingFaceParseEntry):
    """Custom entry class for IFEval, with fields specific to this dataset parser.

    Fields below are in addition to those inherited from HuggingFaceParseEntry
    (question, answer, raw_question, raw_answer, task_name).
    """

    # Unique identifier of the example within the IFEval dataset.
    key: int
    # Identifiers of the verifiable instructions attached to this prompt.
    instruction_id_list: list[str]
    # Per-instruction parameters (e.g. required word counts or keywords).
    kwargs: dict[str, Any]

    @classmethod
    def create(
        cls,
        question: str,
        answer: str,
        raw_question: str,
        raw_answer: str,
        key: int,
        instruction_id_list: list[str],
        kwargs: dict[str, Any],
        task_name: str,
    ) -> "IFEvalParseEntry":
        """Alternate constructor that forwards every field as a keyword argument."""
        return cls(
            question=question,
            answer=answer,
            raw_question=raw_question,
            raw_answer=raw_answer,
            key=key,
            instruction_id_list=instruction_id_list,
            kwargs=kwargs,
            task_name=task_name,
        )


class IFEvalDatasetParser(HuggingFaceDatasetParser[IFEvalParseEntry]):
    """Parser for the IFEval dataset."""

    _data_source: ClassVar[str] = "google/IFEval"
    _default_task: ClassVar[str] = "default"
    _task_names: ClassVar[list[str]] = ["default"]

    def process_entry(
        self, row: dict[str, Any], task_name: str | None = None, **kwargs: Any
    ) -> IFEvalParseEntry:
        """Convert one raw IFEval row into an IFEvalParseEntry."""
        # Resolve the task: the explicit argument wins, otherwise fall back
        # to whatever the base parser considers current for this row.
        resolved_task = task_name or self._get_current_task(row)

        # The dataset ships no reference answers, only verifiable
        # instructions, so both answer fields are left empty.
        prompt = row["prompt"]

        return IFEvalParseEntry.create(
            question=str(prompt),
            answer="",
            raw_question=prompt,
            raw_answer="",
            key=row["key"],
            instruction_id_list=row["instruction_id_list"],
            kwargs=row["kwargs"],
            task_name=resolved_task,
        )

    def get_dataset_description(self) -> DatasetDescription:
        """Returns description of the IFEval dataset."""
        characteristics = (
            "Collection of approximately 500 verifiable instructions designed to evaluate "
            "language models' instruction-following capabilities. Instructions include "
            "specific, measurable criteria like 'write in more than 400 words' or "
            "'mention the keyword AI at least 3 times' that can be verified through "
            "automated heuristics. Used as a core benchmark in the Open LLM Leaderboard "
            "for evaluating chat or instruction fine-tuned language models."
        )
        citation = """@misc{zhou2023instructionfollowingevaluationlargelanguage,
    title={Instruction-Following Evaluation for Large Language Models},
    author={Jeffrey Zhou and Tianjian Lu and Swaroop Mishra and Siddhartha Brahma and Sujoy Basu and Yi Luan and Denny Zhou and Le Hou},
    year={2023},
    eprint={2311.07911},
    archivePrefix={arXiv},
    primaryClass={cs.CL},
    url={https://arxiv.org/abs/2311.07911}
    }"""
        # NOTE(review): "Programming" looks like an odd category for an
        # instruction-following benchmark — confirm against the allowed
        # category values before changing it.
        return DatasetDescription.create(
            name="IFEval",
            purpose="Evaluate instruction following capabilities through verifiable instructions",
            source="Google Research",
            language="English (BCP-47 en)",
            format="Verifiable instruction prompts with automated evaluation criteria",
            category=["Programming"],
            characteristics=characteristics,
            citation=citation,
        )

    def get_evaluation_metrics(self) -> list[EvaluationMetric]:
        """Returns recommended evaluation metrics for IFEval."""
        # (name, implementation, primary, description) — all metrics are text-based.
        metric_specs: list[tuple[str, str, bool, str]] = [
            (
                "format_compliance",
                "custom_format_checker",
                True,
                "Verifies if the output follows specified formatting rules (e.g., highlighting, bullet points, sections)",
            ),
            (
                "length_constraints",
                "custom_length_validator",
                True,
                "Checks if the response meets word, sentence, or paragraph count requirements",
            ),
            (
                "punctuation_rules",
                "custom_punctuation_checker",
                True,
                "Validates adherence to punctuation constraints (e.g., no commas, specific endings)",
            ),
            (
                "keyword_usage",
                "custom_keyword_validator",
                False,
                "Verifies correct usage of required keywords or avoidance of forbidden words",
            ),
            (
                "structural_requirements",
                "custom_structure_validator",
                False,
                "Checks for specific structural elements like sections, paragraphs, or formatting patterns",
            ),
        ]
        return [
            EvaluationMetric.create(
                name=name,
                type="text",
                description=description,
                implementation=implementation,
                primary=primary,
            )
            for name, implementation, primary, description in metric_specs
        ]


if __name__ == "__main__":
    # Example usage: download the dataset, parse it, and show the first entry.
    ifeval_parser = IFEvalDatasetParser()
    ifeval_parser.load()
    ifeval_parser.parse()

    entries = ifeval_parser.get_parsed_data
    if entries:
        sample = entries[0]
        print("\nExample parsed entry:")
        print(f"Key: {sample.key}")
        print(f"Question: {sample.question}")
        print(f"Instruction IDs: {sample.instruction_id_list}")
        print(f"kwargs: {sample.kwargs}")