|
from dataclasses import dataclass |
|
from typing import Any, ClassVar, List |
|
|
|
from llmdataparser.base_parser import ( |
|
DatasetDescription, |
|
EvaluationMetric, |
|
HuggingFaceDatasetParser, |
|
HuggingFaceParseEntry, |
|
) |
|
|
|
|
|
@dataclass(frozen=True, kw_only=True, slots=True)
class IFEvalParseEntry(HuggingFaceParseEntry):
    """Custom entry class for IFEval, with fields specific to this dataset parser.

    IFEval rows carry machine-verifiable instruction constraints rather than
    gold answers, so this entry adds the instruction metadata on top of the
    base question/answer fields.
    """

    # Unique integer identifier of the example in the IFEval dataset.
    key: int
    # IDs of the verifiable instructions attached to this prompt.
    # NOTE: use builtin generics (list[str]) for consistency with the rest of
    # the file, which already relies on 3.10+ typing syntax (dict[str, Any],
    # str | None).
    instruction_id_list: list[str]
    # Per-instruction arguments as provided by the raw dataset row
    # (e.g. word counts, required keywords).
    kwargs: dict[str, Any]

    @classmethod
    def create(
        cls,
        question: str,
        answer: str,
        raw_question: str,
        raw_answer: str,
        key: int,
        instruction_id_list: list[str],
        kwargs: dict[str, Any],
        task_name: str,
    ) -> "IFEvalParseEntry":
        """Build an IFEvalParseEntry from its field values.

        Thin keyword-forwarding alternate constructor; all arguments map
        one-to-one onto the dataclass fields (including those inherited
        from HuggingFaceParseEntry).
        """
        return cls(
            question=question,
            answer=answer,
            raw_question=raw_question,
            raw_answer=raw_answer,
            key=key,
            instruction_id_list=instruction_id_list,
            kwargs=kwargs,
            task_name=task_name,
        )
|
|
|
|
|
class IFEvalDatasetParser(HuggingFaceDatasetParser[IFEvalParseEntry]):
    """Parser for the IFEval dataset."""

    _data_source: ClassVar[str] = "google/IFEval"
    _default_task: ClassVar[str] = "default"
    _task_names: ClassVar[list[str]] = ["default"]

    def process_entry(
        self, row: dict[str, Any], task_name: str | None = None, **kwargs: Any
    ) -> IFEvalParseEntry:
        """Process a single IFEval entry."""
        prompt = row["prompt"]

        # Prefer the explicitly supplied task name; otherwise resolve it
        # from the row via the base-class helper.
        resolved_task = task_name or self._get_current_task(row)

        # IFEval has no gold-standard answers — compliance is checked by
        # automated verifiers — so both answer fields stay empty.
        return IFEvalParseEntry.create(
            question=str(prompt),
            answer="",
            raw_question=prompt,
            raw_answer="",
            key=row["key"],
            instruction_id_list=row["instruction_id_list"],
            kwargs=row["kwargs"],
            task_name=resolved_task,
        )

    def get_dataset_description(self) -> DatasetDescription:
        """Returns description of the IFEval dataset."""
        characteristics = (
            "Collection of approximately 500 verifiable instructions designed to evaluate "
            "language models' instruction-following capabilities. Instructions include "
            "specific, measurable criteria like 'write in more than 400 words' or "
            "'mention the keyword AI at least 3 times' that can be verified through "
            "automated heuristics. Used as a core benchmark in the Open LLM Leaderboard "
            "for evaluating chat or instruction fine-tuned language models."
        )
        citation = """@misc{zhou2023instructionfollowingevaluationlargelanguage,
      title={Instruction-Following Evaluation for Large Language Models},
      author={Jeffrey Zhou and Tianjian Lu and Swaroop Mishra and Siddhartha Brahma and Sujoy Basu and Yi Luan and Denny Zhou and Le Hou},
      year={2023},
      eprint={2311.07911},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2311.07911}
}"""
        return DatasetDescription.create(
            name="IFEval",
            purpose="Evaluate instruction following capabilities through verifiable instructions",
            source="Google Research",
            language="English (BCP-47 en)",
            format="Verifiable instruction prompts with automated evaluation criteria",
            category=["Programming"],
            characteristics=characteristics,
            citation=citation,
        )

    def get_evaluation_metrics(self) -> list[EvaluationMetric]:
        """Returns recommended evaluation metrics for IFEval."""
        # (name, description, implementation, primary) — all metrics are
        # text-type; primary ones form the core IFEval score.
        metric_specs: list[tuple[str, str, str, bool]] = [
            (
                "format_compliance",
                "Verifies if the output follows specified formatting rules (e.g., highlighting, bullet points, sections)",
                "custom_format_checker",
                True,
            ),
            (
                "length_constraints",
                "Checks if the response meets word, sentence, or paragraph count requirements",
                "custom_length_validator",
                True,
            ),
            (
                "punctuation_rules",
                "Validates adherence to punctuation constraints (e.g., no commas, specific endings)",
                "custom_punctuation_checker",
                True,
            ),
            (
                "keyword_usage",
                "Verifies correct usage of required keywords or avoidance of forbidden words",
                "custom_keyword_validator",
                False,
            ),
            (
                "structural_requirements",
                "Checks for specific structural elements like sections, paragraphs, or formatting patterns",
                "custom_structure_validator",
                False,
            ),
        ]
        return [
            EvaluationMetric.create(
                name=metric_name,
                type="text",
                description=description,
                implementation=implementation,
                primary=primary,
            )
            for metric_name, description, implementation, primary in metric_specs
        ]
|
|
|
|
|
if __name__ == "__main__":
    # Smoke test: load the dataset, parse it, and show the first record.
    ifeval_parser = IFEvalDatasetParser()
    ifeval_parser.load()
    ifeval_parser.parse()

    entries = ifeval_parser.get_parsed_data
    if entries:
        sample = entries[0]
        print("\nExample parsed entry:")
        print(f"Key: {sample.key}")
        print(f"Question: {sample.question}")
        print(f"Instruction IDs: {sample.instruction_id_list}")
        print(f"kwargs: {sample.kwargs}")
|