import pytest
from llmdataparser.humaneval_parser import (
HumanEvalDatasetParser,
HumanEvalDatasetPlusParser,
HumanEvalParseEntry,
)


@pytest.fixture
def sample_entry():
return {
"prompt": 'def add(a, b):\n """Add two numbers."""\n',
"canonical_solution": "def add(a, b):\n return a + b\n",
"task_id": "HumanEval/0",
"entry_point": "add",
"test": "def test_add(): assert add(2, 3) == 5",
}


@pytest.fixture
def parser():
return HumanEvalDatasetParser()


@pytest.fixture
def plus_parser():
return HumanEvalDatasetPlusParser()


@pytest.fixture
def plus_sample_entry():
return {
"prompt": 'def add(a, b):\n """Add two numbers."""\n',
"canonical_solution": "def add(a, b):\n return a + b\n",
"task_id": "HumanEval/0",
"entry_point": "add",
"test": "def test_add(): assert add(2, 3) == 5",
}


def test_humaneval_parse_entry_creation():
"""Test creation of HumanEvalParseEntry"""
entry = HumanEvalParseEntry.create(
question="test question",
answer="test answer",
raw_question="raw question",
task_id="HumanEval/1",
entry_point="test_func",
test="test case",
task_name="openai_humaneval",
)
assert entry.question == "test question"
assert entry.answer == "test answer"
assert entry.raw_question == "raw question"
assert entry.raw_answer == "test answer" # Should match answer
assert entry.task_id == "HumanEval/1"
assert entry.entry_point == "test_func"
assert entry.test == "test case"
assert entry.task_name == "openai_humaneval"


def test_humaneval_parse_entry_validation():
"""Test validation of required fields"""
with pytest.raises(ValueError, match="Task ID cannot be empty"):
HumanEvalParseEntry.create(
question="test",
answer="test",
raw_question="test",
task_id="", # Empty task_id should raise error
entry_point="test",
test="test",
task_name="test",
)
with pytest.raises(ValueError, match="Entry point cannot be empty"):
HumanEvalParseEntry.create(
question="test",
answer="test",
raw_question="test",
task_id="test",
entry_point="", # Empty entry_point should raise error
test="test",
task_name="test",
)
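

# The two validation cases above can also be expressed as one parametrized test.
# This is a minimal sketch that reuses only the fields already exercised in this
# file; the field/message pairs below mirror the assertions above exactly.
@pytest.mark.parametrize(
    "override, message",
    [
        ({"task_id": ""}, "Task ID cannot be empty"),
        ({"entry_point": ""}, "Entry point cannot be empty"),
    ],
)
def test_humaneval_parse_entry_validation_parametrized(override, message):
    """Parametrized variant of the empty-field validation checks."""
    kwargs = dict(
        question="test",
        answer="test",
        raw_question="test",
        task_id="test",
        entry_point="test",
        test="test",
        task_name="test",
    )
    kwargs.update(override)  # blank out the field under test
    with pytest.raises(ValueError, match=message):
        HumanEvalParseEntry.create(**kwargs)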


def test_process_entry(parser, sample_entry):
"""Test processing of a single entry"""
result = parser.process_entry(sample_entry, task_name="openai_humaneval")
assert isinstance(result, HumanEvalParseEntry)
assert result.task_id == "HumanEval/0"
assert result.entry_point == "add"
assert result.answer == sample_entry["canonical_solution"]
assert result.test == sample_entry["test"]
assert result.task_name == "openai_humaneval"


def test_parser_initialization(parser):
"""Test parser initialization and properties"""
assert parser._data_source == "openai/openai_humaneval"
assert parser._default_task == "openai_humaneval"
assert parser._task_names == ["openai_humaneval"]
assert (
parser.get_huggingface_link
== "https://huggingface.co/datasets/openai/openai_humaneval"
)


@pytest.mark.integration
def test_parser_load_and_parse(parser):
"""Integration test for loading and parsing data"""
parser.load()
parser.parse()
parsed_data = parser.get_parsed_data
assert len(parsed_data) > 0
assert all(isinstance(entry, HumanEvalParseEntry) for entry in parsed_data)


def test_get_current_task(parser, sample_entry):
"""Test _get_current_task method"""
task = parser._get_current_task(sample_entry)
assert task == parser._default_task


def test_plus_parser_initialization(plus_parser):
"""Test HumanEvalDatasetPlusParser initialization and properties"""
assert plus_parser._data_source == "evalplus/humanevalplus"
assert plus_parser._default_task == "default"
assert plus_parser._task_names == ["default"]
assert (
plus_parser.get_huggingface_link
== "https://huggingface.co/datasets/evalplus/humanevalplus"
)
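

# Sanity check shared by both parsers: the Hugging Face link should be the
# datasets URL built from the configured data source. A small sketch that only
# relies on the get_huggingface_link and _data_source attributes used above.
@pytest.mark.parametrize("parser_fixture", ["parser", "plus_parser"])
def test_huggingface_link_matches_data_source(request, parser_fixture):
    """The link should point at the datasets page for the parser's data source."""
    p = request.getfixturevalue(parser_fixture)
    assert (
        p.get_huggingface_link
        == f"https://huggingface.co/datasets/{p._data_source}"
    )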


def test_plus_process_entry(plus_parser, plus_sample_entry):
"""Test processing of a single entry in HumanEvalDatasetPlusParser"""
result = plus_parser.process_entry(plus_sample_entry, task_name="default")
assert isinstance(result, HumanEvalParseEntry)
assert result.task_id == "HumanEval/0"
assert result.entry_point == "add"
assert result.answer == plus_sample_entry["canonical_solution"]
assert result.test == plus_sample_entry["test"]
assert result.task_name == "default"
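

# Cross-parser sketch for process_entry: both parsers are expected to carry the
# raw dataset fields through to the parsed entry. Assumes only the fields already
# asserted in the two process_entry tests above.
@pytest.mark.parametrize(
    "parser_fixture, entry_fixture, task_name",
    [
        ("parser", "sample_entry", "openai_humaneval"),
        ("plus_parser", "plus_sample_entry", "default"),
    ],
)
def test_process_entry_common_fields(request, parser_fixture, entry_fixture, task_name):
    """process_entry should preserve the raw dataset fields for both parsers."""
    p = request.getfixturevalue(parser_fixture)
    entry = request.getfixturevalue(entry_fixture)
    result = p.process_entry(entry, task_name=task_name)
    assert isinstance(result, HumanEvalParseEntry)
    assert result.task_id == entry["task_id"]
    assert result.entry_point == entry["entry_point"]
    assert result.answer == entry["canonical_solution"]
    assert result.test == entry["test"]
    assert result.task_name == task_name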


@pytest.mark.integration
def test_plus_parser_load_and_parse(plus_parser):
"""Integration test for loading and parsing data with HumanEvalDatasetPlusParser"""
plus_parser.load()
plus_parser.parse()
parsed_data = plus_parser.get_parsed_data
assert len(parsed_data) > 0
assert all(isinstance(entry, HumanEvalParseEntry) for entry in parsed_data)
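

# Integration sketch building on the load/parse tests above. It assumes that every
# parsed entry's task_name is one of the parser's declared task names, using only
# the _task_names and get_parsed_data attributes already exercised in this file.
@pytest.mark.integration
@pytest.mark.parametrize("parser_fixture", ["parser", "plus_parser"])
def test_parsed_entries_use_known_task_names(request, parser_fixture):
    """Each parsed entry's task_name should appear in the parser's task names."""
    p = request.getfixturevalue(parser_fixture)
    p.load()
    p.parse()
    assert all(entry.task_name in p._task_names for entry in p.get_parsed_data)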


def test_plus_get_current_task(plus_parser, plus_sample_entry):
"""Test _get_current_task method for HumanEvalDatasetPlusParser"""
task = plus_parser._get_current_task(plus_sample_entry)
assert task == plus_parser._default_task


def test_get_dataset_description(parser, plus_parser):
"""Test dataset description generation for both parsers."""
# Test original HumanEval description
description = parser.get_dataset_description()
assert description.name == "HumanEval"
assert "code generation" in description.purpose
assert description.language == "Python"
assert "chen2021codex" in description.citation
# Test HumanEval Plus description
plus_description = plus_parser.get_dataset_description()
assert plus_description.name == "HumanEval Plus"
assert "80x more test coverage" in plus_description.purpose
assert "comprehensive test suites" in plus_description.format
assert "edge cases" in plus_description.characteristics
assert "evalplus" in plus_description.citation


def test_get_evaluation_metrics(parser):
"""Test evaluation metrics generation for both parsers."""
# Test original HumanEval metrics
metrics = parser.get_evaluation_metrics()
assert len(metrics) == 5 # Base metrics + 2 specific metrics
# Check primary metrics
primary_metrics = [m for m in metrics if m.primary]
assert len(primary_metrics) == 1 # pass@k
assert any(m.name == "pass@k" for m in primary_metrics)