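"""Tests for the MATH dataset parser (tests/test_math_parser.py)."""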
import pytest
from llmdataparser.math_parser import MATHDatasetParser, MATHParseEntry
@pytest.fixture
def math_parser():
    """Create a MATH parser instance for testing."""
    return MATHDatasetParser()
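# Note: this fixture loads the real "algebra" test split, so tests that use it
# depend on the lighteval/MATH dataset being available (typically downloaded
# from the Hugging Face Hub on first use).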
@pytest.fixture
def loaded_math_parser(math_parser):
    """Create and load a MATH parser instance with test split."""
    math_parser.load(task_name="algebra", split="test")
    return math_parser


@pytest.fixture
def sample_math_entries():
    """Create sample MATH dataset entries for testing."""
    return [
        {
            "problem": "Solve for x: 2x + 4 = 10",
            "level": "Level 3",
            "solution": "Let's solve step by step:\n1) Subtract 4 from both sides: 2x = 6\n2) Divide both sides by 2\n\nTherefore, x = 3",
            "type": "algebra",
        },
        {
            "problem": "Find the area of a circle with radius 5 units.",
            "level": "Level 2",
            "solution": "Area = πr²\nArea = π(5)²\nArea = 25π square units",
            "type": "geometry",
        },
        {
            "problem": "What is the limit of (x²-1)/(x-1) as x approaches 1?",
            "level": "Level 4",
            "solution": "Using L'Hôpital's rule:\nlim(x→1) (x²-1)/(x-1) = lim(x→1) (2x)/(1) = 2",
            "type": "calculus",
        },
    ]
def test_math_parse_entry_creation_valid():
    """Test valid creation of MATHParseEntry with all fields."""
    entry = MATHParseEntry.create(
        question="Test question",
        answer="Test answer",
        raw_question="Test question",
        raw_answer="Test solution",
        level="Level 5",
        task_name="algebra",
        solution="Test solution",
    )

    assert isinstance(entry, MATHParseEntry)
    assert entry.question == "Test question"
    assert entry.answer == "Test answer"
    assert entry.raw_question == "Test question"
    assert entry.raw_answer == "Test solution"
    assert entry.level == "Level 5"
    assert entry.task_name == "algebra"
    assert entry.solution == "Test solution"


@pytest.mark.parametrize(
    "test_case",
    [
        {
            "problem": "Solve for x: 2x + 4 = 10",
            "level": "Level 3",
            "solution": "x = 3",
            "type": "algebra",
        },
        {
            "problem": "Find the derivative of f(x) = x²",
            "level": "Level 4",
            "solution": "f'(x) = 2x",
            "type": "calculus",
        },
    ],
)
def test_process_entry(math_parser, test_case):
    """Test processing different types of MATH entries."""
    entry = math_parser.process_entry(test_case, task_name=test_case["type"])

    assert isinstance(entry, MATHParseEntry)
    assert entry.answer == test_case["solution"]
    assert entry.raw_question == test_case["problem"]
    assert entry.raw_answer == test_case["solution"]
    assert entry.level == test_case["level"]
    assert entry.task_name == test_case["type"]
    assert entry.solution == test_case["solution"]
def test_math_parser_initialization(math_parser):
    """Test MATH parser initialization and properties."""
    assert isinstance(math_parser.task_names, list)
    assert len(math_parser.task_names) == 8
    assert math_parser._data_source == "lighteval/MATH"
    assert math_parser._default_task == "all"
    assert "algebra" in math_parser.task_names
    assert "geometry" in math_parser.task_names
    assert (
        math_parser.get_huggingface_link
        == "https://huggingface.co/datasets/lighteval/MATH"
    )


def test_get_current_task(math_parser):
    """Test task name resolution in different scenarios."""
    # Test with valid type in data entry
    test_row_with_type = {"type": "algebra"}
    assert math_parser._get_current_task(test_row_with_type) == "algebra"

    # Test without type in data entry
    test_row_without_type = {}
    math_parser._current_task = "geometry"
    assert math_parser._get_current_task(test_row_without_type) == "geometry"

    # Test with invalid type - should return current task
    test_row_invalid_type = {"type": "invalid_type"}
    math_parser._current_task = "algebra"
    assert math_parser._get_current_task(test_row_invalid_type) == "algebra"


def test_valid_levels(math_parser):
    """Test handling of valid level values."""
    for i in range(1, 6):
        test_row = {
            "problem": "Test problem",
            "level": f"Level {i}",
            "solution": "Test solution",
            "type": "algebra",
        }
        entry = math_parser.process_entry(test_row, task_name="algebra")
        assert entry.level == f"Level {i}"
@pytest.mark.parametrize(
    "invalid_level",
    [
        "Level 0",  # Too low
        "Level 6",  # Too high
        "Invalid",  # Wrong format
        None,  # Missing
        "",  # Empty
        "level 1",  # Wrong capitalization
    ],
)
def test_invalid_level_handling(math_parser, invalid_level):
    """Test handling of invalid level values."""
    test_row = {
        "problem": "Test problem",
        "level": invalid_level,
        "solution": "Test solution",
        "type": "algebra",
    }
    entry = math_parser.process_entry(test_row, task_name="algebra")
    assert entry.level == "Unknown"
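# The integration-marked tests below exercise the real lighteval/MATH dataset
# rather than in-memory samples, so they assume it can be fetched from the
# Hugging Face Hub.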
@pytest.mark.integration
def test_load_dataset(loaded_math_parser):
    """Test loading the MATH dataset."""
    assert loaded_math_parser.raw_data is not None
    assert loaded_math_parser.split_names == ["test"]
    assert loaded_math_parser._current_task == "algebra"


def test_parser_string_representation(loaded_math_parser):
    """Test string representation of MATH parser."""
    repr_str = str(loaded_math_parser)
    assert "MATHDatasetParser" in repr_str
    assert "lighteval/MATH" in repr_str
    assert "algebra" in repr_str
    assert "loaded" in repr_str


@pytest.mark.integration
def test_different_splits_parsing(math_parser):
    """Test parsing different splits of the dataset."""
    # Load and parse test split
    math_parser.load(task_name="algebra", split="test")
    math_parser.parse(split_names="test", force=True)
    test_count = len(math_parser.get_parsed_data)

    # Load and parse train split
    math_parser.load(task_name="algebra", split="train")
    math_parser.parse(split_names="train", force=True)
    train_count = len(math_parser.get_parsed_data)

    assert test_count > 0
    assert train_count > 0
    assert train_count != test_count
def test_get_dataset_description(math_parser):
    """Test dataset description generation."""
    description = math_parser.get_dataset_description()

    assert description.name == "MATH"
    assert "Hendrycks" in description.source
    assert description.language == "English"
    assert "12,500" in description.characteristics
    assert "hendrycksmath2021" in description.citation
    assert "NeurIPS" in description.citation

    # Check additional info
    assert description.additional_info is not None
    assert description.additional_info["difficulty_levels"] == "1-5"
    assert "algebra" in description.additional_info["topics"]
    assert "geometry" in description.additional_info["topics"]
    assert description.additional_info["size"] == "12,500 problems"


def test_get_evaluation_metrics(math_parser):
    """Test evaluation metrics generation."""
    metrics = math_parser.get_evaluation_metrics()

    # Check total number of metrics
    assert len(metrics) == 5

    # Check primary metrics
    primary_metrics = [m for m in metrics if m.primary]
    assert len(primary_metrics) == 3

    # Verify specific metrics exist with correct properties
    metric_names = {m.name for m in metrics}
    assert "symbolic_equivalence" in metric_names
    assert "solution_presence" in metric_names
    assert "reasoning_validity" in metric_names
    assert "mathematical_notation" in metric_names
    assert "solution_clarity" in metric_names

    # Check specific metric properties
    symbolic_metric = next(m for m in metrics if m.name == "symbolic_equivalence")
    assert symbolic_metric.type == "exact_match"
    assert symbolic_metric.primary is True
    assert "sympy" in symbolic_metric.description.lower()
    assert "equivalence" in symbolic_metric.description.lower()

    solution_metric = next(m for m in metrics if m.name == "solution_presence")
    assert solution_metric.type == "text"
    assert solution_metric.primary is True
    assert "step-by-step" in solution_metric.description.lower()

    reasoning_metric = next(m for m in metrics if m.name == "reasoning_validity")
    assert reasoning_metric.type == "text"
    assert reasoning_metric.primary is True
    assert "mathematical reasoning" in reasoning_metric.description.lower()