import pytest

from llmdataparser.math_parser import MATHDatasetParser, MATHParseEntry


@pytest.fixture
def math_parser():
    """Create a MATH parser instance for testing."""
    return MATHDatasetParser()


@pytest.fixture
def loaded_math_parser(math_parser):
    """Create a MATH parser instance loaded with the algebra task, test split."""
    math_parser.load(task_name="algebra", split="test")
    return math_parser


@pytest.fixture
def sample_math_entries():
    """Create sample MATH dataset entries for testing."""
    return [
        {
            "problem": "Solve for x: 2x + 4 = 10",
            "level": "Level 3",
            "solution": "Let's solve step by step:\n1) Subtract 4 from both sides: 2x = 6\n2) Divide both sides by 2\n\nTherefore, x = 3",
            "type": "algebra",
        },
        {
            "problem": "Find the area of a circle with radius 5 units.",
            "level": "Level 2",
            "solution": "Area = πr²\nArea = π(5)²\nArea = 25π square units",
            "type": "geometry",
        },
        {
            "problem": "What is the limit of (x²-1)/(x-1) as x approaches 1?",
            "level": "Level 4",
            "solution": "Using L'Hôpital's rule:\nlim(x→1) (x²-1)/(x-1) = lim(x→1) (2x)/(1) = 2",
            "type": "calculus",
        },
    ]
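

# The sample_math_entries fixture is otherwise unused in this module; a minimal
# sketch of how it could be exercised, assuming process_entry accepts raw rows
# of this shape (the same shape used in test_process_entry below).
def test_process_sample_entries(math_parser, sample_math_entries):
    """Test processing each sample entry into a MATHParseEntry."""
    for raw_entry in sample_math_entries:
        entry = math_parser.process_entry(raw_entry, task_name=raw_entry["type"])
        assert isinstance(entry, MATHParseEntry)
        assert entry.raw_question == raw_entry["problem"]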


def test_math_parse_entry_creation_valid():
    """Test valid creation of MATHParseEntry with all fields."""
    entry = MATHParseEntry.create(
        question="Test question",
        answer="Test answer",
        raw_question="Test question",
        raw_answer="Test solution",
        level="Level 5",
        task_name="algebra",
        solution="Test solution",
    )

    assert isinstance(entry, MATHParseEntry)
    assert entry.question == "Test question"
    assert entry.answer == "Test answer"
    assert entry.raw_question == "Test question"
    assert entry.raw_answer == "Test solution"
    assert entry.level == "Level 5"
    assert entry.task_name == "algebra"
    assert entry.solution == "Test solution"


@pytest.mark.parametrize(
    "test_case",
    [
        {
            "problem": "Solve for x: 2x + 4 = 10",
            "level": "Level 3",
            "solution": "x = 3",
            "type": "algebra",
        },
        {
            "problem": "Find the derivative of f(x) = x²",
            "level": "Level 4",
            "solution": "f'(x) = 2x",
            "type": "calculus",
        },
    ],
)
def test_process_entry(math_parser, test_case):
    """Test processing different types of MATH entries."""
    entry = math_parser.process_entry(test_case, task_name=test_case["type"])

    assert isinstance(entry, MATHParseEntry)

    # For MATH entries, the parsed answer mirrors the raw solution text.
    assert entry.answer == test_case["solution"]
    assert entry.raw_question == test_case["problem"]
    assert entry.raw_answer == test_case["solution"]
    assert entry.level == test_case["level"]
    assert entry.task_name == test_case["type"]
    assert entry.solution == test_case["solution"]


def test_math_parser_initialization(math_parser):
    """Test MATH parser initialization and properties."""
    assert isinstance(math_parser.task_names, list)
    assert len(math_parser.task_names) == 8
    assert math_parser._data_source == "lighteval/MATH"
    assert math_parser._default_task == "all"
    assert "algebra" in math_parser.task_names
    assert "geometry" in math_parser.task_names
    assert (
        math_parser.get_huggingface_link
        == "https://huggingface.co/datasets/lighteval/MATH"
    )


def test_get_current_task(math_parser):
    """Test task name resolution in different scenarios."""
    # A valid "type" field on the row takes precedence.
    test_row_with_type = {"type": "algebra"}
    assert math_parser._get_current_task(test_row_with_type) == "algebra"

    # Without a "type" field, the parser falls back to its current task.
    test_row_without_type = {}
    math_parser._current_task = "geometry"
    assert math_parser._get_current_task(test_row_without_type) == "geometry"

    # An unrecognized "type" also falls back to the current task.
    test_row_invalid_type = {"type": "invalid_type"}
    math_parser._current_task = "algebra"
    assert math_parser._get_current_task(test_row_invalid_type) == "algebra"


def test_valid_levels(math_parser):
    """Test handling of valid level values."""
    for i in range(1, 6):
        test_row = {
            "problem": "Test problem",
            "level": f"Level {i}",
            "solution": "Test solution",
            "type": "algebra",
        }
        entry = math_parser.process_entry(test_row, task_name="algebra")
        assert entry.level == f"Level {i}"


@pytest.mark.parametrize(
    "invalid_level",
    [
        "Level 0",
        "Level 6",
        "Invalid",
        None,
        "",
        "level 1",
    ],
)
def test_invalid_level_handling(math_parser, invalid_level):
    """Test handling of invalid level values."""
    test_row = {
        "problem": "Test problem",
        "level": invalid_level,
        "solution": "Test solution",
        "type": "algebra",
    }

    entry = math_parser.process_entry(test_row, task_name="algebra")
    assert entry.level == "Unknown"
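

# A hedged sketch: a row missing the "level" key entirely is assumed to fall
# back to "Unknown" as well, mirroring the invalid-value handling above.
def test_missing_level_key(math_parser):
    """Test handling of an entry with no level field at all."""
    test_row = {
        "problem": "Test problem",
        "solution": "Test solution",
        "type": "algebra",
    }
    entry = math_parser.process_entry(test_row, task_name="algebra")
    assert entry.level == "Unknown"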


@pytest.mark.integration
def test_load_dataset(loaded_math_parser):
    """Test loading the MATH dataset."""
    assert loaded_math_parser.raw_data is not None
    assert loaded_math_parser.split_names == ["test"]
    assert loaded_math_parser._current_task == "algebra"
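

# A minimal sketch, assuming load() falls back to the parser's default task
# ("all", asserted in test_math_parser_initialization) when task_name is omitted.
@pytest.mark.integration
def test_load_default_task(math_parser):
    """Test loading the dataset without an explicit task name."""
    math_parser.load(split="test")
    assert math_parser._current_task == "all"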


def test_parser_string_representation(loaded_math_parser):
    """Test string representation of MATH parser."""
    repr_str = str(loaded_math_parser)
    assert "MATHDatasetParser" in repr_str
    assert "lighteval/MATH" in repr_str
    assert "algebra" in repr_str
    assert "loaded" in repr_str


@pytest.mark.integration
def test_different_splits_parsing(math_parser):
    """Test parsing different splits of the dataset."""
    # Parse the test split.
    math_parser.load(task_name="algebra", split="test")
    math_parser.parse(split_names="test", force=True)
    test_count = len(math_parser.get_parsed_data)

    # Parse the train split.
    math_parser.load(task_name="algebra", split="train")
    math_parser.parse(split_names="train", force=True)
    train_count = len(math_parser.get_parsed_data)

    assert test_count > 0
    assert train_count > 0
    assert train_count != test_count


def test_get_dataset_description(math_parser):
    """Test dataset description generation."""
    description = math_parser.get_dataset_description()

    assert description.name == "MATH"
    assert "Hendrycks" in description.source
    assert description.language == "English"
    assert "12,500" in description.characteristics
    assert "hendrycksmath2021" in description.citation
    assert "NeurIPS" in description.citation

    # Verify the additional metadata attached to the description.
    assert description.additional_info is not None
    assert description.additional_info["difficulty_levels"] == "1-5"
    assert "algebra" in description.additional_info["topics"]
    assert "geometry" in description.additional_info["topics"]
    assert description.additional_info["size"] == "12,500 problems"


def test_get_evaluation_metrics(math_parser):
    """Test evaluation metrics generation."""
    metrics = math_parser.get_evaluation_metrics()

    # The parser defines five metrics in total.
    assert len(metrics) == 5

    # Three of them are marked as primary.
    primary_metrics = [m for m in metrics if m.primary]
    assert len(primary_metrics) == 3

    # All expected metric names are present.
    metric_names = {m.name for m in metrics}
    assert "symbolic_equivalence" in metric_names
    assert "solution_presence" in metric_names
    assert "reasoning_validity" in metric_names
    assert "mathematical_notation" in metric_names
    assert "solution_clarity" in metric_names

    # Spot-check the definitions of the primary metrics.
    symbolic_metric = next(m for m in metrics if m.name == "symbolic_equivalence")
    assert symbolic_metric.type == "exact_match"
    assert symbolic_metric.primary is True
    assert "sympy" in symbolic_metric.description.lower()
    assert "equivalence" in symbolic_metric.description.lower()

    solution_metric = next(m for m in metrics if m.name == "solution_presence")
    assert solution_metric.type == "text"
    assert solution_metric.primary is True
    assert "step-by-step" in solution_metric.description.lower()

    reasoning_metric = next(m for m in metrics if m.name == "reasoning_validity")
    assert reasoning_metric.type == "text"
    assert reasoning_metric.primary is True
    assert "mathematical reasoning" in reasoning_metric.description.lower()
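
    # Given five metrics and three primaries (all asserted above), the remaining
    # two must be the secondary metrics; check that they are the expected pair.
    secondary_metrics = {m.name for m in metrics if not m.primary}
    assert secondary_metrics == {"mathematical_notation", "solution_clarity"}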