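"""Tests for MATHDatasetParser and MATHParseEntry from llmdataparser.math_parser.

Covers entry creation, entry processing, task-name resolution, difficulty-level
handling, dataset metadata, and evaluation metrics. Tests marked with
@pytest.mark.integration load the lighteval/MATH dataset from Hugging Face and
typically need network access; select or deselect them with pytest's -m option.
"""
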
import pytest

from llmdataparser.math_parser import MATHDatasetParser, MATHParseEntry


@pytest.fixture
def math_parser():
    """Create a MATH parser instance for testing."""
    return MATHDatasetParser()


@pytest.fixture
def loaded_math_parser(math_parser):
    """Create and load a MATH parser instance with test split."""
    math_parser.load(task_name="algebra", split="test")
    return math_parser


@pytest.fixture
def sample_math_entries():
    """Create sample MATH dataset entries for testing."""
    return [
        {
            "problem": "Solve for x: 2x + 4 = 10",
            "level": "Level 3",
            "solution": "Let's solve step by step:\n1) Subtract 4 from both sides: 2x = 6\n2) Divide both sides by 2\n\nTherefore, x = 3",
            "type": "algebra",
        },
        {
            "problem": "Find the area of a circle with radius 5 units.",
            "level": "Level 2",
            "solution": "Area = πr²\nArea = π(5)²\nArea = 25π square units",
            "type": "geometry",
        },
        {
            "problem": "What is the limit of (x²-1)/(x-1) as x approaches 1?",
            "level": "Level 4",
            "solution": "Using L'Hôpital's rule:\nlim(x→1) (x²-1)/(x-1) = lim(x→1) (2x)/(1) = 2",
            "type": "calculus",
        },
    ]


def test_math_parse_entry_creation_valid():
    """Test valid creation of MATHParseEntry with all fields."""
    entry = MATHParseEntry.create(
        question="Test question",
        answer="Test answer",
        raw_question="Test question",
        raw_answer="Test solution",
        level="Level 5",
        task_name="algebra",
        solution="Test solution",
    )

    assert isinstance(entry, MATHParseEntry)
    assert entry.question == "Test question"
    assert entry.answer == "Test answer"
    assert entry.raw_question == "Test question"
    assert entry.raw_answer == "Test solution"
    assert entry.level == "Level 5"
    assert entry.task_name == "algebra"
    assert entry.solution == "Test solution"


@pytest.mark.parametrize(
    "test_case",
    [
        {
            "problem": "Solve for x: 2x + 4 = 10",
            "level": "Level 3",
            "solution": "x = 3",
            "type": "algebra",
        },
        {
            "problem": "Find the derivative of f(x) = x²",
            "level": "Level 4",
            "solution": "f'(x) = 2x",
            "type": "calculus",
        },
    ],
)
def test_process_entry(math_parser, test_case):
    """Test processing different types of MATH entries."""
    entry = math_parser.process_entry(test_case, task_name=test_case["type"])
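    # process_entry is expected to surface the raw "solution" field as both
    # the answer and raw_answer of the parsed entry (verified below).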

    assert isinstance(entry, MATHParseEntry)

    assert entry.answer == test_case["solution"]
    assert entry.raw_question == test_case["problem"]
    assert entry.raw_answer == test_case["solution"]
    assert entry.level == test_case["level"]
    assert entry.task_name == test_case["type"]
    assert entry.solution == test_case["solution"]


def test_math_parser_initialization(math_parser):
    """Test MATH parser initialization and properties."""
    assert isinstance(math_parser.task_names, list)
    assert len(math_parser.task_names) == 8
    assert math_parser._data_source == "lighteval/MATH"
    assert math_parser._default_task == "all"
    assert "algebra" in math_parser.task_names
    assert "geometry" in math_parser.task_names
    assert (
        math_parser.get_huggingface_link
        == "https://huggingface.co/datasets/lighteval/MATH"
    )


def test_get_current_task(math_parser):
    """Test task name resolution in different scenarios."""
    # Test with valid type in data entry
    test_row_with_type = {"type": "algebra"}
    assert math_parser._get_current_task(test_row_with_type) == "algebra"

    # Test without type in data entry
    test_row_without_type = {}
    math_parser._current_task = "geometry"
    assert math_parser._get_current_task(test_row_without_type) == "geometry"

    # Test with invalid type - should return current task
    test_row_invalid_type = {"type": "invalid_type"}
    math_parser._current_task = "algebra"
    assert math_parser._get_current_task(test_row_invalid_type) == "algebra"


def test_valid_levels(math_parser):
    """Test handling of valid level values."""
    for i in range(1, 6):
        test_row = {
            "problem": "Test problem",
            "level": f"Level {i}",
            "solution": "Test solution",
            "type": "algebra",
        }
        entry = math_parser.process_entry(test_row, task_name="algebra")
        assert entry.level == f"Level {i}"


@pytest.mark.parametrize(
    "invalid_level",
    [
        "Level 0",  # Too low
        "Level 6",  # Too high
        "Invalid",  # Wrong format
        None,  # Missing
        "",  # Empty
        "level 1",  # Wrong capitalization
    ],
)
def test_invalid_level_handling(math_parser, invalid_level):
    """Test handling of invalid level values."""
    test_row = {
        "problem": "Test problem",
        "level": invalid_level,
        "solution": "Test solution",
        "type": "algebra",
    }

    entry = math_parser.process_entry(test_row, task_name="algebra")
    assert entry.level == "Unknown"


@pytest.mark.integration
def test_load_dataset(loaded_math_parser):
    """Test loading the MATH dataset."""
    assert loaded_math_parser.raw_data is not None
    assert loaded_math_parser.split_names == ["test"]
    assert loaded_math_parser._current_task == "algebra"


def test_parser_string_representation(loaded_math_parser):
    """Test string representation of MATH parser."""
    repr_str = str(loaded_math_parser)
    assert "MATHDatasetParser" in repr_str
    assert "lighteval/MATH" in repr_str
    assert "algebra" in repr_str
    assert "loaded" in repr_str


@pytest.mark.integration
def test_different_splits_parsing(math_parser):
    """Test parsing different splits of the dataset."""
    # Load and parse test split
    math_parser.load(task_name="algebra", split="test")
    math_parser.parse(split_names="test", force=True)
    test_count = len(math_parser.get_parsed_data)

    # Load and parse train split
    math_parser.load(task_name="algebra", split="train")
    math_parser.parse(split_names="train", force=True)
    train_count = len(math_parser.get_parsed_data)

    assert test_count > 0
    assert train_count > 0
    assert train_count != test_count


def test_get_dataset_description(math_parser):
    """Test dataset description generation."""
    description = math_parser.get_dataset_description()

    assert description.name == "MATH"
    assert "Hendrycks" in description.source
    assert description.language == "English"
    assert "12,500" in description.characteristics
    assert "hendrycksmath2021" in description.citation
    assert "NeurIPS" in description.citation

    # Check additional info
    assert description.additional_info is not None
    assert description.additional_info["difficulty_levels"] == "1-5"
    assert "algebra" in description.additional_info["topics"]
    assert "geometry" in description.additional_info["topics"]
    assert description.additional_info["size"] == "12,500 problems"


def test_get_evaluation_metrics(math_parser):
    """Test evaluation metrics generation."""
    metrics = math_parser.get_evaluation_metrics()

    # Check total number of metrics
    assert len(metrics) == 5

    # Check primary metrics
    primary_metrics = [m for m in metrics if m.primary]
    assert len(primary_metrics) == 3

    # Verify specific metrics exist with correct properties
    metric_names = {m.name for m in metrics}
    assert "symbolic_equivalence" in metric_names
    assert "solution_presence" in metric_names
    assert "reasoning_validity" in metric_names
    assert "mathematical_notation" in metric_names
    assert "solution_clarity" in metric_names

    # Check specific metric properties
    symbolic_metric = next(m for m in metrics if m.name == "symbolic_equivalence")
    assert symbolic_metric.type == "exact_match"
    assert symbolic_metric.primary is True
    assert "sympy" in symbolic_metric.description.lower()
    assert "equivalence" in symbolic_metric.description.lower()

    solution_metric = next(m for m in metrics if m.name == "solution_presence")
    assert solution_metric.type == "text"
    assert solution_metric.primary is True
    assert "step-by-step" in solution_metric.description.lower()

    reasoning_metric = next(m for m in metrics if m.name == "reasoning_validity")
    assert reasoning_metric.type == "text"
    assert reasoning_metric.primary is True
    assert "mathematical reasoning" in reasoning_metric.description.lower()