# Reviewer notes. Everything above the first "diff --git" header is leading
# commentary and is not part of any hunk.
#
# evaluate_gsm8k.py -- parse_numeric():
#   * Fraction pattern: (\d+)/(\d+) becomes (-?\d+)/(\d+), so an optional
#     leading minus sign is captured together with the numerator.
#   * Number pattern: -?\d+\.?\d* becomes -?\d*\.?\d+, so digits before the
#     decimal point are optional and every match must end in a digit.  With
#     the old pattern ".5" and "-.5" matched only "5"; with the new pattern
#     they match ".5" and "-.5", and a trailing "." as in "42." is no longer
#     part of the match.
#   * Only the hunk context of parse_numeric() is visible here; the input
#     guard before the first hunk, the except clause between the hunks and
#     the final conversion after the second hunk are not shown.
#
# test_evaluate_gsm8k.py -- new file, 57 lines, eight tests:
#   * Replaces sys.modules['datasets'] and sys.modules['inference'] with
#     MagicMock objects, then imports parse_numeric from evaluate_gsm8k.
#   * Covers integers, decimals (including ".5" and "-.5"), fractions
#     (including "-1/2"), a zero denominator, numbers embedded in text,
#     None/empty/whitespace input, multiple numbers, and non-string input.
#
# NOTE(review): the sys.modules assignments are unconditional and nothing in
#   the file undoes them, so the mocks presumably stay installed for the rest
#   of the test session -- confirm no other test module needs the real
#   packages.
# NOTE(review): no name from "import pytest" is referenced in the new file.
# NOTE(review): the docstring of test_parse_numeric_multiple_numbers says the
#   last numeric token is preferred, but its second assertion expects the
#   FIRST fraction (0.5); the inline comment there explains why.
# NOTE(review): the None / [] / {} / 123 expectations and the "1/0" -> 0.0
#   expectation depend on parts of parse_numeric() outside these hunks --
#   verify against the full function.
# NOTE(review): this patch was restored from a copy whose line breaks and
#   indentation had been collapsed.  Indentation and blank-line positions were
#   inferred from the hunk line counts and standard 4-space Python style --
#   confirm the context lines against evaluate_gsm8k.py before applying.
diff --git a/evaluate_gsm8k.py b/evaluate_gsm8k.py
index 3826a32..3a90718 100644
--- a/evaluate_gsm8k.py
+++ b/evaluate_gsm8k.py
@@ -18,7 +18,7 @@ def parse_numeric(s: str):
         return None

     # Try fraction first
-    frac_match = re.search(r"(\d+)/(\d+)", s)
+    frac_match = re.search(r"(-?\d+)/(\d+)", s)
     if frac_match:
         try:
             return float(Fraction(int(frac_match.group(1)), int(frac_match.group(2))))
@@ -26,7 +26,7 @@ def parse_numeric(s: str):
             pass

     # Find decimals or integers
-    nums = re.findall(r"-?\d+\.?\d*", s)
+    nums = re.findall(r"-?\d*\.?\d+", s)
     if not nums:
         return None

diff --git a/test_evaluate_gsm8k.py b/test_evaluate_gsm8k.py
new file mode 100644
index 0000000..345f25f
--- /dev/null
+++ b/test_evaluate_gsm8k.py
@@ -0,0 +1,57 @@
+import sys
+from unittest.mock import MagicMock
+
+# Mock out heavy dependencies that might be missing in sandbox
+sys.modules['datasets'] = MagicMock()
+sys.modules['inference'] = MagicMock()
+
+import pytest
+from evaluate_gsm8k import parse_numeric
+
+def test_parse_numeric_valid_integers():
+    """Test parsing of valid positive and negative integers."""
+    assert parse_numeric("42") == 42.0
+    assert parse_numeric("-42") == -42.0
+    assert parse_numeric("0") == 0.0
+
+def test_parse_numeric_valid_decimals():
+    """Test parsing of valid positive and negative decimal numbers."""
+    assert parse_numeric("3.14") == 3.14
+    assert parse_numeric("-0.5") == -0.5
+    assert parse_numeric(".5") == 0.5
+    assert parse_numeric("-.5") == -0.5
+
+def test_parse_numeric_valid_fractions():
+    """Test parsing of fractions, which are expected in GSM8K answers."""
+    assert parse_numeric("1/2") == 0.5
+    assert parse_numeric("-1/2") == -0.5
+    assert parse_numeric("3/4") == 0.75
+
+def test_parse_numeric_invalid_fractions():
+    """Test fractions with zero in the denominator, which should gracefully fall back."""
+    # "1/0" has "1" and "0" as number matches. The last number matched is "0", so it returns 0.0.
+    assert parse_numeric("1/0") == 0.0
+
+def test_parse_numeric_with_text():
+    """Test extracting numbers from strings containing text."""
+    assert parse_numeric("The answer is 42.") == 42.0
+    assert parse_numeric("My fraction is 3/4 and that's it.") == 0.75
+
+def test_parse_numeric_none_or_empty():
+    """Test None, empty, and whitespace strings."""
+    assert parse_numeric(None) is None
+    assert parse_numeric("") is None
+    assert parse_numeric(" ") is None
+    assert parse_numeric("No numbers here!") is None
+
+def test_parse_numeric_multiple_numbers():
+    """Test strings with multiple numbers. Should prefer the last numeric token."""
+    assert parse_numeric("First 10 then 20.") == 20.0
+    # re.search finds the first fraction.
+    assert parse_numeric("I have 1/2 and also 3/4.") == 0.5
+
+def test_parse_numeric_invalid_types():
+    """Test behavior with non-string types."""
+    assert parse_numeric([]) is None
+    assert parse_numeric({}) is None
+    assert parse_numeric(123) is None