# File: markovian-rsa-mlx/tests/test_bench_harness.py
#
# 28 lines, 835 B, Python

from scripts.bench_hmmt import extract_final_answer, score_subset
def test_extract_final_answer_picks_last_boxed():
    """Check that a LaTeX boxed value is pulled out of surrounding prose.

    The sample contains exactly one boxed expression, so this pins the
    basic extraction path; it does not distinguish first from last.
    """
    response = "Long reasoning... \\boxed{42} done."
    extracted = extract_final_answer(response)
    assert extracted == "42"
def test_extract_final_answer_falls_back_to_last_number():
    """Check that a bare number is returned when nothing is boxed.

    The sample holds a single number followed by a period; the expected
    result is that number without the trailing punctuation.
    """
    response = "...therefore the answer is 17."
    extracted = extract_final_answer(response)
    assert extracted == "17"
def test_extract_final_answer_returns_empty_when_no_number():
    """Check that text with no boxed value and no digits yields ``""``."""
    extracted = extract_final_answer("no answer here")
    assert extracted == ""
def test_score_subset_counts_correct():
    """Check the tallies reported by ``score_subset`` for a two-item subset.

    The first prediction contains the reference answer and the second does
    not, so the expected result is one correct out of two (accuracy 0.5).
    """
    dataset = [
        {"question": "q1", "answer": "42"},
        {"question": "q2", "answer": "100"},
    ]
    model_outputs = ["The answer is 42.", "Final: 99"]

    result = score_subset(dataset, model_outputs)

    assert result.correct == 1
    assert result.total == 2
    # Accuracy is compared with a small tolerance rather than exact equality.
    accuracy_error = abs(result.accuracy - 0.5)
    assert accuracy_error < 1e-6