"""Tests for the answer-extraction and scoring helpers in scripts.bench_hmmt."""

from scripts.bench_hmmt import extract_final_answer, score_subset


def test_extract_final_answer_picks_last_boxed():
    """The content of a LaTeX boxed expression is returned as the answer.

    NOTE(review): the input contains a single boxed expression, so the
    "last of several" behavior implied by the test name is not exercised.
    """
    text = "Long reasoning... \\boxed{42} done."
    assert extract_final_answer(text) == "42"


def test_extract_final_answer_falls_back_to_last_number():
    """Without a boxed expression, a number found in the text is returned.

    NOTE(review): the input contains only one number, so "last" is not
    distinguished from "first" by this case.
    """
    text = "...therefore the answer is 17."
    assert extract_final_answer(text) == "17"


def test_extract_final_answer_returns_empty_when_no_number():
    """Text with neither a boxed expression nor a number yields ""."""
    assert extract_final_answer("no answer here") == ""


def test_score_subset_counts_correct():
    """One matching prediction out of two gives correct=1, total=2, accuracy=0.5."""
    items = [
        {"question": "q1", "answer": "42"},
        {"question": "q2", "answer": "100"},
    ]
    # First prediction contains the reference answer 42; second gives 99
    # where the reference is 100.
    predictions = ["The answer is 42.", "Final: 99"]

    score = score_subset(items, predictions)

    assert score.correct == 1
    assert score.total == 2
    # Tolerance comparison rather than == because accuracy is a float.
    assert abs(score.accuracy - 0.5) < 1e-6