# Python source, 28 lines, 835 B
from scripts.bench_hmmt import extract_final_answer, score_subset
|
|
|
|
|
|
def test_extract_final_answer_picks_last_boxed():
    """The contents of a boxed expression are returned as the final answer."""
    # NOTE(review): the response holds a single boxed expression, so the
    # "last" part of the test name is not actually exercised here.
    response = "Long reasoning... \\boxed{42} done."
    extracted = extract_final_answer(response)
    assert extracted == "42"
|
|
|
|
|
|
def test_extract_final_answer_falls_back_to_last_number():
    """With no boxed expression, the number in the response is returned."""
    response = "...therefore the answer is 17."
    extracted = extract_final_answer(response)
    assert extracted == "17"
|
|
|
|
|
|
def test_extract_final_answer_returns_empty_when_no_number():
    """A response containing no number yields an empty string."""
    extracted = extract_final_answer("no answer here")
    assert extracted == ""
|
|
|
|
|
|
def test_score_subset_counts_correct():
    """Scoring two items with one matching prediction gives 1 of 2 correct."""
    gold_items = [
        {"question": question, "answer": answer}
        for question, answer in (("q1", "42"), ("q2", "100"))
    ]
    # The first response matches the gold answer "42"; the second gives 99
    # where the gold answer is "100".
    responses = ["The answer is 42.", "Final: 99"]

    result = score_subset(gold_items, responses)

    assert result.correct == 1
    assert result.total == 2
    # Accuracy is compared with a small tolerance rather than exact equality.
    assert abs(result.accuracy - 0.5) < 1e-6
|