feat(bench): HMMT/AIME small-subset harness + answer extraction tests
This commit is contained in:
27
tests/test_bench_harness.py
Normal file
27
tests/test_bench_harness.py
Normal file
@@ -0,0 +1,27 @@
|
||||
from scripts.bench_hmmt import extract_final_answer, score_subset
|
||||
|
||||
|
||||
def test_extract_final_answer_picks_last_boxed():
    """Check that a boxed value embedded in surrounding prose is extracted.

    The response below contains exactly one boxed expression, and the
    extracted answer is expected to be its contents as a string.
    """
    # NOTE(review): only one boxed expression is present, so the "last"
    # part of the test name is not actually exercised here -- consider
    # adding a multi-boxed input once the intended behavior is confirmed.
    response = "Long reasoning... \\boxed{42} done."
    extracted = extract_final_answer(response)
    assert extracted == "42"
|
||||
|
||||
|
||||
def test_extract_final_answer_falls_back_to_last_number():
    """Check that a bare number is extracted when nothing is boxed.

    The response has no boxed expression; the expected answer is the
    number appearing in the sentence, without the trailing period.
    """
    # NOTE(review): the input holds a single number, so this does not
    # distinguish "last number" from "first number" -- verify against the
    # extractor if that distinction matters.
    response = "...therefore the answer is 17."
    extracted = extract_final_answer(response)
    assert extracted == "17"
|
||||
|
||||
|
||||
def test_extract_final_answer_returns_empty_when_no_number():
    """Check that text with no boxed value and no digits yields ""."""
    extracted = extract_final_answer("no answer here")
    assert extracted == ""
|
||||
|
||||
|
||||
def test_score_subset_counts_correct():
    """Check the tallies reported by score_subset for a two-item subset.

    The first prediction matches its reference answer ("42") and the
    second does not ("99" vs "100"), so the expected result is one
    correct out of two, i.e. an accuracy of 0.5.
    """
    references = [
        {"question": "q1", "answer": "42"},
        {"question": "q2", "answer": "100"},
    ]
    model_outputs = ["The answer is 42.", "Final: 99"]

    result = score_subset(references, model_outputs)

    assert result.correct == 1
    assert result.total == 2
    # Compare with a tolerance since accuracy is a floating-point ratio.
    assert abs(result.accuracy - 0.5) < 1e-6
|
||||
Reference in New Issue
Block a user