From b65bf91e376126200338e4eb2a15a339bcf0fc7d Mon Sep 17 00:00:00 2001 From: transcrilive Date: Sun, 10 May 2026 14:38:27 +0200 Subject: [PATCH] =?UTF-8?q?release:=20v0.1.1=20=E2=80=94=20enable=5Fthinki?= =?UTF-8?q?ng=3DFalse=20default=20+=20corrected=20bench=20gold=20+=20CHANG?= =?UTF-8?q?ELOG?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 18 ++++++++++++++++++ README.md | 13 ++++++++++++- pyproject.toml | 2 +- src/markovian_rsa_mlx/__init__.py | 2 +- tests/test_cli.py | 2 +- uv.lock | 2 +- 6 files changed, 34 insertions(+), 5 deletions(-) create mode 100644 CHANGELOG.md diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..d889aa2 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,18 @@ +# Changelog + +## v0.1.1 — 2026-05-10 + +### Added +- `RSAConfig.enable_thinking` field (default `False`). Toggling `<think>` mode in the chat template substantially affects output quality on math problems. +- Bench `scripts/bench_hmmt.py` now uses corrected gold answers for the placeholder HMMT-1 (66, was 100) and HMMT-5 (1, was 76). + +### Changed +- Default `enable_thinking` flipped to `False`. Empirical testing shows `<think>` mode causes the model to narrate the aggregation prompt (`"We have a user message: ..."`) instead of solving. Direct mode produces math reasoning immediately. +- `_render_chat(messages, *, enable_thinking)` signature now takes an explicit kwarg (was hardcoded to `True`). + +### Bench results +- 5/5 vanilla + 5/5 RSA on corrected HMMT subset. lift_pp +0.00pp (ceiling effect — vanilla already at 100%). + +## v0.1.0 — 2026-05-10 + +Initial public release. T=2 N=4 RSA orchestrator with audit JSONL + CLI + HMMT bench harness. diff --git a/README.md b/README.md index bb3b3d4..0102ee1 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ First MLX implementation of Zyphra's **Markovian RSA** test-time compute methodology, targeting **ZAYA1-8B** on Apple Silicon. 
Boosts reasoning accuracy by sampling N parallel reasoning traces, extracting their tails, and feeding aggregation prompts back to the model. -> **Status :** v0.1.0. Aggregation prompt is `zaya_v1` (reverse-engineered ; paper does not publish the co-trained format). HMMT'25 5-problem smoke shows ≥ 0 pp lift on M2 Pro. +> **Status :** v0.1.1. `enable_thinking=False` default ; aggregation `zaya_v1` template (reverse-engineered ; paper does not publish the co-trained format). Both vanilla and RSA score 100% on the 5-problem corrected HMMT subset (ceiling effect — a harder problem set is needed to measure real lift). ## Install @@ -43,6 +43,17 @@ markovian-rsa-mlx solve "Compute the integral of x^2 from 0 to 5" \ | `paper-16k` | 2 | 4 | 16 K | ~ 16-24 GB | paper "deployment" profile | | `paper-headline-40k` | 2 | 16 | 40 K | 32+ GB | paper headline (HMMT'25 89.6) | +## Bench results (HMMT'25 5-problem subset) + +With the corrected placeholder dataset and `enable_thinking=False` default : + +| Backend | Score | Wall time | Per-problem avg | +|---|---:|---:|---:| +| Vanilla (T=1 N=1) | 5/5 = 100% | 1085 s | 217 s | +| RSA T=2 N=2 (default-16gb) | 5/5 = 100% | 3974 s | 795 s | + +`lift_pp = +0.00pp` on this subset due to a ceiling effect (vanilla already hits 100%). Larger HMMT'25 / AIME'26 datasets are needed to measure the real lift. The system is mechanically correct (RSA outputs reference "Approach 1, Approach 2" from aggregation prompts) ; it just needs harder problems to differentiate the two backends. + ## Audit JSONL Every event of the run is one line. 
Schema in diff --git a/pyproject.toml b/pyproject.toml index a4b1c38..c055556 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "markovian-rsa-mlx" -version = "0.1.0" +version = "0.1.1" description = "Markovian RSA test-time compute methodology on MLX for ZAYA1-8B and future co-trained models" readme = "README.md" requires-python = ">=3.12,<3.14" diff --git a/src/markovian_rsa_mlx/__init__.py b/src/markovian_rsa_mlx/__init__.py index a8c3d37..7552da9 100644 --- a/src/markovian_rsa_mlx/__init__.py +++ b/src/markovian_rsa_mlx/__init__.py @@ -1,5 +1,5 @@ """Markovian RSA test-time compute methodology on MLX.""" -__version__ = "0.1.0" +__version__ = "0.1.1" from markovian_rsa_mlx.config import RSAConfig from markovian_rsa_mlx.loader import load_zaya_model diff --git a/tests/test_cli.py b/tests/test_cli.py index c356da4..5a2bd4d 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -7,7 +7,7 @@ runner = CliRunner() def test_version_command_prints_version(): result = runner.invoke(app, ["version"]) assert result.exit_code == 0 - assert "0.1.0" in result.stdout + assert "0.1.1" in result.stdout def test_solve_help_shows_required_flags(): diff --git a/uv.lock b/uv.lock index 3f999f9..98c19b4 100644 --- a/uv.lock +++ b/uv.lock @@ -421,7 +421,7 @@ wheels = [ [[package]] name = "markovian-rsa-mlx" -version = "0.1.0" +version = "0.1.1" source = { editable = "." } dependencies = [ { name = "huggingface-hub" },