v0.1.0 — initial release

MLX-native port of Supertone's Supertonic 3 multilingual TTS. Runs the full flow-matching + classifier-free-guidance pipeline at ~x100 realtime on Apple Silicon, with audio cosine 1.0 vs the cached MLX path and cosine 0.98 vs the upstream ONNX Runtime reference. Weights are hosted at https://huggingface.co/ambassadia/supertonic-3-mlx and auto-downloaded on first use; this repository ships the port code, the model card, audio samples, and a zero-config setup_and_test.sh. Install: pip install git+https://gitea.tavportal.com/olivier/supertonic-3-mlx.git Quick test: git clone https://gitea.tavportal.com/olivier/supertonic-3-mlx.git cd supertonic-3-mlx && ./setup_and_test.sh Licenses (dual): model weights = BigScience Open RAIL-M (Section 4 propagation), port code = Apache-2.0. See LICENSE, LICENSE-CODE, NOTICE. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-20 09:17:05 +02:00
commit 12dbf4a821
36 changed files with 3812 additions and 0 deletions
--- a/setup_and_test.sh
+++ b/setup_and_test.sh
@@ -0,0 +1,131 @@
+#!/usr/bin/env bash
+# Quick install + sanity-test for the supertonic-3-mlx standalone package.
+#
+# Creates a local ``.venv`` next to this script, installs the package and its
+# runtime deps, version-checks MLX, downloads the model weights from the
+# Hugging Face Hub on first run, and synthesises one short utterance to
+# ``hello.wav``. Idempotent: re-running reuses the existing venv and cached
+# weights.
+#
+# Usage:
+#   ./setup_and_test.sh                 # default: en F1, "Hello world…"
+#   ./setup_and_test.sh fr F2 "Bonjour."
+#
+set -o errexit -o nounset -o pipefail
+
+# ── 0. Inputs ────────────────────────────────────────────────────────
+LANG_CODE="${1:-en}"
+VOICE="${2:-F1}"
+TEXT="${3:-Hello world from Apple Silicon. Supertonic 3 runs at one hundred times realtime.}"
+# Absolute directory holding this script; the venv lives directly inside it.
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+VENV="${HERE}/.venv"
+
+# ── 1. Python version gate ──────────────────────────────────────────
+if ! command -v python3 >/dev/null; then
+    echo "ERROR: python3 not found. Install Python 3.10+ first." >&2
+    exit 1
+fi
+PYVER="$(python3 -c 'import sys; print("%d.%d"%sys.version_info[:2])')"
+PYMAJ="${PYVER%.*}"; PYMIN="${PYVER#*.}"
+if [ "$PYMAJ" -lt 3 ] || { [ "$PYMAJ" -eq 3 ] && [ "$PYMIN" -lt 10 ]; }; then
+    echo "ERROR: Python 3.10+ required, found $PYVER." >&2
+    exit 1
+fi
+echo "→ python3: $PYVER"
+
+# ── 2. venv ─────────────────────────────────────────────────────────
+if [ ! -x "$VENV/bin/python" ]; then
+    echo "→ creating venv at $VENV …"
+    python3 -m venv "$VENV"
+fi
+PIP="$VENV/bin/pip"
+PY="$VENV/bin/python"
+
+# ── 3. dependencies ─────────────────────────────────────────────────
+echo "→ installing dependencies …"
+"$PIP" install --quiet --upgrade pip
+# Install the package + the optional runtime deps. The package itself pulls in
+# mlx + numpy via its pyproject.toml; we add huggingface_hub for the Hub
+# download path, hf_transfer for large-blob throughput, and soundfile so the
+# test script can write a WAV.
+"$PIP" install --quiet "$HERE" huggingface_hub soundfile
+
+# ── 4. MLX version gate + optional patch hook ───────────────────────
+"$PY" - <<'PYEOF'
+import re
+import subprocess
+import sys
+
+try:
+    import mlx.core as mx
+except ImportError:
+    print("ERROR: mlx not importable. Are you on Apple Silicon? "
+          "MLX is macOS-on-Apple-Silicon only.", file=sys.stderr)
+    sys.exit(1)
+
+ver_str = getattr(mx, "__version__", "0.0.0")
+# Keep the leading digits of each of the first three components and pad with
+# zeros, so "0.21" or "0.21.0rc1" still yields a comparable 3-tuple.
+parts = (ver_str.split(".") + ["0", "0", "0"])[:3]
+ver = tuple(int(m.group()) if (m := re.match(r"\d+", p)) else 0 for p in parts)
+print(f"→ mlx version: {ver_str}")
+
+# Minimum tested combination — bumped as the upstream API changes.
+MIN_OK = (0, 21, 0)
+if ver < MIN_OK:
+    print(f"  WARNING: mlx < {'.'.join(map(str, MIN_OK))}. Upgrading …",
+          file=sys.stderr)
+    subprocess.check_call([sys.executable, "-m", "pip", "install",
+                           "--quiet", "--upgrade", "mlx"])
+    print("  → upgraded. Re-run the script to pick up the new version.",
+          file=sys.stderr)
+    sys.exit(2)  # non-zero status: the enclosing script aborts here
+
+# Patches we currently know about — none. Future MLX-specific shims (e.g. a
+# workaround for an upstream Conv1d regression) land in this dispatch table
+# so the script stays a single source of truth.
+PATCHES: dict[tuple[int, int, int], str] = {
+    # (broken_version): "patch description"
+}
+desc = PATCHES.get(ver)
+msg = f"  no patches needed for mlx {ver_str}" if desc is None else f"  applied patch: {desc}"
+print(msg)
+PYEOF
+
+# ── 5. quickstart generate ──────────────────────────────────────────
+echo "→ generating audio …"
+LANG_CODE="$LANG_CODE" VOICE="$VOICE" TEXT="$TEXT" \
+HF_HUB_DISABLE_XET=1 HF_HUB_ENABLE_HF_TRANSFER=1 \
+"$PY" - <<'PYEOF'
+import os
+import time
+from supertonic_3_mlx import Pipeline
+import soundfile as sf
+
+# Request parameters arrive through the environment set on the command above.
+language, speaker, sentence = (os.environ[k] for k in ("LANG_CODE", "VOICE", "TEXT"))
+
+# The first run fetches ~ 400 MB of weights into the HF cache; later runs
+# reuse the cache and load in ~ 20 ms.
+load_start = time.perf_counter()
+tts = Pipeline.from_pretrained("ambassadia/supertonic-3-mlx")
+load_ms = (time.perf_counter() - load_start) * 1000
+print(f"  load        : {load_ms:.0f} ms")
+
+# Untimed warmup call (compiles the kernel graph for this shape).
+tts.generate("Warm.", voice=speaker, lang=language)
+
+gen_start = time.perf_counter()
+audio = tts.generate(sentence, voice=speaker, lang=language, seed=42)
+gen_s = time.perf_counter() - gen_start
+rate = tts.sample_rate
+audio_s = len(audio) / rate
+print(f"  generate    : {gen_s*1000:.0f} ms")
+print(f"  audio       : {audio_s:.2f} s ({len(audio)} samples @ {rate} Hz)")
+print(f"  RTF         : x{audio_s/gen_s:.0f}")
+print(f"  max amp     : {abs(audio).max():.4f}")
+
+sf.write("hello.wav", audio, rate)
+print("\n✓ wrote hello.wav — open it to verify the synthesis sounds correct.")
+PYEOF