MLX-native port of Supertone's Supertonic 3 multilingual TTS. Runs the full flow-matching + classifier-free-guidance pipeline at ~x100 realtime on Apple Silicon, with audio cosine 1.0 vs the cached MLX path and cosine 0.98 vs the upstream ONNX Runtime reference. Weights are hosted at https://huggingface.co/ambassadia/supertonic-3-mlx and auto-downloaded on first use; this repository ships the port code, the model card, audio samples, and a zero-config setup_and_test.sh. Install: pip install git+https://gitea.tavportal.com/olivier/supertonic-3-mlx.git Quick test: git clone https://gitea.tavportal.com/olivier/supertonic-3-mlx.git cd supertonic-3-mlx && ./setup_and_test.sh Licenses (dual): model weights = BigScience Open RAIL-M (Section 4 propagation), port code = Apache-2.0. See LICENSE, LICENSE-CODE, NOTICE. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
132 lines
5.2 KiB
Bash
Executable File
132 lines
5.2 KiB
Bash
Executable File
#!/usr/bin/env bash
|
|
# Quick install + sanity-test for the supertonic-3-mlx standalone package.
|
|
#
|
|
# Creates a local ``.venv`` next to this script, installs the package and its
|
|
# runtime deps, version-checks MLX, downloads the model weights from the
|
|
# Hugging Face Hub on first run, and synthesises one short utterance to
|
|
# ``hello.wav``. Idempotent: re-running reuses the existing venv and cached
|
|
# weights.
|
|
#
|
|
# Usage:
|
|
# ./setup_and_test.sh # default: en F1, "Hello world…"
|
|
# ./setup_and_test.sh fr F2 "Bonjour."
|
|
#
|
|
# Strict mode: stop on the first failing command, treat unset variables as
# errors, and fail a pipeline when any of its stages fails.
set -o errexit -o nounset -o pipefail

# ── 0. Inputs ────────────────────────────────────────────────────────
# Positional arguments, each replaced by its default when absent or empty:
#   $1 = language code, $2 = voice name, $3 = text to synthesise.
DEFAULT_TEXT="Hello world from Apple Silicon. Supertonic 3 runs at one hundred times realtime."
LANG_CODE="${1:-en}"
VOICE="${2:-F1}"
TEXT="${3:-$DEFAULT_TEXT}"

# Absolute path of the directory holding this script; the venv lives inside it.
HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
VENV="${HERE}/.venv"
|
|
|
|
# ── 1. Python version gate ──────────────────────────────────────────
# Stop early unless a python3 interpreter at version 3.10 or newer is on
# PATH. On success the detected MAJOR.MINOR version is echoed.
command -v python3 >/dev/null || {
    echo "ERROR: python3 not found. Install Python 3.10+ first." >&2
    exit 1
}

PYVER="$(python3 -c 'import sys; print("%d.%d"%sys.version_info[:2])')"
# Split "MAJOR.MINOR" on the dot into its two integer fields.
IFS=. read -r PYMAJ PYMIN <<<"$PYVER"

if (( PYMAJ < 3 || (PYMAJ == 3 && PYMIN < 10) )); then
    echo "ERROR: Python 3.10+ required, found $PYVER." >&2
    exit 1
fi
echo "→ python3: $PYVER"
|
|
|
|
# ── 2. venv ─────────────────────────────────────────────────────────
# Paths of the interpreter and pip that every later step runs through.
PY="$VENV/bin/python"
PIP="$VENV/bin/pip"

# Reuse the virtual environment when its interpreter is already executable;
# otherwise build a fresh one with the system python3.
[ -x "$PY" ] || {
    echo "→ creating venv at $VENV …"
    python3 -m venv "$VENV"
}
|
|
|
|
# ── 3. dependencies ─────────────────────────────────────────────────
echo "→ installing dependencies …"
"$PIP" install --quiet --upgrade pip
# Install the package + the optional runtime deps. The package itself pulls in
# mlx + numpy via its pyproject.toml; we add huggingface_hub for the Hub
# download path, hf_transfer for large-blob throughput, and soundfile so the
# test script can write a WAV.
#
# hf_transfer is not optional here: the generate step opts into hf_transfer
# downloads through HF_HUB_ENABLE_HF_TRANSFER, and huggingface_hub releases
# that honour that flag raise an error when the package is missing instead
# of falling back to the regular downloader.
"$PIP" install --quiet "$HERE" huggingface_hub hf_transfer soundfile
|
|
|
|
# ── 4. MLX version gate + optional patch hook ───────────────────────
# Runs inside the venv interpreter. Exit codes of the embedded program:
#   0 = mlx is importable and at least MIN_OK,
#   1 = mlx cannot be imported,
#   2 = mlx was older than MIN_OK and has just been upgraded (re-run needed).
# Because the script runs under errexit, codes 1 and 2 abort the whole script.
"$PY" - <<'PYEOF'
import re
import sys

try:
    import mlx.core as mx
except ImportError:
    print("ERROR: mlx not importable. Are you on Apple Silicon? "
          "MLX is macOS-on-Apple-Silicon only.", file=sys.stderr)
    sys.exit(1)


def parse_version(text):
    """Return the first three numeric components of *text* as a 3-tuple.

    Only the leading digits of each dot-separated component are used, and
    missing components are filled with zeros, so "0.21" and "0.21.0rc1"
    both yield (0, 21, 0). Parsing stops at the first component that does
    not start with a digit. Always returning three integers keeps the
    comparison against MIN_OK from treating a shorter tuple as "older".
    """
    numbers = []
    for piece in text.split(".")[:3]:
        match = re.match(r"\d+", piece)
        if match is None:
            break
        numbers.append(int(match.group()))
    return tuple(numbers + [0] * (3 - len(numbers)))


# A missing __version__ attribute is treated as "0.0.0", i.e. too old.
ver_str = getattr(mx, "__version__", "0.0.0")
ver = parse_version(ver_str)
print(f"→ mlx version: {ver_str}")

# Minimum tested combination — bumped as the upstream API changes.
MIN_OK = (0, 21, 0)
if ver < MIN_OK:
    print(f" WARNING: mlx < {'.'.join(map(str, MIN_OK))}. Upgrading …",
          file=sys.stderr)
    import subprocess
    # Upgrade through the interpreter that is running this check so the
    # package lands in the same environment.
    subprocess.check_call([sys.executable, "-m", "pip", "install",
                           "--quiet", "--upgrade", "mlx"])
    print(" → upgraded. Re-run the script to pick up the new version.",
          file=sys.stderr)
    # This process has already imported the old mlx, so stop here.
    sys.exit(2)

# Patches we currently know about — none. This is the slot where future
# MLX-specific shims would land (e.g. a workaround for an upstream Conv1d
# regression). Keep the dispatch table here so the script stays a single
# source of truth.
PATCHES: dict[tuple[int, int, int], str] = {
    # (broken_version): "patch description"
}
# A patch applies only on an exact version match.
applied = [desc for v, desc in PATCHES.items() if v == ver]
if applied:
    for desc in applied:
        print(f" applied patch: {desc}")
else:
    print(f" no patches needed for mlx {ver_str}")
PYEOF
|
|
|
|
# ── 5. quickstart generate ──────────────────────────────────────────
echo "→ generating audio …"

# Only opt into hf_transfer downloads when the venv can actually import the
# package: huggingface_hub releases that honour HF_HUB_ENABLE_HF_TRANSFER=1
# raise an error when the flag is set but hf_transfer is missing.
if "$PY" -c 'import hf_transfer' >/dev/null 2>&1; then
    USE_HF_TRANSFER=1
else
    USE_HF_TRANSFER=0
fi

# The language, voice and text travel through the environment, so the
# quoted heredoc below needs no shell interpolation or escaping.
LANG_CODE="$LANG_CODE" VOICE="$VOICE" TEXT="$TEXT" \
HF_HUB_DISABLE_XET=1 \
HF_HUB_ENABLE_HF_TRANSFER="$USE_HF_TRANSFER" \
"$PY" - <<'PYEOF'
import os, time
from supertonic_3_mlx import Pipeline
import soundfile as sf

lang = os.environ["LANG_CODE"]
voice = os.environ["VOICE"]
text = os.environ["TEXT"]

# First call downloads ~ 400 MB of weights into the HF cache. Subsequent
# runs reuse the cache and load in ~ 20 ms.
t0 = time.perf_counter()
pipe = Pipeline.from_pretrained("ambassadia/supertonic-3-mlx")
load_t = time.perf_counter() - t0
print(f" load : {load_t*1000:.0f} ms")

# Warmup (compiles the kernel graph for this shape), kept outside the
# timed region so it does not count towards the generate figure.
pipe.generate("Warm.", voice=voice, lang=lang)

# Timed synthesis with a fixed seed.
t0 = time.perf_counter()
wav = pipe.generate(text, voice=voice, lang=lang, seed=42)
gen_t = time.perf_counter() - t0
# Audio length in seconds: sample count divided by the sample rate.
dur = len(wav) / pipe.sample_rate
print(f" generate : {gen_t*1000:.0f} ms")
print(f" audio : {dur:.2f} s ({len(wav)} samples @ {pipe.sample_rate} Hz)")
# Reported as a speed-up factor: seconds of audio per second of compute.
print(f" RTF : x{dur/gen_t:.0f}")
print(f" max amp : {abs(wav).max():.4f}")

# Written relative to the current working directory.
sf.write("hello.wav", wav, pipe.sample_rate)
print("\n✓ wrote hello.wav — open it to verify the synthesis sounds correct.")
PYEOF
|