v0.1.0 — initial release
MLX-native port of Supertone's Supertonic 3 multilingual TTS. Runs the full flow-matching + classifier-free-guidance pipeline at ~x100 realtime on Apple Silicon, with audio cosine 1.0 vs the cached MLX path and cosine 0.98 vs the upstream ONNX Runtime reference.

Weights are hosted at https://huggingface.co/ambassadia/supertonic-3-mlx and auto-downloaded on first use; this repository ships the port code, the model card, audio samples, and a zero-config setup_and_test.sh.

Install:
    pip install git+https://gitea.tavportal.com/olivier/supertonic-3-mlx.git

Quick test:
    git clone https://gitea.tavportal.com/olivier/supertonic-3-mlx.git
    cd supertonic-3-mlx && ./setup_and_test.sh

Licenses (dual): model weights = BigScience Open RAIL-M (Section 4 propagation), port code = Apache-2.0. See LICENSE, LICENSE-CODE, NOTICE.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
131
setup_and_test.sh
Executable file
131
setup_and_test.sh
Executable file
@@ -0,0 +1,131 @@
|
||||
#!/usr/bin/env bash
# Quick install + sanity test for the supertonic-3-mlx standalone package.
#
# Steps, in order:
#   1. check that python3 is available and is at least 3.10;
#   2. create a local ``.venv`` next to this script (reused when present);
#   3. install the package and its runtime deps into that venv;
#   4. version-check MLX;
#   5. load the model (weights are downloaded from the Hugging Face Hub on
#      first run) and synthesise one short utterance to ``hello.wav`` in the
#      current working directory.
#
# Idempotent: re-running reuses the existing venv and cached weights.
#
# Usage:
#   ./setup_and_test.sh                     # default: en F1, "Hello world…"
#   ./setup_and_test.sh fr F2 "Bonjour."    # <lang> <voice> <text>
#
# Abort on the first failing command, on any use of an unset variable, and
# when any stage of a pipeline fails.
set -euo pipefail

# ── 0. Inputs ────────────────────────────────────────────────────────
# Positional arguments, all optional: <lang> <voice> <text>. Each falls back
# to its default when missing or empty.
LANG_CODE="${1:-en}"
VOICE="${2:-F1}"
TEXT="${3:-Hello world from Apple Silicon. Supertonic 3 runs at one hundred times realtime.}"

# Absolute directory that contains this script. The venv is placed beside the
# script so its location does not depend on the caller's working directory.
HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
VENV="$HERE/.venv"

# ── 1. Python version gate ──────────────────────────────────────────
# Fail early, with a readable message, when python3 is missing or older than
# 3.10 — before any venv or pip work is attempted.
if ! command -v python3 >/dev/null; then
    echo "ERROR: python3 not found. Install Python 3.10+ first." >&2
    exit 1
fi

# PYVER is "<major>.<minor>"; split it on the dot into its two integers.
PYVER="$(python3 -c 'import sys; print("%d.%d" % sys.version_info[:2])')"
PYMAJ="${PYVER%%.*}"
PYMIN="${PYVER##*.}"

if (( PYMAJ < 3 || (PYMAJ == 3 && PYMIN < 10) )); then
    echo "ERROR: Python 3.10+ required, found $PYVER." >&2
    exit 1
fi
echo "→ python3: $PYVER"

# ── 2. venv ─────────────────────────────────────────────────────────
# Paths of the venv's interpreter and pip; later sections invoke these
# directly instead of activating the venv.
PY="$VENV/bin/python"
PIP="$VENV/bin/pip"

# Create the venv only when its interpreter is not already there, so that
# re-runs reuse the existing environment.
if [ ! -x "$PY" ]; then
    echo "→ creating venv at $VENV …"
    python3 -m venv "$VENV"
fi

# ── 3. dependencies ─────────────────────────────────────────────────
echo "→ installing dependencies …"
"$PIP" install --quiet --upgrade pip
# Install the package + the optional runtime deps. The package itself pulls in
# mlx + numpy via its pyproject.toml; we add huggingface_hub for the Hub
# download path, hf_transfer for large-blob throughput, and soundfile so the
# test script can write a WAV.
#
# hf_transfer must really be installed: the generate step exports
# HF_HUB_ENABLE_HF_TRANSFER=1, and huggingface_hub releases that honour that
# flag raise an error instead of falling back when the package is missing.
"$PIP" install --quiet "$HERE" huggingface_hub hf_transfer soundfile

# ── 4. MLX version gate + optional patch hook ───────────────────────
# Runs inside the venv interpreter. Exit status: 0 when MLX is importable and
# recent enough, 1 when MLX cannot be imported, 2 after an in-place upgrade
# (the user is asked to re-run). Because of `set -e`, any non-zero status
# stops the whole script here.
"$PY" - <<'PYEOF'
import re
import subprocess
import sys

try:
    import mlx.core as mx
except ImportError:
    print("ERROR: mlx not importable. Are you on Apple Silicon? "
          "MLX is macOS-on-Apple-Silicon only.", file=sys.stderr)
    sys.exit(1)


def parse_version(text: str) -> tuple[int, ...]:
    """Return the first three numeric components of *text* as a 3-tuple.

    Only the leading digits of each dot-separated component are used, and the
    result is zero-padded to length three. A suffixed version such as
    "0.21.0rc1" therefore parses as (0, 21, 0) instead of collapsing to the
    shorter tuple (0, 21), which would compare as older than (0, 21, 0).
    """
    numbers: list[int] = []
    for piece in text.split(".")[:3]:
        digits = re.match(r"\d+", piece)
        if digits is None:
            # Stop at the first component with no leading number
            # (e.g. "dev5"); everything after it is not part of the release.
            break
        numbers.append(int(digits.group()))
    numbers.extend([0] * (3 - len(numbers)))
    return tuple(numbers)


# A missing __version__ attribute is treated as "0.0.0", i.e. too old.
ver_str = getattr(mx, "__version__", "0.0.0")
ver = parse_version(ver_str)
print(f"→ mlx version: {ver_str}")

# Minimum tested combination — bumped as the upstream API changes.
MIN_OK = (0, 21, 0)
if ver < MIN_OK:
    print(f" WARNING: mlx < {'.'.join(map(str, MIN_OK))}. Upgrading …",
          file=sys.stderr)
    subprocess.check_call([sys.executable, "-m", "pip", "install",
                           "--quiet", "--upgrade", "mlx"])
    print(" → upgraded. Re-run the script to pick up the new version.",
          file=sys.stderr)
    # The old mlx is already imported in this process, so stop here rather
    # than continue with a stale module.
    sys.exit(2)

# Patches we currently know about — none. This is the slot where future
# MLX-specific shims would land (e.g. a workaround for an upstream Conv1d
# regression). Keep the dispatch table here so the script stays a single
# source of truth. Note that for now a matching entry is only reported;
# no patching code is executed.
PATCHES: dict[tuple[int, int, int], str] = {
    # (broken_version): "patch description"
}
patch_desc = PATCHES.get(ver)
if patch_desc is not None:
    print(f" applied patch: {patch_desc}")
else:
    print(f" no patches needed for mlx {ver_str}")
PYEOF

# ── 5. quickstart generate ──────────────────────────────────────────
echo "→ generating audio …"

# Request the hf_transfer download backend only when the package is actually
# importable in the venv. huggingface_hub releases that honour
# HF_HUB_ENABLE_HF_TRANSFER=1 raise an error when hf_transfer is missing, so
# an unconditional "1" can make the first-run weight download fail.
if "$PY" -c 'import hf_transfer' >/dev/null 2>&1; then
    USE_HF_TRANSFER=1
else
    USE_HF_TRANSFER=0
fi

# The user inputs travel to Python through the environment rather than being
# spliced into the program text, so arbitrary quotes in TEXT are safe. The
# heredoc delimiter is quoted, so the shell expands nothing inside it.
LANG_CODE="$LANG_CODE" VOICE="$VOICE" TEXT="$TEXT" \
HF_HUB_DISABLE_XET=1 \
HF_HUB_ENABLE_HF_TRANSFER="$USE_HF_TRANSFER" \
"$PY" - <<'PYEOF'
import os
import time

from supertonic_3_mlx import Pipeline
import soundfile as sf

lang = os.environ["LANG_CODE"]
voice = os.environ["VOICE"]
text = os.environ["TEXT"]

# First call downloads ~ 400 MB of weights into the HF cache. Subsequent
# runs reuse the cache and load in ~ 20 ms.
t0 = time.perf_counter()
pipe = Pipeline.from_pretrained("ambassadia/supertonic-3-mlx")
load_t = time.perf_counter() - t0
print(f" load : {load_t*1000:.0f} ms")

# Warmup (compiles the kernel graph for this shape). Its result is discarded
# so the timed call below does not include one-off setup cost.
pipe.generate("Warm.", voice=voice, lang=lang)

# Timed run with a fixed seed.
t0 = time.perf_counter()
wav = pipe.generate(text, voice=voice, lang=lang, seed=42)
gen_t = time.perf_counter() - t0

# Duration assumes len(wav) is the sample count — TODO confirm `generate`
# returns a 1-D waveform.
dur = len(wav) / pipe.sample_rate
print(f" generate : {gen_t*1000:.0f} ms")
print(f" audio : {dur:.2f} s ({len(wav)} samples @ {pipe.sample_rate} Hz)")
# Reported as a speed-up factor: seconds of audio per second of compute.
print(f" RTF : x{dur/gen_t:.0f}")
print(f" max amp : {abs(wav).max():.4f}")

# Relative path: the file lands in the caller's current working directory.
sf.write("hello.wav", wav, pipe.sample_rate)
print("\n✓ wrote hello.wav — open it to verify the synthesis sounds correct.")
PYEOF
|
||||
Reference in New Issue
Block a user