supertonic-3-mlx/setup_and_test.sh

#!/usr/bin/env bash
# Quick install + sanity-test for the supertonic-3-mlx standalone package.
#
# Creates a local ``.venv`` next to this script, installs the package and its
# runtime deps, version-checks MLX, downloads the model weights from the
# Hugging Face Hub on first run, and synthesises one short utterance to
# ``hello.wav``. Idempotent: re-running reuses the existing venv and cached
# weights.
#
# Usage:
#   ./setup_and_test.sh                 # default: en F1, "Hello world…"
#   ./setup_and_test.sh fr F2 "Bonjour."
#
# Strict mode: stop on any failing command, unset variable, or failed
# pipeline stage.
set -o errexit -o nounset -o pipefail

# ── 0. Inputs ────────────────────────────────────────────────────────
# All three positional arguments are optional: language code, voice, and the
# text to synthesise. Empty or missing arguments fall back to the defaults.
DEFAULT_TEXT='Hello world from Apple Silicon. Supertonic 3 runs at one hundred times realtime.'
LANG_CODE="${1:-en}"
VOICE="${2:-F1}"
TEXT="${3:-$DEFAULT_TEXT}"

# Absolute path of the directory holding this script; the venv sits beside it.
SCRIPT_DIR_REL="$(dirname "${BASH_SOURCE[0]}")"
HERE="$(cd "$SCRIPT_DIR_REL" && pwd)"
VENV="${HERE}/.venv"

# ── 1. Python version gate ──────────────────────────────────────────
if ! command -v python3 >/dev/null; then
    echo "ERROR: python3 not found. Install Python 3.10+ first." >&2
    exit 1
fi
PYVER="$(python3 -c 'import sys; print("%d.%d"%sys.version_info[:2])')"
PYMAJ="${PYVER%.*}"; PYMIN="${PYVER#*.}"
if [ "$PYMAJ" -lt 3 ] || { [ "$PYMAJ" -eq 3 ] && [ "$PYMIN" -lt 10 ]; }; then
    echo "ERROR: Python 3.10+ required, found $PYVER." >&2
    exit 1
fi
echo "→ python3: $PYVER"

# ── 2. venv ─────────────────────────────────────────────────────────
# Entry points of the script-local virtualenv, used by every later step.
PY="$VENV/bin/python"
PIP="$VENV/bin/pip"
# Only build the venv when its interpreter is missing, so re-runs reuse it.
if [[ ! -x "$PY" ]]; then
    printf '%s\n' "→ creating venv at $VENV …"
    python3 -m venv "$VENV"
fi

# ── 3. dependencies ─────────────────────────────────────────────────
echo "→ installing dependencies …"
"$PIP" install --quiet --upgrade pip
# Install the package + the optional runtime deps. The package itself pulls in
# mlx + numpy via its pyproject.toml; we add huggingface_hub for the Hub
# download path, hf_transfer for large-blob throughput, and soundfile so the
# test script can write a WAV.
# hf_transfer must really be installed: the generate step exports
# HF_HUB_ENABLE_HF_TRANSFER=1, and huggingface_hub refuses to download when
# that flag is set but the package cannot be imported.
"$PIP" install --quiet "$HERE" huggingface_hub hf_transfer soundfile

# ── 4. MLX version gate + optional patch hook ───────────────────────
# Runs in the venv interpreter. Exit status: 1 when mlx cannot be imported,
# 2 after upgrading a too-old mlx (the script then stops under `set -e` and
# must be re-run), 0 otherwise.
"$PY" - <<'PYEOF'
import re
import sys

try:
    import mlx.core as mx
except ImportError:
    print("ERROR: mlx not importable. Are you on Apple Silicon? "
          "MLX is macOS-on-Apple-Silicon only.", file=sys.stderr)
    sys.exit(1)


def parse_version(text):
    """Return the leading ``major.minor.patch`` of *text* as a 3-tuple of ints.

    Only the leading digits of each dot-separated component are used, so
    suffixed versions such as ``0.21.0rc1`` or ``0.21.0.dev1`` parse as
    ``(0, 21, 0)``. Missing components are padded with zeros so the result
    always compares component-wise against another 3-tuple; without the
    padding a shorter tuple like ``(0, 21)`` sorts *below* ``(0, 21, 0)``
    and would trigger a spurious upgrade.
    """
    numbers = []
    for piece in text.split(".")[:3]:
        match = re.match(r"\d+", piece)
        if match is None:
            break
        numbers.append(int(match.group()))
    numbers.extend([0] * (3 - len(numbers)))
    return tuple(numbers)


ver_str = getattr(mx, "__version__", "0.0.0")
ver = parse_version(ver_str)
print(f"→ mlx version: {ver_str}")

# Minimum tested combination — bumped as the upstream API changes.
MIN_OK = (0, 21, 0)
if ver < MIN_OK:
    print(f"  WARNING: mlx < {'.'.join(map(str, MIN_OK))}. Upgrading …",
          file=sys.stderr)
    import subprocess
    subprocess.check_call([sys.executable, "-m", "pip", "install",
                           "--quiet", "--upgrade", "mlx"])
    # This process already imported the old mlx, so ask for a fresh run.
    print("  → upgraded. Re-run the script to pick up the new version.",
          file=sys.stderr)
    sys.exit(2)

# Patches we currently know about — none. This is the slot where future
# MLX-specific shims would land (e.g. a workaround for an upstream Conv1d
# regression). Keep the dispatch table here so the script stays a single
# source of truth.
PATCHES: dict[tuple[int, int, int], str] = {
    # (broken_version): "patch description"
}
applied = [desc for v, desc in PATCHES.items() if v == ver]
if applied:
    for desc in applied:
        print(f"  applied patch: {desc}")
else:
    print(f"  no patches needed for mlx {ver_str}")
PYEOF

# ── 5. quickstart generate ──────────────────────────────────────────
echo "→ generating audio …"
# huggingface_hub raises at download time when HF_HUB_ENABLE_HF_TRANSFER=1 but
# the hf_transfer package is missing, so opt in only when the venv can
# actually import it; otherwise fall back to the regular downloader.
if "$PY" -c 'import hf_transfer' >/dev/null 2>&1; then
    USE_HF_TRANSFER=1
else
    USE_HF_TRANSFER=0
fi
# The inputs are handed to Python through the environment (the heredoc is
# quoted, so no shell expansion happens inside it).
LANG_CODE="$LANG_CODE" VOICE="$VOICE" TEXT="$TEXT" \
HF_HUB_DISABLE_XET=1 \
HF_HUB_ENABLE_HF_TRANSFER="$USE_HF_TRANSFER" \
"$PY" - <<'PYEOF'
import os, time
from supertonic_3_mlx import Pipeline
import soundfile as sf

lang = os.environ["LANG_CODE"]
voice = os.environ["VOICE"]
text = os.environ["TEXT"]

# First call downloads ~ 400 MB of weights into the HF cache. Subsequent
# runs reuse the cache and load in ~ 20 ms.
t0 = time.perf_counter()
pipe = Pipeline.from_pretrained("ambassadia/supertonic-3-mlx")
load_t = time.perf_counter() - t0
print(f"  load        : {load_t*1000:.0f} ms")

# Warmup (compiles the kernel graph for this shape), kept out of the timing.
pipe.generate("Warm.", voice=voice, lang=lang)

# Timed run with a fixed seed.
t0 = time.perf_counter()
wav = pipe.generate(text, voice=voice, lang=lang, seed=42)
gen_t = time.perf_counter() - t0
dur = len(wav) / pipe.sample_rate
print(f"  generate    : {gen_t*1000:.0f} ms")
print(f"  audio       : {dur:.2f} s ({len(wav)} samples @ {pipe.sample_rate} Hz)")
# RTF is reported as audio seconds produced per wall-clock second.
print(f"  RTF         : x{dur/gen_t:.0f}")
print(f"  max amp     : {abs(wav).max():.4f}")

# Written relative to the current working directory.
sf.write("hello.wav", wav, pipe.sample_rate)
print("\n✓ wrote hello.wav — open it to verify the synthesis sounds correct.")
PYEOF