MLX-native port of Supertone's Supertonic 3 multilingual TTS. Runs the full flow-matching + classifier-free-guidance pipeline at ~x100 realtime on Apple Silicon, with audio cosine 1.0 vs the cached MLX path and cosine 0.98 vs the upstream ONNX Runtime reference. Weights are hosted at https://huggingface.co/ambassadia/supertonic-3-mlx and auto-downloaded on first use; this repository ships the port code, the model card, audio samples, and a zero-config setup_and_test.sh. Install: pip install git+https://gitea.tavportal.com/olivier/supertonic-3-mlx.git Quick test: git clone https://gitea.tavportal.com/olivier/supertonic-3-mlx.git cd supertonic-3-mlx && ./setup_and_test.sh Licenses (dual): model weights = BigScience Open RAIL-M (Section 4 propagation), port code = Apache-2.0. See LICENSE, LICENSE-CODE, NOTICE. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
226 lines
14 KiB
JSON
226 lines
14 KiB
JSON
{
|
|
"models": [
|
|
{
|
|
"model": "VectorEstimator",
|
|
"onnx": "/tmp/supertonic3/model/onnx/vector_estimator.onnx",
|
|
"safetensors": "/Users/transcrilive/MLX_CONVERTOR/sub-projects/supertonic3-mlx/hf_release/weights/vector_estimator.safetensors",
|
|
"bytes": 256053073,
|
|
"sha256": "2359240f2dcaee03b4800102aa0bea00223d2867ab752ef01af2b1cfaf92f3a6",
|
|
"weights_kept": 351,
|
|
"weights_dropped": 120,
|
|
"dropped_detail": {
|
|
"tts.ae.vector_field.proj_in.net.weight": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.0.convnext.0.pwconv1.weight": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.0.convnext.0.pwconv1.bias": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.0.convnext.0.pwconv2.weight": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.0.convnext.0.pwconv2.bias": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.0.convnext.1.pwconv1.weight": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.0.convnext.1.pwconv1.bias": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.0.convnext.1.pwconv2.weight": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.0.convnext.1.pwconv2.bias": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.0.convnext.2.pwconv1.weight": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.0.convnext.2.pwconv1.bias": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.0.convnext.2.pwconv2.weight": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.0.convnext.2.pwconv2.bias": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.0.convnext.3.pwconv1.weight": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.0.convnext.3.pwconv1.bias": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.0.convnext.3.pwconv2.weight": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.0.convnext.3.pwconv2.bias": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.2.convnext.0.pwconv1.weight": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.2.convnext.0.pwconv1.bias": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.2.convnext.0.pwconv2.weight": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.2.convnext.0.pwconv2.bias": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.4.convnext.0.pwconv1.weight": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.4.convnext.0.pwconv1.bias": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.4.convnext.0.pwconv2.weight": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.4.convnext.0.pwconv2.bias": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.6.convnext.0.pwconv1.weight": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.6.convnext.0.pwconv1.bias": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.6.convnext.0.pwconv2.weight": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.6.convnext.0.pwconv2.bias": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.6.convnext.1.pwconv1.weight": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.6.convnext.1.pwconv1.bias": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.6.convnext.1.pwconv2.weight": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.6.convnext.1.pwconv2.bias": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.6.convnext.2.pwconv1.weight": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.6.convnext.2.pwconv1.bias": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.6.convnext.2.pwconv2.weight": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.6.convnext.2.pwconv2.bias": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.6.convnext.3.pwconv1.weight": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.6.convnext.3.pwconv1.bias": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.6.convnext.3.pwconv2.weight": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.6.convnext.3.pwconv2.bias": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.8.convnext.0.pwconv1.weight": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.8.convnext.0.pwconv1.bias": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.8.convnext.0.pwconv2.weight": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.8.convnext.0.pwconv2.bias": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.10.convnext.0.pwconv1.weight": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.10.convnext.0.pwconv1.bias": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.10.convnext.0.pwconv2.weight": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.10.convnext.0.pwconv2.bias": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.12.convnext.0.pwconv1.weight": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.12.convnext.0.pwconv1.bias": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.12.convnext.0.pwconv2.weight": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.12.convnext.0.pwconv2.bias": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.12.convnext.1.pwconv1.weight": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.12.convnext.1.pwconv1.bias": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.12.convnext.1.pwconv2.weight": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.12.convnext.1.pwconv2.bias": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.12.convnext.2.pwconv1.weight": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.12.convnext.2.pwconv1.bias": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.12.convnext.2.pwconv2.weight": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.12.convnext.2.pwconv2.bias": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.12.convnext.3.pwconv1.weight": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.12.convnext.3.pwconv1.bias": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.12.convnext.3.pwconv2.weight": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.12.convnext.3.pwconv2.bias": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.14.convnext.0.pwconv1.weight": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.14.convnext.0.pwconv1.bias": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.14.convnext.0.pwconv2.weight": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.14.convnext.0.pwconv2.bias": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.16.convnext.0.pwconv1.weight": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.16.convnext.0.pwconv1.bias": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.16.convnext.0.pwconv2.weight": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.16.convnext.0.pwconv2.bias": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.18.convnext.0.pwconv1.weight": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.18.convnext.0.pwconv1.bias": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.18.convnext.0.pwconv2.weight": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.18.convnext.0.pwconv2.bias": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.18.convnext.1.pwconv1.weight": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.18.convnext.1.pwconv1.bias": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.18.convnext.1.pwconv2.weight": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.18.convnext.1.pwconv2.bias": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.18.convnext.2.pwconv1.weight": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.18.convnext.2.pwconv1.bias": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.18.convnext.2.pwconv2.weight": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.18.convnext.2.pwconv2.bias": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.18.convnext.3.pwconv1.weight": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.18.convnext.3.pwconv1.bias": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.18.convnext.3.pwconv2.weight": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.18.convnext.3.pwconv2.bias": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.20.convnext.0.pwconv1.weight": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.20.convnext.0.pwconv1.bias": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.20.convnext.0.pwconv2.weight": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.20.convnext.0.pwconv2.bias": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.22.convnext.0.pwconv1.weight": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.22.convnext.0.pwconv1.bias": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.22.convnext.0.pwconv2.weight": "not-in-model",
|
|
"tts.ae.vector_field.main_blocks.22.convnext.0.pwconv2.bias": "not-in-model",
|
|
"tts.ae.vector_field.last_convnext.convnext.0.pwconv1.weight": "not-in-model",
|
|
"tts.ae.vector_field.last_convnext.convnext.0.pwconv1.bias": "not-in-model",
|
|
"tts.ae.vector_field.last_convnext.convnext.0.pwconv2.weight": "not-in-model",
|
|
"tts.ae.vector_field.last_convnext.convnext.0.pwconv2.bias": "not-in-model",
|
|
"tts.ae.vector_field.last_convnext.convnext.1.pwconv1.weight": "not-in-model",
|
|
"tts.ae.vector_field.last_convnext.convnext.1.pwconv1.bias": "not-in-model",
|
|
"tts.ae.vector_field.last_convnext.convnext.1.pwconv2.weight": "not-in-model",
|
|
"tts.ae.vector_field.last_convnext.convnext.1.pwconv2.bias": "not-in-model",
|
|
"tts.ae.vector_field.last_convnext.convnext.2.pwconv1.weight": "not-in-model",
|
|
"tts.ae.vector_field.last_convnext.convnext.2.pwconv1.bias": "not-in-model",
|
|
"tts.ae.vector_field.last_convnext.convnext.2.pwconv2.weight": "not-in-model",
|
|
"tts.ae.vector_field.last_convnext.convnext.2.pwconv2.bias": "not-in-model",
|
|
"tts.ae.vector_field.last_convnext.convnext.3.pwconv1.weight": "not-in-model",
|
|
"tts.ae.vector_field.last_convnext.convnext.3.pwconv1.bias": "not-in-model",
|
|
"tts.ae.vector_field.last_convnext.convnext.3.pwconv2.weight": "not-in-model",
|
|
"tts.ae.vector_field.last_convnext.convnext.3.pwconv2.bias": "not-in-model",
|
|
"tts.ae.vector_field.proj_out.net.weight": "not-in-model",
|
|
"<missing>.vector_field.main_blocks.9.attn.theta": "expected-but-not-extracted",
|
|
"<missing>.vector_field.main_blocks.9.attn.increments": "expected-but-not-extracted",
|
|
"<missing>.vector_field.main_blocks.15.attn.theta": "expected-but-not-extracted",
|
|
"<missing>.vector_field.main_blocks.15.attn.increments": "expected-but-not-extracted",
|
|
"<missing>.vector_field.main_blocks.21.attn.theta": "expected-but-not-extracted",
|
|
"<missing>.vector_field.main_blocks.21.attn.increments": "expected-but-not-extracted"
|
|
},
|
|
"elapsed_s": 0.289
|
|
},
|
|
{
|
|
"model": "TextEncoder",
|
|
"onnx": "/tmp/supertonic3/model/onnx/text_encoder.onnx",
|
|
"safetensors": "/Users/transcrilive/MLX_CONVERTOR/sub-projects/supertonic3-mlx/hf_release/weights/text_encoder.safetensors",
|
|
"bytes": 36022466,
|
|
"sha256": "9df20bb79496718b36d2c0fc37636d3f78d6ef751b2899ff6dfeb975ae737ada",
|
|
"weights_kept": 146,
|
|
"weights_dropped": 0,
|
|
"dropped_detail": {},
|
|
"elapsed_s": 0.035
|
|
},
|
|
{
|
|
"model": "DurationPredictor",
|
|
"onnx": "/tmp/supertonic3/model/onnx/duration_predictor.onnx",
|
|
"safetensors": "/Users/transcrilive/MLX_CONVERTOR/sub-projects/supertonic3-mlx/hf_release/weights/duration_predictor.safetensors",
|
|
"bytes": 3470807,
|
|
"sha256": "cd473acb6e0ac27426084488ccb3b3cc184e70d05db90897e2b892846db5dcb3",
|
|
"weights_kept": 98,
|
|
"weights_dropped": 0,
|
|
"dropped_detail": {},
|
|
"elapsed_s": 0.007
|
|
},
|
|
{
|
|
"model": "Vocoder",
|
|
"onnx": "/tmp/supertonic3/model/onnx/vocoder.onnx",
|
|
"safetensors": "/Users/transcrilive/MLX_CONVERTOR/sub-projects/supertonic3-mlx/hf_release/weights/vocoder.safetensors",
|
|
"bytes": 101364763,
|
|
"sha256": "b2ec31ab7c554f6e15b9a6780554b5d3502345de7848b310966bfb4e1ea4e526",
|
|
"weights_kept": 103,
|
|
"weights_dropped": 0,
|
|
"dropped_detail": {},
|
|
"elapsed_s": 0.079
|
|
}
|
|
],
|
|
"ancillary": [
|
|
{
|
|
"name": "unicode_indexer.json",
|
|
"bytes": 277676,
|
|
"sha256": "9bf7346e43883a81f8645c81224f786d43c5b57f3641f6e7671a7d6c493cb24f"
|
|
},
|
|
{
|
|
"name": "voice_styles/F1.json",
|
|
"bytes": 292046,
|
|
"sha256": "bbdec6ee00231c2c742ad05483df5334cab3b52fda3ba38e6a07059c4563dbc2"
|
|
},
|
|
{
|
|
"name": "voice_styles/F2.json",
|
|
"bytes": 292423,
|
|
"sha256": "7c722c6a72707b1a77f035d67f0d1351ba187738e06f7683e8c72b1df3477fc6"
|
|
},
|
|
{
|
|
"name": "voice_styles/F3.json",
|
|
"bytes": 290794,
|
|
"sha256": "12f6ef2573baa2defa1128069cb59f203e3ab67c92af77b42df8a0e3a2f7c6ab"
|
|
},
|
|
{
|
|
"name": "voice_styles/F4.json",
|
|
"bytes": 291808,
|
|
"sha256": "c2fa764c1225a76dfc3e2c73e8aa4f70d9ee48793860eb34c295fff01c2e032b"
|
|
},
|
|
{
|
|
"name": "voice_styles/F5.json",
|
|
"bytes": 291479,
|
|
"sha256": "45966e73316415626cf41a7d1c6f3b4c70dbc1ba2bee5c1978ef0ce33244fc8d"
|
|
},
|
|
{
|
|
"name": "voice_styles/M1.json",
|
|
"bytes": 291748,
|
|
"sha256": "e35604687f5d23694b8e91593a93eec0e4eca6c0b02bb8ed69139ab2ea6b0a5b"
|
|
},
|
|
{
|
|
"name": "voice_styles/M2.json",
|
|
"bytes": 292055,
|
|
"sha256": "b76cbf62bac707c710cf0ae5aba5e31eea1a6339a9734bfae33ab98499534a50"
|
|
},
|
|
{
|
|
"name": "voice_styles/M3.json",
|
|
"bytes": 290198,
|
|
"sha256": "ea1ac35ccb91b0d7ecad533a2fbd0eec10c91513d8951e3b25fbba99954e159b"
|
|
},
|
|
{
|
|
"name": "voice_styles/M4.json",
|
|
"bytes": 291522,
|
|
"sha256": "ca8eefad4fcd989c9379032ff3e50738adc547eeb5e221b82593a6d7b3bac303"
|
|
},
|
|
{
|
|
"name": "voice_styles/M5.json",
|
|
"bytes": 291469,
|
|
"sha256": "dd22b92740314321f8ae11c5e87f8dd60d060f15dd3a632b5adf77f471f77af2"
|
|
}
|
|
]
|
|
} |