commit 2b1a3c1312df4a06199d96d06bcee4fc0473bd7c Author: transcrilive Date: Sat May 9 16:05:39 2026 +0200 feat: initial public release v0.1.0 — MLX port of pyannote-speaker-diarization-3.1 Byte-parity with pyannote-PyTorch reference (cosine 0.763718 identical at 6 decimals on 200 cross-window slot pairs). 2.5x faster than pyannote-MPS on Apple Silicon native. Extracted from gitea.tavportal.com/olivier/MLX_CONVERTOR commit 5f9eafa. diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..fc6eea7 --- /dev/null +++ b/.gitignore @@ -0,0 +1,18 @@ +__pycache__/ +*.py[cod] +*.class +*.so +.Python +.venv/ +venv/ +ENV/ +dist/ +build/ +*.egg-info/ +.eggs/ +.DS_Store +.env +*.log +.pytest_cache/ +.ruff_cache/ +*.orig diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..08999ae --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2026 Olivier Dupont + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/README.md b/README.md new file mode 100644 index 0000000..81791ef --- /dev/null +++ b/README.md @@ -0,0 +1,57 @@ +# pyannote-speaker-diarization-3.1-mlx + +First MLX port of pyannote-speaker-diarization-3.1 with byte-parity to the PyTorch reference. 2.5x faster than pyannote-MPS on Apple Silicon native. + +## Install + +```bash +uv add "pyannote-speaker-diarization-3.1-mlx @ git+https://gitea.tavportal.com/olivier/pyannote-speaker-diarization-3.1-mlx.git" +``` + +## Quickstart + +```python +from pyannote_diarization_3_1_mlx import MlxDiarizationPipeline + +pipeline = MlxDiarizationPipeline.from_pretrained("pyannote/speaker-diarization-3.1") +diarization = pipeline("audio.wav") + +for turn, _, speaker in diarization.itertracks(yield_label=True): + print(f"{turn.start:.1f}s - {turn.end:.1f}s {speaker}") +``` + +## Parity + +| Evidence | MLX | Reference | Result | +| --- | --- | --- | --- | +| Cosine distance (200 cross-window pairs) | mean=0.763718 | pyannote-PyTorch mean=0.763718 | identical at 6 decimals | +| 5h10 bench | 173s / 44 speakers / 1.27 GB | pyannote-MPS 431s / 43 speakers / 1.72 GB | Cross-DER 0.076 | + +## Architecture + +SincNet → BiLSTM → Powerset(3,2) head + WeSpeaker ResNet34 speaker embedding + AgglomerativeClustering wrapper. + +## Module Naming + +The repository name is `pyannote-speaker-diarization-3.1-mlx`; the Python import is `pyannote_diarization_3_1_mlx`. The import name follows PEP 8 and embeds the pyannote model version so future 4.0 ports can co-install. + +## Citation + +This project ports the pyannote speaker diarization 3.1 pipeline architecture to MLX. Please cite the original pyannote.audio work when using this package: + +```bibtex +@inproceedings{Plaquet23, + author = {Alexis Plaquet and Hervé Bredin}, + title = {{Powerset multi-class cross entropy loss for neural speaker diarization}}, + booktitle = {Proc. 
INTERSPEECH 2023}, + year = {2023}, +} +``` + +## Provenance + +Extracted from MLX_CONVERTOR/src/mlxconv/diar at commit 5f9eafa. Maintained at https://gitea.tavportal.com/olivier/pyannote-speaker-diarization-3.1-mlx. + +## License + +MIT diff --git a/docs/parity-evidence.md b/docs/parity-evidence.md new file mode 100644 index 0000000..1c77faa --- /dev/null +++ b/docs/parity-evidence.md @@ -0,0 +1,7 @@ +# Parity Evidence + +| Evidence | MLX | Reference | Result | +| --- | --- | --- | --- | +| Cosine distance parity | 200 cross-window pairs, mean 0.763718 | pyannote-PyTorch mean 0.763718 | identical at 6 decimals | +| 5h10 bench results | 173s wall / 44 speakers / 1.27 GB peak RSS | pyannote-MPS 431s / 43 speakers / 1.72 GB | Cross-DER 0.076 | +| Source commits | 8aa6c6d + 5f9eafa | feat/platform-abc in MLX_CONVERTOR | extraction source | diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..851e0c1 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,36 @@ +[project] +name = "pyannote-speaker-diarization-3.1-mlx" +version = "0.1.0" +description = "MLX port of pyannote/speaker-diarization-3.1 with byte-parity to PyTorch reference" +readme = "README.md" +requires-python = ">=3.12,<3.14" +authors = [{ name = "Olivier Dupont", email = "olivier.dupont@taviramonaco.com" }] +license = { text = "MIT" } +keywords = ["mlx", "pyannote", "speaker-diarization", "apple-silicon"] +classifiers = [ + "Programming Language :: Python :: 3.12", + "License :: OSI Approved :: MIT License", + "Operating System :: MacOS", +] +dependencies = [ + "mlx>=0.21.0", + "torch>=2.5.0", + "torchaudio>=2.5.0", + "huggingface_hub>=0.26.0", + "safetensors>=0.4.5", + "librosa>=0.10.2", + "scipy>=1.14", + "numpy>=2.0", + "pyannote.audio>=4.0.4", +] + +[project.optional-dependencies] +bench = ["psutil>=7.0"] +dev = ["pytest>=8.3", "pytest-mock>=3.14", "ruff>=0.7"] + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.hatch.build.targets.wheel] 
+packages = ["src/pyannote_diarization_3_1_mlx"] diff --git a/scripts/bench.py b/scripts/bench.py new file mode 100644 index 0000000..c949976 --- /dev/null +++ b/scripts/bench.py @@ -0,0 +1,161 @@ +"""Benchmark MLX vs pyannote-MPS diarization on the same audio. + +Usage: + uv run python scripts/bench.py