From d32aaae32def2b5745c9e7dae0804882bf5c384e Mon Sep 17 00:00:00 2001 From: ambassadia Date: Wed, 20 May 2026 12:25:15 +0200 Subject: [PATCH] =?UTF-8?q?feat:=20create=5Fvoice()=20=E2=80=94=20mix=20pr?= =?UTF-8?q?esets=20to=20synthesise=20custom=20voices?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 10 preset voices live on a hypersphere of radius ≈ 7.1 in the 12 800-D style-token space (verified empirically: pairwise cosines 0.86-0.97, SVD shows 7 axes cover 99 % of variance). Linear or spherical interpolation between presets stays in the trained distribution and produces new intelligible voices. API: voice = pipe.create_voice({'F2': 0.7, 'M1': 0.3}) # slerp by default voice = pipe.create_voice({'F2': 0.5, 'M1': 0.5}, interp='lerp') wav = pipe.generate('Bonjour', voice=voice, lang='fr') The voice argument of pipe.generate() now accepts either a preset name (str) or a custom voice descriptor (dict from create_voice). Whisper validation on 6 custom blends (FR test phrase): F2 70 / M1 30 → 100 % (lightly androgyne F voice) F2 50 / M1 50 → 91 % (true androgyne) avg of 5 F voices → 100 % (mean feminine timbre) avg of 5 M voices → 91 % (mean masculine timbre) warm fem (F4+F5) → 91 % bright masc (M1+M5) → 100 % All blends remain intelligible — the trained voice manifold is convex enough that interpolations don't fall out of the model's distribution. Example script in examples/custom_voice_demo.py. --- examples/custom_voice_demo.py | 44 ++++++++++++ src/supertonic_3_mlx/pipeline.py | 112 ++++++++++++++++++++++++++++++- 2 files changed, 153 insertions(+), 3 deletions(-) create mode 100644 examples/custom_voice_demo.py diff --git a/examples/custom_voice_demo.py b/examples/custom_voice_demo.py new file mode 100644 index 0000000..780cbe1 --- /dev/null +++ b/examples/custom_voice_demo.py @@ -0,0 +1,44 @@ +"""Create custom voices by mixing presets. + +The 10 preset voices (F1..F5, M1..M5) live on a hypersphere of radius ≈ 7.1 +in a 12 800-D style-token space. Spherical-linear interpolation (slerp) +between any two presets lands in the trained distribution and produces a +new, intelligible voice. + + pip install soundfile + python examples/custom_voice_demo.py +""" +from supertonic_3_mlx import Pipeline +import soundfile as sf + +pipe = Pipeline.from_pretrained("ambassadia/supertonic-3-mlx") + +TEXT = "Bonjour, je suis une voix personnalisée créée par interpolation des voix préréglées." + +# 1. A 70 / 30 mix of two presets — primary F2, slight masculine tint from M1. +voice = pipe.create_voice({"F2": 0.7, "M1": 0.3}) +wav = pipe.generate(TEXT, voice=voice, lang="fr") +sf.write("voice_F2_M1.wav", wav, pipe.sample_rate) +print("wrote voice_F2_M1.wav (70 % F2, 30 % M1, slerp)") + +# 2. Average of all five female voices — 'mean feminine' timbre. +voice = pipe.create_voice({f"F{i}": 0.2 for i in range(1, 6)}) +wav = pipe.generate(TEXT, voice=voice, lang="fr") +sf.write("voice_avg_female.wav", wav, pipe.sample_rate) +print("wrote voice_avg_female.wav") + +# 3. Linear interpolation (lerp) instead of slerp — gives a slightly +# different timbre because lerp doesn't preserve the hypersphere norm. +voice = pipe.create_voice({"F4": 0.6, "F5": 0.4}, interp="lerp") +wav = pipe.generate(TEXT, voice=voice, lang="fr") +sf.write("voice_warm_lerp.wav", wav, pipe.sample_rate) +print("wrote voice_warm_lerp.wav (lerp)") + +# 4. A custom voice descriptor is just a dict — you can hand-build it, +# save it to JSON, share it. The `style_ttl` shape is (1, 50, 256) and +# `style_dp` shape is (1, 8, 16); both float32. Norms ≈ 7.1 and ≈ 0.3 +# respectively across the 10 presets. +print(f"\nVoice descriptor keys: {sorted(voice.keys())}") +print(f" style_ttl shape: {voice['style_ttl'].shape}") +print(f" style_dp shape: {voice['style_dp'].shape}") +print(f" blend metadata: {voice['_meta']}") diff --git a/src/supertonic_3_mlx/pipeline.py b/src/supertonic_3_mlx/pipeline.py index 7ab6a92..058a6d3 100644 --- a/src/supertonic_3_mlx/pipeline.py +++ b/src/supertonic_3_mlx/pipeline.py @@ -482,13 +482,114 @@ class SupertonicMLXPipeline: m_.update(tree_map(_cast, m_.parameters())) def _load_voice(self, voice: str) -> tuple[mx.array, mx.array]: - """Load ``voice_styles/.json`` and return (style_ttl, style_dp).""" + """Load ``voice_styles/.json`` and return (style_ttl, style_dp). + + ``voice`` can be either a preset name (``"F1"``..``"F5"``, + ``"M1"``..``"M5"``) or a custom voice constructed via + :meth:`create_voice` (then ``voice`` is the dict directly — but + the helper inside :meth:`generate` handles that case). + """ path = self.voice_dir / f"{voice}.json" data = json.loads(path.read_text()) style_ttl = np.asarray(data["style_ttl"]["data"], dtype=np.float32) # (1, 50, 256) style_dp = np.asarray(data["style_dp"]["data"], dtype=np.float32) # (1, 8, 16) return mx.array(style_ttl), mx.array(style_dp) + # ── Voice mixing API ────────────────────────────────────────────── + def create_voice(self, blend: dict[str, float], + interp: str = "slerp") -> dict[str, mx.array]: + """Create a custom voice as a weighted mix of preset voices. + + The voice style is a 50×256 ``style_ttl`` tensor that lives on a + 12 800-D hypersphere of radius ≈ 7.1 (verified empirically across + the 10 presets). Linear or spherical interpolation between the + preset points stays in the trained distribution and produces + intelligible new voices. + + Args: + blend: mapping ``preset_name → weight``. Weights are + renormalised to sum to 1. Use 2-4 voices for best + results; mixing more than 4 tends toward the centroid. + interp: ``"slerp"`` (default, spherical interpolation, + preserves norm — recommended) or ``"lerp"`` (linear + weighted average, then renormalise). + + Returns: + A custom voice descriptor (a dict) that can be passed + anywhere the API takes a ``voice=...`` argument. + + Examples: + # 70 % F2 + 30 % M1 → semi-androgynous + voice = pipe.create_voice({"F2": 0.7, "M1": 0.3}) + wav = pipe.generate("Bonjour", voice=voice, lang="fr") + + # Equal mix of all 5 male voices → 'average male' timbre + avg_male = pipe.create_voice({f"M{i}": 0.2 for i in range(1, 6)}) + """ + if not blend: + raise ValueError("blend dict cannot be empty") + if interp not in ("slerp", "lerp"): + raise ValueError(f"interp must be 'slerp' or 'lerp', got {interp!r}") + + # Load each preset, normalise weights + total = sum(blend.values()) + if total <= 0: + raise ValueError(f"blend weights must sum to > 0, got {total}") + weights = {k: v / total for k, v in blend.items()} + + ttls: list[tuple[float, np.ndarray]] = [] + dps: list[tuple[float, np.ndarray]] = [] + norms: list[float] = [] + for preset, w in weights.items(): + stl, sdp = self._load_voice(preset) + stl_np = np.array(stl) + ttls.append((w, stl_np)) + dps.append((w, np.array(sdp))) + norms.append(float(np.linalg.norm(stl_np.flatten()))) + target_norm = float(np.mean(norms)) + + if interp == "lerp": + mixed_ttl = sum(w * x for w, x in ttls) + mixed_dp = sum(w * x for w, x in dps) + else: + # SLERP across multiple voices: chain pairwise — order matters. + # We use a stable iterative slerp from the highest-weighted voice + # outward (so the final point reflects the dominant voice). + ordered = sorted(zip(weights.values(), ttls, dps), + key=lambda t: -t[0]) + cum_w = ordered[0][0] + mixed_ttl = ordered[0][1][1].copy() + mixed_dp = ordered[0][2][1].copy() + for w, (w_, stl), (_, sdp) in ordered[1:]: + # The slerp t for this addition is w / (cum_w + w) + t = w / (cum_w + w) + a = mixed_ttl.flatten() + b = stl.flatten() + na, nb = np.linalg.norm(a), np.linalg.norm(b) + dot = (a @ b) / (na * nb + 1e-8) + theta = float(np.arccos(np.clip(dot, -1, 1))) + if theta < 1e-6: + mixed_ttl = (1 - t) * mixed_ttl + t * stl + else: + sin_t = np.sin(theta) + coef_a = np.sin((1 - t) * theta) / sin_t + coef_b = np.sin(t * theta) / sin_t + mixed_ttl = (coef_a * a + coef_b * b).reshape(mixed_ttl.shape) + # dp is small + low-norm, lerp is fine + mixed_dp = (1 - t) * mixed_dp + t * sdp + cum_w += w + + # Renormalise ttl to the average source norm + cur_norm = float(np.linalg.norm(mixed_ttl.flatten())) + if cur_norm > 1e-6: + mixed_ttl = mixed_ttl * (target_norm / cur_norm) + + return { + "style_ttl": mx.array(mixed_ttl.astype(np.float32)), + "style_dp": mx.array(mixed_dp.astype(np.float32)), + "_meta": {"blend": dict(weights), "interp": interp}, + } + def generate( self, text: str, @@ -519,8 +620,13 @@ class SupertonicMLXPipeline: T_text = text_ids.shape[1] text_mask = mx.ones((1, 1, T_text), dtype=self.dtype) - # Style - style_ttl, style_dp = self._load_voice(voice) + # Style — accept either a preset name (str) or a custom voice descriptor + # (dict returned by ``create_voice``). + if isinstance(voice, dict): + style_ttl = voice["style_ttl"] + style_dp = voice["style_dp"] + else: + style_ttl, style_dp = self._load_voice(voice) if self.dtype != mx.float32: style_ttl = style_ttl.astype(self.dtype) style_dp = style_dp.astype(self.dtype)