feat: streaming generate_stream() with sub-100ms TTFB
Splits the input text at sentence-ending punctuation (with secondary
split on , ; : for sentences over 220 chars), yields one wav chunk
per clause. Callers can start playback as soon as chunk 0 arrives —
TTFB ~ 50 ms on M4 — while the rest synthesise in the background.
API:
    for idx, wav in pipe.generate_stream('Phrase 1. Phrase 2.', voice='F1', lang='fr'):
        play_audio(wav)
For non-streaming consumers:
    chunks = [w for _, w in pipe.generate_stream(text, ...)]
    full = pipe.concat_chunks(chunks, gap_ms=80)
Bench on a 23 s French paragraph (M3 Ultra):
chunks: 6
TTFB: 54 ms (first 2.44 s audio chunk ready)
total: 410 ms (RTF x56)
Whisper: 98 % word overlap on concat
The 80 ms inter-chunk silence in concat_chunks roughly matches the
natural breathing pause between sentences and masks the prosody
discontinuity from independent chunk generation. Each chunk uses
seed + idx so chunks don't sound identical even on repeated nouns.
Example script in examples/streaming_demo.py.
This commit is contained in:
@@ -594,5 +594,88 @@ class SupertonicMLXPipeline:
|
||||
wav = wav.astype(mx.float32)
|
||||
return np.array(wav)[0] # (T_lat × 6 × 512,)
|
||||
|
||||
# ── Streaming ────────────────────────────────────────────────────
|
||||
@staticmethod
def _split_for_streaming(text: str, max_chars: int = 220) -> list[str]:
    """Split ``text`` into speakable chunks at sentence boundaries.

    A sentence boundary is a terminator (``.``, ``!``, ``?`` or ``…``) that is
    followed by whitespace or by the end of the text. Requiring the trailing
    whitespace keeps decimals (``3.14``), dotted tokens (``example.com``) and
    runs of terminators (``...``, ``?!``) intact instead of cutting them
    mid-token or dropping the extra punctuation. As a consequence, two
    sentences written without a space between them (``"One.Two"``) stay in
    one chunk.

    Each chunk keeps its terminator. A sentence longer than ``max_chars`` is
    further split after ``,``, ``;`` or ``:`` (again only when followed by
    whitespace, so ``10:30`` and ``1,000`` survive) and the resulting clauses
    are greedily re-packed into chunks of at most ``max_chars`` characters.
    A single clause that is itself longer than ``max_chars`` is emitted
    unchanged; it is never cut in the middle of a clause.

    Args:
        text: The input text to split.
        max_chars: Soft upper bound on chunk length, in characters.

    Returns:
        The chunks in reading order, stripped of surrounding whitespace.
        Pieces that contain no letter or digit (blank text, stray
        punctuation) are skipped, so the list may be empty.
    """
    import re

    chunks: list[str] = []
    # The lookbehind keeps the terminator attached to its sentence; the
    # whitespace that follows it is consumed by the split.
    for sentence in re.split(r"(?<=[.!?…])\s+", text.strip()):
        sentence = sentence.strip()
        # Nothing speakable (empty, or punctuation only): skip it.
        if not any(ch.isalnum() for ch in sentence):
            continue
        if len(sentence) <= max_chars:
            chunks.append(sentence)
            continue

        # Long sentence: break after secondary punctuation, then greedily
        # pack consecutive clauses back together up to ``max_chars``.
        buf = ""
        for clause in re.split(r"(?<=[,;:])\s+", sentence):
            candidate = f"{buf} {clause}" if buf else clause
            if len(candidate) <= max_chars:
                buf = candidate
                continue
            if buf:
                chunks.append(buf)
            # Start a new chunk; an oversized clause is kept whole.
            buf = clause
        if buf:
            chunks.append(buf)
    return chunks
|
||||
|
||||
def generate_stream(
    self,
    text: str,
    voice: str = "F1",
    lang: str = "en",
    seed: int = 99,
    n_steps: Optional[int] = None,
    max_chunk_chars: int = 220,
):
    """Yield ``(chunk_idx, wav_chunk)`` pairs, one per text chunk, in order.

    ``text`` is first cut into chunks by ``_split_for_streaming`` using
    ``max_chunk_chars`` as the length limit. Each chunk is then passed to
    ``generate`` on its own, with ``voice``, ``lang`` and ``n_steps``
    forwarded unchanged and a per-chunk seed of ``seed + chunk_idx``.

    Synthesis is lazy: a chunk is only generated when the consumer asks for
    the next item, so the first waveform is available after a single
    ``generate`` call and playback can begin before the remaining text has
    been synthesised. If the text yields no chunks, nothing is yielded.

    Usage:

        for i, wav in pipe.generate_stream("Phrase 1. Phrase 2.", voice="F1", lang="fr"):
            play_audio(wav)  # start playback as soon as chunk 0 arrives

    Non-streaming consumers can collect the waveforms and join them with
    :meth:`SupertonicMLXPipeline.concat_chunks`.
    """
    segments = self._split_for_streaming(text, max_chars=max_chunk_chars)
    # An empty segment list simply makes the loop body never run.
    for position, segment in enumerate(segments):
        audio = self.generate(
            segment,
            voice=voice,
            lang=lang,
            seed=seed + position,
            n_steps=n_steps,
        )
        yield position, audio
|
||||
|
||||
@staticmethod
def concat_chunks(chunks: list[np.ndarray], gap_ms: int = 80,
                  sample_rate: int = SAMPLE_RATE) -> np.ndarray:
    """Join waveform chunks into one array, separated by short silences.

    A block of ``int(sample_rate * gap_ms / 1000)`` zero samples (float32)
    is inserted between every pair of consecutive chunks; nothing is added
    before the first chunk or after the last one. The silence is meant to
    mask the prosody discontinuity between independently generated chunks.

    Args:
        chunks: Waveform arrays in playback order.
        gap_ms: Length of the inserted silence in milliseconds.
        sample_rate: Samples per second used to size the silence.

    Returns:
        The concatenation along axis 0, or an empty float32 array when
        ``chunks`` is empty.
    """
    if not chunks:
        return np.zeros(0, dtype=np.float32)

    gap_samples = int(sample_rate * gap_ms / 1000)
    silence = np.zeros(gap_samples, dtype=np.float32)

    pieces = []
    for position, chunk in enumerate(chunks):
        # Silence goes between chunks only, never in front of the first.
        if position:
            pieces.append(silence)
        pieces.append(chunk)
    return np.concatenate(pieces, axis=0)
|
||||
|
||||
|
||||
# Public API: the pipeline class is the only name exported by a star-import.
__all__ = ["SupertonicMLXPipeline"]
|
||||
|
||||
Reference in New Issue
Block a user