48 lines
1.6 KiB
Python
Executable File
48 lines
1.6 KiB
Python
Executable File
#!/usr/bin/env python
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
from granite_speech_plus_mlx import GraniteSpeechPlusPipeline
|
|
from granite_speech_plus_mlx.pipeline import DEFAULT_MODEL
|
|
from granite_speech_plus_mlx.prompts import GRANITE_SYSTEM_PROMPT, PROMPT_MODES
|
|
|
|
|
|
def main(argv: list[str] | None = None) -> int:
    """Transcribe one audio input from the command line.

    Parses the CLI options, builds a ``GraniteSpeechPlusPipeline`` from them,
    runs a single transcription, and then either writes the text to the
    ``--output`` path (UTF-8, with a trailing newline) or prints it to stdout.

    Args:
        argv: Argument strings to parse instead of the process command line.
            ``None`` (the default) makes argparse read ``sys.argv[1:]``, so
            existing ``main()`` callers behave exactly as before.

    Returns:
        ``0`` after the transcript has been written or printed.
    """
    parser = argparse.ArgumentParser(description="Transcribe audio with Granite Speech Plus MLX.")
    parser.add_argument("audio")
    parser.add_argument("--model", default=DEFAULT_MODEL)
    parser.add_argument("--output", default=None)
    parser.add_argument("--chunk-seconds", type=float, default=300.0)
    parser.add_argument("--overlap-seconds", type=float, default=2.0)
    parser.add_argument("--prompt-mode", choices=sorted(PROMPT_MODES), default="asr")
    parser.add_argument("--repetition-penalty", type=float, default=1.2)
    parser.add_argument("--max-tokens", type=int, default=4096)
    parser.add_argument("--system-prompt", default=GRANITE_SYSTEM_PROMPT)
    parser.add_argument("--verbose", action="store_true")
    # Forwarding argv lets callers (and tests) supply their own argument list.
    args = parser.parse_args(argv)

    pipe = GraniteSpeechPlusPipeline.from_pretrained(
        args.model,
        chunk_seconds=args.chunk_seconds,
        overlap_seconds=args.overlap_seconds,
        repetition_penalty=args.repetition_penalty,
        max_tokens=args.max_tokens,
        # An empty --system-prompt string is passed on as None.
        system_prompt=args.system_prompt or None,
        verbose=args.verbose,
    )
    text = pipe.transcribe(args.audio, prompt_mode=args.prompt_mode)

    if args.output:
        # Append a newline so the file ends the same way print() would.
        Path(args.output).write_text(text + "\n", encoding="utf-8")
    else:
        print(text)
    return 0
|
|
|
|
|
|
if __name__ == "__main__":
    # Run the CLI and hand its return value to the interpreter as the exit status.
    exit_code = main()
    sys.exit(exit_code)
|
|
|