# Install required packages
!pip install --quiet gradio ipywidgets kokoro-onnx onnxruntime requests numpy soundfile torch transformers sentencepiece
print('✓ Packages installed!')
Now let's put all the features together in a single Gradio app.
from functools import lru_cache
from pathlib import Path
import gradio as gr
import numpy as np
import requests
import torch
# Sample sentence used as the initial value of the text box.
DEFAULT_TEXT = " ".join(
    (
        "This is a local text to speech demo.",
        "Kokoro has named voices;",
        "MMS/VITS has one checkpoint per language.",
    )
)
# Local directory and file names for the Kokoro ONNX model and voices data.
MODEL_DIR = Path("data") / "models" / "kokoro-onnx"
KOKORO_MODEL_FILE = MODEL_DIR.joinpath("kokoro-v1.0.int8.onnx")
KOKORO_VOICES_FILE = MODEL_DIR.joinpath("voices-v1.0.bin")

# Both downloads live under the same release path, so the prefix is shared.
_KOKORO_RELEASE_URL = (
    "https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0"
)
KOKORO_MODEL_URL = f"{_KOKORO_RELEASE_URL}/kokoro-v1.0.int8.onnx"
KOKORO_VOICES_URL = f"{_KOKORO_RELEASE_URL}/voices-v1.0.bin"
# Source table for the Kokoro voices: (UI language label, language code,
# ((voice id, gender), ...)). Order here is the order shown in the dropdowns.
_KOKORO_VOICE_GROUPS = (
    (
        "English (US)",
        "en-us",
        (
            ("af_heart", "female"),
            ("af_bella", "female"),
            ("af_nicole", "female"),
            ("af_sarah", "female"),
            ("am_adam", "male"),
            ("am_fenrir", "male"),
            ("am_michael", "male"),
        ),
    ),
    (
        "English (UK)",
        "en-gb",
        (
            ("bf_emma", "female"),
            ("bf_isabella", "female"),
            ("bm_daniel", "male"),
            ("bm_george", "male"),
        ),
    ),
)

# Maps language label -> {voice label: (voice id, language code)}; the tuple
# is what gets unpacked and handed to the Kokoro model when synthesizing.
KOKORO_VOICES = {
    group_label: {
        f"{voice_id} - {gender}": (voice_id, lang_code)
        for voice_id, gender in group_voices
    }
    for group_label, lang_code, group_voices in _KOKORO_VOICE_GROUPS
}
# MMS/VITS checkpoint ids keyed by the language label shown in the UI; the id
# is what gets passed to ``from_pretrained`` when a model is loaded.
MMS_MODELS = {
    label: f"facebook/mms-tts-{code}"
    for label, code in (
        ("English", "eng"),
        ("Spanish", "spa"),
        ("French", "fra"),
        ("German", "deu"),
        ("Hindi", "hin"),
        ("Japanese", "jpn"),
    )
}
def download_file(url, path):
    """Download ``url`` to ``path`` unless ``path`` already exists.

    The body is streamed to a ``.part`` file next to ``path`` and renamed into
    place only after the transfer finished, so an interrupted download never
    leaves a truncated file that a later call would mistake for a complete one.

    Args:
        url: Address of the file to fetch.
        path: ``pathlib.Path`` destination; its parent directory is created
            when missing.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection failures or timeouts.
    """
    if path.exists():
        return
    # Create the directory the file actually goes into, not a fixed one.
    path.parent.mkdir(parents=True, exist_ok=True)
    print(f"Downloading {path.name}...")
    partial = path.with_name(path.name + ".part")
    try:
        # (connect, read) timeouts keep a stalled server from hanging forever.
        with requests.get(url, stream=True, timeout=(10, 60)) as response:
            response.raise_for_status()
            with partial.open("wb") as handle:
                # Stream in 1 MiB chunks instead of holding the file in memory.
                for chunk in response.iter_content(chunk_size=1 << 20):
                    handle.write(chunk)
        partial.replace(path)
    finally:
        # After a successful rename this is a no-op; on failure it removes
        # the incomplete download.
        partial.unlink(missing_ok=True)
def download_kokoro_files():
    """Fetch the Kokoro model file and then the voices file via download_file."""
    downloads = (
        (KOKORO_MODEL_URL, KOKORO_MODEL_FILE),
        (KOKORO_VOICES_URL, KOKORO_VOICES_FILE),
    )
    for source_url, target_path in downloads:
        download_file(source_url, target_path)
def device():
    """Return ``"cuda"`` when torch reports CUDA as available, else ``"cpu"``."""
    if torch.cuda.is_available():
        return "cuda"
    return "cpu"
def audio_tuple(rate, audio):
    """Return ``(rate, samples)`` with the audio as a squeezed float32 array.

    Args:
        rate: Sample rate, passed through unchanged.
        audio: Array-like audio data; converted to float32 and stripped of
            length-1 axes.
    """
    samples = np.asarray(audio, dtype=np.float32)
    return rate, samples.squeeze()
def require_text(text):
    """Return ``text`` stripped of surrounding whitespace, rejecting blanks.

    Args:
        text: Raw text box contents; ``None`` is treated like an empty string
            so a missing value gets the same friendly error as a blank one.

    Returns:
        The stripped, non-empty text.

    Raises:
        gr.Error: If nothing but whitespace (or nothing at all) was entered.
    """
    cleaned = (text or "").strip()
    if not cleaned:
        raise gr.Error("Enter some text first.")
    return cleaned
@lru_cache
def kokoro_model():
    """Build the Kokoro ONNX model; ``lru_cache`` reuses it on later calls.

    The ``kokoro_onnx`` import is deferred to the first call, and the model
    and voices files are downloaded before the model is constructed.
    """
    from kokoro_onnx import Kokoro

    download_kokoro_files()
    model_path = str(KOKORO_MODEL_FILE)
    voices_path = str(KOKORO_VOICES_FILE)
    return Kokoro(model_path, voices_path)
def kokoro_voices_for(language_label):
    """Return the voice labels registered for ``language_label`` as a list."""
    voices_by_label = KOKORO_VOICES[language_label]
    return [*voices_by_label]
def update_kokoro_language(language_label):
    """Return a Gradio update listing the voices of ``language_label``.

    The first voice of that language becomes the selected value.
    """
    labels = kokoro_voices_for(language_label)
    first_label = labels[0]
    return gr.update(choices=labels, value=first_label)
def run_kokoro(text, language_label, voice_label, speed):
    """Synthesize ``text`` with the selected Kokoro voice.

    Args:
        text: Raw text box contents.
        language_label: Key of ``KOKORO_VOICES``.
        voice_label: Key of the voice table of that language.
        speed: Speed value forwarded to the model's ``create`` call.

    Returns:
        ``((sample_rate, audio), note)`` where ``note`` names the voice used.

    Raises:
        gr.Error: If the text is blank, or the language/voice pair is not in
            ``KOKORO_VOICES``.
    """
    text = require_text(text)
    # The voice list is refreshed by a separate change event, so a voice that
    # does not belong to the chosen language can reach this point; show a
    # readable message instead of letting a KeyError escape.
    try:
        voice, language = KOKORO_VOICES[language_label][voice_label]
    except KeyError:
        raise gr.Error("Pick a voice that matches the selected language.") from None
    audio, sample_rate = kokoro_model().create(text, voice=voice, speed=speed, lang=language)
    return (sample_rate, audio), f"Kokoro ONNX voice: {voice}"
@lru_cache
def mms_model(model_id):
    """Load the tokenizer and VITS model for ``model_id``, cached per id.

    Returns:
        ``(tokenizer, model)`` with the model moved to the device chosen by
        ``device()``.
    """
    from transformers import AutoTokenizer, VitsModel

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    vits = VitsModel.from_pretrained(model_id)
    return tokenizer, vits.to(device())
def run_mms(text, language_label, speaking_rate, noise_scale, seed):
    """Synthesize ``text`` with the MMS/VITS checkpoint of a language.

    Args:
        text: Raw text box contents.
        language_label: Key of ``MMS_MODELS``.
        speaking_rate: Value assigned to the model's ``speaking_rate``.
        noise_scale: Value assigned to the model's ``noise_scale``.
        seed: Seed for torch's random generator; ``None`` falls back to 0.

    Returns:
        ``((sampling_rate, audio), model_id)``.

    Raises:
        gr.Error: If the text is blank.
    """
    text = require_text(text)
    # An emptied Seed field can arrive as None, which int() rejects; use the
    # same value the field starts with instead of failing.
    torch.manual_seed(0 if seed is None else int(seed))
    model_id = MMS_MODELS[language_label]
    tokenizer, model = mms_model(model_id)
    # The model object is cached and shared between calls, so both settings
    # are (re)applied on every request.
    model.speaking_rate = speaking_rate
    model.noise_scale = noise_scale
    inputs = tokenizer(text, return_tensors="pt").to(device())
    with torch.inference_mode():
        audio = model(**inputs).waveform
    return audio_tuple(model.config.sampling_rate, audio.cpu().numpy()), model_id
# Gradio UI: one shared text box, audio output and notes area, followed by a
# tab per engine. Components are created in the same order as before, since
# creation order determines the layout.
with gr.Blocks(title="Modern local TTS demo") as demo:
    gr.Markdown("# Modern local TTS demo")
    text = gr.Textbox(value=DEFAULT_TEXT, label="Text", lines=5)
    output = gr.Audio(label="Output")
    notes = gr.Markdown()

    with gr.Tabs():
        with gr.Tab("Kokoro ONNX"):
            kokoro_language = gr.Dropdown(
                choices=list(KOKORO_VOICES),
                value="English (US)",
                label="Language",
            )
            kokoro_voice = gr.Dropdown(
                choices=kokoro_voices_for("English (US)"),
                value="af_heart - female",
                label="Voice",
            )
            kokoro_speed = gr.Slider(
                minimum=0.7,
                maximum=1.3,
                value=1.0,
                step=0.05,
                label="Speed",
            )
            kokoro_button = gr.Button("Generate with Kokoro", variant="primary")

        with gr.Tab("MMS / VITS"):
            mms_language = gr.Dropdown(
                choices=list(MMS_MODELS),
                value="English",
                label="Language checkpoint",
            )
            mms_rate = gr.Slider(
                minimum=0.6,
                maximum=1.6,
                value=1.0,
                step=0.05,
                label="Speaking rate",
            )
            mms_noise = gr.Slider(
                minimum=0.0,
                maximum=1.2,
                value=0.667,
                step=0.05,
                label="Noise",
            )
            mms_seed = gr.Number(value=0, label="Seed", precision=0)
            mms_button = gr.Button("Generate with MMS", variant="primary")

    # Changing the language refreshes the voice list; each Generate button
    # writes to the shared audio output and notes area.
    kokoro_language.change(
        fn=update_kokoro_language,
        inputs=kokoro_language,
        outputs=kokoro_voice,
    )
    kokoro_button.click(
        fn=run_kokoro,
        inputs=[text, kokoro_language, kokoro_voice, kokoro_speed],
        outputs=[output, notes],
    )
    mms_button.click(
        fn=run_mms,
        inputs=[text, mms_language, mms_rate, mms_noise, mms_seed],
        outputs=[output, notes],
    )
import os

# Set GRADIO_DEBUG before launching the app.
# NOTE(review): presumably this makes launch() behave like launch(debug=True)
# (blocking, with errors shown in the cell output) — confirm against Gradio.
os.environ.update(GRADIO_DEBUG="1")
demo.launch()