Download notebook

In [ ]:

# Install required packages
!pip install --quiet gradio ipywidgets kokoro-onnx onnxruntime requests numpy soundfile torch transformers sentencepiece

print('✓ Packages installed!')

Gradio demo: Text-to-speech (fancy)

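Now let's combine every feature into one app: two local text-to-speech engines (Kokoro ONNX and MMS/VITS), each in its own tab with its own language or voice picker and tuning controls.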

In [1]:

from functools import lru_cache
from pathlib import Path

import gradio as gr
import numpy as np
import requests
import torch


# Text pre-filled into the demo's input box.
DEFAULT_TEXT = (
    "This is a local text to speech demo. Kokoro has named voices; "
    "MMS/VITS has one checkpoint per language."
)

# Local directory holding the two Kokoro ONNX files.
MODEL_DIR = Path("data") / "models" / "kokoro-onnx"
KOKORO_MODEL_FILE = MODEL_DIR.joinpath("kokoro-v1.0.int8.onnx")
KOKORO_VOICES_FILE = MODEL_DIR.joinpath("voices-v1.0.bin")

# Both files live under the same release URL, named exactly like the local copies.
_KOKORO_RELEASE_URL = "https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0"
KOKORO_MODEL_URL = f"{_KOKORO_RELEASE_URL}/{KOKORO_MODEL_FILE.name}"
KOKORO_VOICES_URL = f"{_KOKORO_RELEASE_URL}/{KOKORO_VOICES_FILE.name}"

# Compact voice catalogue: UI language label -> (language code, voice ids by gender).
# Insertion order matters: it is the order the dropdowns list their choices in.
_KOKORO_VOICE_SPECS = {
    "English (US)": (
        "en-us",
        {
            "female": ("af_heart", "af_bella", "af_nicole", "af_sarah"),
            "male": ("am_adam", "am_fenrir", "am_michael"),
        },
    ),
    "English (UK)": (
        "en-gb",
        {
            "female": ("bf_emma", "bf_isabella"),
            "male": ("bm_daniel", "bm_george"),
        },
    ),
}

# UI language label -> {"<voice id> - <gender>": (voice id, language code)}.
KOKORO_VOICES = {
    language_label: {
        f"{voice_id} - {gender}": (voice_id, lang_code)
        for gender, voice_ids in voices_by_gender.items()
        for voice_id in voice_ids
    }
    for language_label, (lang_code, voices_by_gender) in _KOKORO_VOICE_SPECS.items()
}

# UI language label -> three-letter suffix of the MMS checkpoint id.
_MMS_LANGUAGE_CODES = {
    "English": "eng",
    "Spanish": "spa",
    "French": "fra",
    "German": "deu",
    "Hindi": "hin",
    "Japanese": "jpn",
}

# UI language label -> model id of the per-language MMS TTS checkpoint.
MMS_MODELS = {
    language_label: f"facebook/mms-tts-{code}"
    for language_label, code in _MMS_LANGUAGE_CODES.items()
}


def download_file(url, path, timeout=60, chunk_size=1 << 20):
    """Download `url` to `path` unless `path` already exists.

    The body is streamed to a sibling ``<name>.part`` file and only renamed to
    `path` once the transfer has finished, so an interrupted download never
    leaves a truncated file that a later call would mistake for a complete one.

    Args:
        url: HTTP(S) address of the file to fetch.
        path: `pathlib.Path` destination; its parent directory is created if needed.
        timeout: Timeout in seconds handed to `requests.get`, so a stalled
            connection fails instead of hanging the notebook.
        chunk_size: Number of bytes requested per streamed chunk.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeouts.
        OSError: If the file cannot be written.
    """
    if path.exists():
        return

    # Create the directory the file actually goes into, not a fixed one.
    path.parent.mkdir(parents=True, exist_ok=True)
    print(f"Downloading {path.name}...")

    partial = path.with_name(path.name + ".part")
    try:
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()
            with partial.open("wb") as handle:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    handle.write(chunk)
        # Publish the file under its final name only after a complete transfer.
        partial.replace(path)
    except BaseException:
        # Drop the incomplete temp file so the next call starts from scratch.
        partial.unlink(missing_ok=True)
        raise


def download_kokoro_files():
    """Fetch the Kokoro model file and then the voices file via `download_file`."""
    artifacts = (
        (KOKORO_MODEL_URL, KOKORO_MODEL_FILE),
        (KOKORO_VOICES_URL, KOKORO_VOICES_FILE),
    )
    for source_url, destination in artifacts:
        download_file(source_url, destination)


def device():
    """Return "cuda" when torch reports CUDA as available, otherwise "cpu"."""
    if torch.cuda.is_available():
        return "cuda"
    return "cpu"


def audio_tuple(rate, audio):
    """Build the `(rate, samples)` pair returned to the audio output.

    `audio` is converted to a float32 NumPy array and every length-1 axis is
    dropped; `rate` is passed through untouched.
    """
    samples = np.asarray(audio, dtype=np.float32)
    flattened = np.squeeze(samples)
    return (rate, flattened)


def require_text(text):
    """Return `text` with surrounding whitespace removed.

    Raises:
        gr.Error: If nothing is left after stripping.
    """
    stripped = text.strip()
    if stripped:
        return stripped
    raise gr.Error("Enter some text first.")


@lru_cache
def kokoro_model():
    """Return the Kokoro model, building it on first use.

    The model files are downloaded first if they are missing. `lru_cache`
    keeps the constructed object, so later calls reuse it.
    """
    # Imported here so kokoro_onnx is only loaded once a Kokoro model is requested.
    from kokoro_onnx import Kokoro

    download_kokoro_files()
    model_path = str(KOKORO_MODEL_FILE)
    voices_path = str(KOKORO_VOICES_FILE)
    return Kokoro(model_path, voices_path)


def kokoro_voices_for(language_label):
    """List the voice labels registered under `language_label` in `KOKORO_VOICES`."""
    voices = KOKORO_VOICES[language_label]
    return [voice_label for voice_label in voices]


def update_kokoro_language(language_label):
    """Build the update for the voice dropdown after a language switch.

    The choices become the voices of `language_label`, and the first of them
    is selected.
    """
    choices = kokoro_voices_for(language_label)
    first_voice = choices[0]
    return gr.update(choices=choices, value=first_voice)


def run_kokoro(text, language_label, voice_label, speed):
    """Synthesize `text` with the selected Kokoro voice.

    Args:
        text: Text to speak; surrounding whitespace is stripped.
        language_label: Key into `KOKORO_VOICES` (e.g. "English (US)").
        voice_label: Key into `KOKORO_VOICES[language_label]`.
        speed: Speed value forwarded to the Kokoro model.

    Returns:
        A pair `((sample_rate, samples), note)`, where the first item is built
        by `audio_tuple` and `note` names the voice that was used.

    Raises:
        gr.Error: If `text` is blank, or if `voice_label` does not belong to
            `language_label`.
    """
    text = require_text(text)

    # The voice dropdown is refreshed by a separate change event, so the two
    # labels may disagree; report that to the user instead of a raw KeyError.
    voices = KOKORO_VOICES.get(language_label, {})
    if voice_label not in voices:
        raise gr.Error(f"Voice {voice_label!r} is not available for {language_label!r}.")
    voice, language = voices[voice_label]

    audio, sample_rate = kokoro_model().create(text, voice=voice, speed=speed, lang=language)
    # Use the same helper as run_mms so both engines return identically shaped audio.
    return audio_tuple(sample_rate, audio), f"Kokoro ONNX voice: {voice}"


@lru_cache
def mms_model(model_id):
    """Load the tokenizer and VITS model for `model_id`.

    The model is moved to the device returned by `device()`. `lru_cache`
    keeps one `(tokenizer, model)` pair per `model_id`.
    """
    # Imported here so transformers is only loaded once an MMS model is requested.
    from transformers import AutoTokenizer, VitsModel

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    vits = VitsModel.from_pretrained(model_id)
    vits = vits.to(device())
    return (tokenizer, vits)


def run_mms(text, language_label, speaking_rate, noise_scale, seed):
    """Synthesize `text` with the MMS/VITS checkpoint of the selected language.

    Args:
        text: Text to speak; surrounding whitespace is stripped.
        language_label: Key into `MMS_MODELS`.
        speaking_rate: Value assigned to the model's `speaking_rate` attribute.
        noise_scale: Value assigned to the model's `noise_scale` attribute.
        seed: Seed for torch's global random generator. `None` (what an
            emptied number field passes) is treated as 0.

    Returns:
        A pair `((sampling_rate, samples), model_id)`.

    Raises:
        gr.Error: If `text` is blank.
    """
    text = require_text(text)

    # An emptied Seed field arrives as None; fall back to 0 instead of
    # failing in int(None).
    seed_value = 0 if seed is None else int(seed)
    torch.manual_seed(seed_value)

    model_id = MMS_MODELS[language_label]
    tokenizer, model = mms_model(model_id)
    # The model object is cached and shared, so both knobs are set on every
    # call rather than relying on whatever the previous request left behind.
    model.speaking_rate = speaking_rate
    model.noise_scale = noise_scale
    inputs = tokenizer(text, return_tensors="pt").to(device())

    with torch.inference_mode():
        audio = model(**inputs).waveform

    return audio_tuple(model.config.sampling_rate, audio.cpu().numpy()), model_id


# Build the UI: a shared text box, audio player and notes area, followed by one
# tab per engine. Components are declared in the order they should appear.
with gr.Blocks(title="Modern local TTS demo") as demo:
    gr.Markdown("# Modern local TTS demo")

    # Shared by both engines: the input text and the two output components.
    text = gr.Textbox(label="Text", value=DEFAULT_TEXT, lines=5)
    output = gr.Audio(label="Output")
    notes = gr.Markdown()

    with gr.Tabs():
        with gr.Tab("Kokoro ONNX"):
            kokoro_language = gr.Dropdown(
                label="Language",
                choices=list(KOKORO_VOICES),
                value="English (US)",
            )
            # Starts with the voices of the default language; refreshed by the
            # change handler wired up below.
            kokoro_voice = gr.Dropdown(
                label="Voice",
                choices=kokoro_voices_for("English (US)"),
                value="af_heart - female",
            )
            kokoro_speed = gr.Slider(0.7, 1.3, value=1.0, step=0.05, label="Speed")
            kokoro_button = gr.Button("Generate with Kokoro", variant="primary")

        with gr.Tab("MMS / VITS"):
            mms_language = gr.Dropdown(
                label="Language checkpoint",
                choices=list(MMS_MODELS),
                value="English",
            )
            mms_rate = gr.Slider(0.6, 1.6, value=1.0, step=0.05, label="Speaking rate")
            mms_noise = gr.Slider(0.0, 1.2, value=0.667, step=0.05, label="Noise")
            mms_seed = gr.Number(label="Seed", value=0, precision=0)
            mms_button = gr.Button("Generate with MMS", variant="primary")

    # Event wiring: switching the Kokoro language repopulates the voice list;
    # each button runs its engine and writes to the shared audio/notes outputs.
    kokoro_language.change(update_kokoro_language, kokoro_language, kokoro_voice)
    kokoro_button.click(run_kokoro, [text, kokoro_language, kokoro_voice, kokoro_speed], [output, notes])
    mms_button.click(run_mms, [text, mms_language, mms_rate, mms_noise, mms_seed], [output, notes])

In [ ]:

import os

# Set before launching.
# NOTE(review): presumably switches on Gradio's debug behaviour so errors from
# the handlers show up in this cell's output — confirm against the Gradio docs.
os.environ['GRADIO_DEBUG'] = '1'

# Start the app built in the `demo` Blocks context.
demo.launch()

* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.

/Users/soma/Library/CloudStorage/Dropbox/Soma/Curriculum/2026-dataharvest/ai-experimentation/.venv/lib/python3.11/site-packages/gradio/processing_utils.py:698: UserWarning: Trying to convert audio automatically from float32 to 16-bit int format.
  warnings.warn(warning.format(data.dtype))

config.json: 0.00B [00:00, ?B/s]

tokenizer_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/413 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/47.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/145M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/762 [00:00<?, ?it/s]

/Users/soma/Library/CloudStorage/Dropbox/Soma/Curriculum/2026-dataharvest/ai-experimentation/.venv/lib/python3.11/site-packages/gradio/processing_utils.py:698: UserWarning: Trying to convert audio automatically from float32 to 16-bit int format.
  warnings.warn(warning.format(data.dtype))