Download notebook

In [ ]:

# Install the packages this notebook uses. The leading `!` runs the line as a
# shell command from the notebook, and `--quiet` suppresses most of pip's output.
!pip install --quiet gradio ipywidgets kokoro-onnx onnxruntime requests numpy soundfile

print('✓ Packages installed!')

Gradio demo: Text-to-speech (simple)

We'll start by trying out the Kokoro text-to-speech engine. It comes with a lot of options! You can see all the voices here.

Why not just the default Space?

In [ ]:

from pathlib import Path

import requests
import soundfile as sf
from kokoro_onnx import Kokoro


print("Downloading Kokoro files...")

# Local directory where the model and voices files are kept between runs.
model_dir = Path("data/models/kokoro-onnx")
model_dir.mkdir(parents=True, exist_ok=True)

model_file = model_dir / "kokoro-v1.0.int8.onnx"
voices_file = model_dir / "voices-v1.0.bin"

model_url = "https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/kokoro-v1.0.int8.onnx"
voices_url = "https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/voices-v1.0.bin"


# Fetch each file only when it is not already on disk, so re-running the cell
# does not download anything twice.
for url, destination in ((model_url, model_file), (voices_url, voices_file)):
    if destination.exists():
        continue

    # The timeout bounds how long we wait for the server to connect/send data,
    # so a stalled connection raises an error instead of hanging the cell.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    destination.write_bytes(response.content)


print("Initializing text-to-speech...")

kokoro = Kokoro(str(model_file), str(voices_file))

# Inputs for a single synthesis run.
text = "This is a small local text to speech demo."
voice = "af_heart"
language = "en-us"
speed = 1.0

print("Generating audio...")

audio, sample_rate = kokoro.create(
    text,
    voice=voice,
    speed=speed,
    lang=language,
)

# Write the generated samples to a WAV file in the current working directory.
output_file = "output.wav"
sf.write(output_file, audio, sample_rate)

print(f"Saved audio to {output_file}")

Want to listen? I suppose you could open the file, but we can also play it in the notebook itself.

In [ ]:

from IPython.display import Audio

# "output.wav" is the file written by the previous cell. Leaving the Audio
# object as the cell's last expression makes the notebook display it as output.
Audio("output.wav")

Building the Gradio demo

In [1]:

from functools import lru_cache
from pathlib import Path

import gradio as gr
import requests


# Text pre-filled in the demo's input box.
DEFAULT_TEXT = "This is a small local text to speech demo."

# Local paths of the downloaded Kokoro files and the URLs they are fetched from.
MODEL_DIR = Path("data/models/kokoro-onnx")
MODEL_FILE = MODEL_DIR / "kokoro-v1.0.int8.onnx"
VOICES_FILE = MODEL_DIR / "voices-v1.0.bin"
MODEL_URL = "https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/kokoro-v1.0.int8.onnx"
VOICES_URL = "https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/voices-v1.0.bin"

# Maps the label shown in the voice dropdown to a (voice id, language code)
# pair; speak() passes these to Kokoro as `voice=` and `lang=`.
# In this table the ids starting with "a" pair with "en-us" and those starting
# with "b" pair with "en-gb"; the second letter is "f" for female, "m" for male.
# The dropdown lists the entries in the order written here.
VOICES = {
    "US female - Heart": ("af_heart", "en-us"),
    "US female - Bella": ("af_bella", "en-us"),
    "US female - Nicole": ("af_nicole", "en-us"),
    "US female - Sarah": ("af_sarah", "en-us"),
    "US female - Sky": ("af_sky", "en-us"),
    "US male - Adam": ("am_adam", "en-us"),
    "US male - Fenrir": ("am_fenrir", "en-us"),
    "US male - Michael": ("am_michael", "en-us"),
    "US male - Puck": ("am_puck", "en-us"),
    "UK female - Alice": ("bf_alice", "en-gb"),
    "UK female - Emma": ("bf_emma", "en-gb"),
    "UK female - Isabella": ("bf_isabella", "en-gb"),
    "UK female - Lily": ("bf_lily", "en-gb"),
    "UK male - Daniel": ("bm_daniel", "en-gb"),
    "UK male - Fable": ("bm_fable", "en-gb"),
    "UK male - George": ("bm_george", "en-gb"),
    "UK male - Lewis": ("bm_lewis", "en-gb"),
}


def download_file(url, path):
    """Download ``url`` to ``path`` unless the file already exists.

    Args:
        url: HTTP(S) address of the file to fetch.
        path: ``pathlib.Path`` of the destination file.

    Returns:
        None. The function returns immediately when ``path`` already exists.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeouts.
    """
    if path.exists():
        return

    # Create the destination's own directory so the helper works for any
    # target path, not only for files that live in MODEL_DIR.
    path.parent.mkdir(parents=True, exist_ok=True)
    print(f"Downloading {path.name}...")

    # Stream into a sibling ".part" file and rename it at the very end: an
    # interrupted download then never leaves a truncated file at ``path`` that
    # the exists() check above would mistake for a complete one.
    partial = path.with_name(path.name + ".part")

    # The timeout makes a stalled connection fail instead of hanging forever;
    # streaming avoids holding the whole file in memory at once.
    with requests.get(url, stream=True, timeout=30) as response:
        response.raise_for_status()
        with partial.open("wb") as handle:
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                handle.write(chunk)

    partial.replace(path)


def download_models():
    """Make sure the Kokoro model file and the voices file are on disk.

    The model is handled first, then the voices; each one is delegated to
    ``download_file``.
    """
    targets = (
        (MODEL_URL, MODEL_FILE),
        (VOICES_URL, VOICES_FILE),
    )
    for source_url, destination in targets:
        download_file(source_url, destination)


@lru_cache
def kokoro():
    """Return the Kokoro engine, building it on the first call.

    The function takes no arguments and is wrapped in ``lru_cache``, so the
    engine is constructed once and the same object is handed back afterwards.
    The model files are downloaded first when they are missing.
    """
    # Imported here rather than at module level, so kokoro_onnx is only
    # loaded when an engine is actually requested.
    from kokoro_onnx import Kokoro

    download_models()

    model_path = str(MODEL_FILE)
    voices_path = str(VOICES_FILE)
    return Kokoro(model_path, voices_path)


def speak(text, voice_name, speed):
    """Synthesize ``text`` with the chosen voice and speed.

    Args:
        text: Text to read aloud; surrounding whitespace is ignored.
        voice_name: A key of ``VOICES`` (the label picked in the dropdown).
        speed: Speed value forwarded to Kokoro.

    Returns:
        A ``(sample_rate, audio)`` tuple, as produced by Kokoro's ``create``
        but with the two values swapped.

    Raises:
        gr.Error: If ``text`` is empty after stripping whitespace.
    """
    cleaned = text.strip()
    if cleaned == "":
        raise gr.Error("Enter some text first.")

    voice_id, lang_code = VOICES[voice_name]
    engine = kokoro()
    samples, rate = engine.create(
        cleaned,
        voice=voice_id,
        speed=speed,
        lang=lang_code,
    )
    return rate, samples


with gr.Blocks(title="Local TTS demo") as demo:
    gr.Markdown("# Local TTS demo")

    # One row with two columns: the inputs and the generated audio.
    with gr.Row():
        with gr.Column():
            text = gr.Textbox(value=DEFAULT_TEXT, lines=4, label="Text")
            voice = gr.Dropdown(
                choices=[*VOICES],
                value="US female - Heart",
                label="Voice",
            )
            speed = gr.Slider(
                minimum=0.7,
                maximum=1.3,
                value=1.0,
                step=0.05,
                label="Speed",
            )
            button = gr.Button(value="Generate", variant="primary")
        with gr.Column():
            output = gr.Audio(label="Output")

    # Clicking the button runs speak() on the three inputs and sends the
    # result to the audio component.
    button.click(fn=speak, inputs=[text, voice, speed], outputs=output)

In [ ]:

import os

# Set GRADIO_DEBUG before launching.
# NOTE(review): presumably Gradio reads this variable to turn on debug mode
# (like launch(debug=True)) — confirm against the Gradio docs.
os.environ['GRADIO_DEBUG'] = '1'

# Start the app built in the `demo` Blocks object defined above.
demo.launch()