Spaces:

ayousanz
/

piper-plus-demo

Running

File size: 12,913 Bytes

#!/usr/bin/env python3
"""
Piper TTS Gradio Demo for Hugging Face Spaces
Supports Japanese and English text-to-speech using ONNX models
"""

import json
import logging

import gradio as gr
import numpy as np
import onnxruntime
from app_imports import ESPEAK_AVAILABLE, PYOPENJTALK_AVAILABLE

# Download models if not present
from download_models import download_models


# Ensure models are downloaded.
# This runs at import time, before any model path in MODELS is opened.
download_models()


# Import optional dependencies.
# Each phonemizer backend is imported only when its availability flag (from
# app_imports) is set; text_to_phonemes() checks the same flags before
# touching `pyopenjtalk` or `Phonemizer`, so the names may be undefined here.
if PYOPENJTALK_AVAILABLE:
    import pyopenjtalk
if ESPEAK_AVAILABLE:
    from espeak_phonemizer import Phonemizer


# Configure logging: INFO level on the root logger, plus a module logger
# used for the warnings and errors emitted below.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Model configurations, keyed by the label shown in the model dropdown.
# Each entry provides:
#   "path"     - ONNX model file handed to onnxruntime.InferenceSession
#   "config"   - JSON config file read by load_model_config()
#   "language" - language code passed to text_to_phonemes(); "ja" selects the
#                Japanese path, any other value is handled as English
MODELS = {
    "Japanese (Medium)": {
        "path": "models/ja_JP-test-medium.onnx",
        "config": "models/ja_JP-test-medium.onnx.json",
        "language": "ja",
    },
    "English (Test)": {
        "path": "models/test_voice.onnx",
        "config": "models/test_voice.onnx.json",
        "language": "en",
    },
}

# Basic English word to IPA mapping for common words.
# This is a simplified fallback used by text_to_phonemes() when espeak-ng is
# not available. Keys are lowercase words with non-letters removed; each value
# is split into single characters, one phoneme symbol per character.
# NOTE(review): values use ASCII "g" and "r" where espeak-style IPA output
# typically has "ɡ" (U+0261) and "ɹ" - confirm these symbols exist in the
# model's phoneme_id_map, because unknown symbols are replaced by the pad ID.
ENGLISH_IPA_MAP = {
    "hello": "hɛloʊ",
    "world": "wɜrld",
    "this": "ðɪs",
    "is": "ɪz",
    "a": "ə",
    "test": "tɛst",
    "text": "tɛkst",
    "to": "tu",
    "speech": "spitʃ",
    "demo": "dɛmoʊ",
    "welcome": "wɛlkəm",
    "piper": "paɪpər",
    "tts": "titiɛs",
    "enjoy": "ɛndʒɔɪ",
    "high": "haɪ",
    "quality": "kwɑləti",
    "synthesis": "sɪnθəsɪs",
    "the": "ðə",
    "and": "ænd",
    "for": "fɔr",
    "with": "wɪð",
    "you": "ju",
    "can": "kæn",
    "it": "ɪt",
    "that": "ðæt",
    "have": "hæv",
    "from": "frʌm",
    "or": "ɔr",
    "which": "wɪtʃ",
    "one": "wʌn",
    "would": "wʊd",
    "all": "ɔl",
    "will": "wɪl",
    "there": "ðɛr",
    "say": "seɪ",
    "who": "hu",
    "make": "meɪk",
    "when": "wɛn",
    "time": "taɪm",
    "if": "ɪf",
    "no": "noʊ",
    "way": "weɪ",
    "has": "hæz",
    "yes": "jɛs",
    "good": "gʊd",
    "very": "vɛri",
}

# Japanese multi-character phoneme to Unicode PUA mapping.
# Every multi-character phoneme is replaced by one Private Use Area code point
# (U+E000-U+E015) so that each phoneme is a single character.
# This mapping must match the C++ implementation and training data, so the
# code points below must not be renumbered or reordered.
PHONEME_TO_PUA = {
    # Long vowels
    "a:": "\ue000",
    "i:": "\ue001",
    "u:": "\ue002",
    "e:": "\ue003",
    "o:": "\ue004",
    # Special consonants
    "cl": "\ue005",  # Geminate/glottal stop
    # Palatalized (-y) and labialized (-w) consonants
    "ky": "\ue006",
    "kw": "\ue007",
    "gy": "\ue008",
    "gw": "\ue009",
    "ty": "\ue00a",
    "dy": "\ue00b",
    "py": "\ue00c",
    "by": "\ue00d",
    # Affricates and special sounds
    "ch": "\ue00e",
    "ts": "\ue00f",
    "sh": "\ue010",
    "zy": "\ue011",
    "hy": "\ue012",
    # Palatalized nasals/liquids
    "ny": "\ue013",
    "my": "\ue014",
    "ry": "\ue015",
}


def load_model_config(config_path: str) -> dict:
    """Read a model's JSON configuration file and return the parsed content.

    Args:
        config_path: Path to a UTF-8 encoded JSON file.

    Returns:
        The decoded JSON document.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    with open(config_path, encoding="utf-8") as config_file:
        raw_json = config_file.read()
    return json.loads(raw_json)


def map_phonemes(phonemes: list[str]) -> list[str]:
    """Replace multi-character phonemes with their single PUA character.

    Args:
        phonemes: Phoneme strings in order.

    Returns:
        A new list of the same length in which every phoneme that is a key of
        ``PHONEME_TO_PUA`` is swapped for its mapped character; all other
        entries are passed through unchanged.
    """
    return [PHONEME_TO_PUA.get(symbol, symbol) for symbol in phonemes]


def _japanese_phonemes(text: str) -> list[str]:
    """Phonemize Japanese text with OpenJTalk, or return a placeholder sequence."""
    if not PYOPENJTALK_AVAILABLE:
        logger.warning("pyopenjtalk not available, using fallback")
        # Simple fallback - dummy phonemes that do not depend on the text
        return ["^"] + list("aiueo") * 5 + ["$"]

    phonemes = ["^"]
    for label in pyopenjtalk.extract_fullcontext(text):
        # The current phoneme is the field between "-" and "+" in each label
        if "-" not in label or "+" not in label:
            continue
        phoneme = label.split("-")[1].split("+")[0]
        # Silence and pause labels are dropped
        if phoneme not in ("sil", "pau"):
            phonemes.append(phoneme)
    phonemes.append("$")

    # Convert multi-character phonemes to Unicode PUA
    return map_phonemes(phonemes)


def _espeak_phonemes(text: str) -> list[str]:
    """Phonemize English text with espeak, one symbol per output character."""
    phoneme_str = Phonemizer("en-us").phonemize(text)
    # NOTE(review): spaces are stripped here, whereas the IPA fallback keeps a
    # " " between words - confirm which form the English model expects.
    return ["^"] + list(phoneme_str.replace(" ", "")) + ["$"]


def _ipa_fallback_phonemes(text: str) -> list[str]:
    """Approximate English phonemes from ENGLISH_IPA_MAP without espeak."""
    phonemes = ["^"]
    for word in text.lower().split():
        # Remove punctuation, digits and other non-letters from the word
        clean_word = "".join(c for c in word if c.isalpha())
        if not clean_word:
            # Nothing pronounceable: skip it so that it does not leave a
            # stray separator (doubled or trailing space) in the sequence.
            continue

        # Separate from the previously emitted word
        if len(phonemes) > 1:
            phonemes.append(" ")

        # Known words use their IPA spelling; unknown words fall back to
        # their own letters. Either way one symbol is emitted per character.
        phonemes.extend(ENGLISH_IPA_MAP.get(clean_word, clean_word))

    phonemes.append("$")
    return phonemes


def text_to_phonemes(text: str, language: str) -> list[str]:
    """Convert text to a list of phoneme symbols for the given language.

    The result always starts with "^" and ends with "$".

    Args:
        text: Text to convert.
        language: "ja" selects the Japanese path; any other value is treated
            as English.

    Returns:
        Phoneme symbols, one list entry per symbol. For Japanese,
        multi-character phonemes are already mapped to PUA characters.
    """
    if language == "ja":
        return _japanese_phonemes(text)

    if ESPEAK_AVAILABLE:  # English
        return _espeak_phonemes(text)

    logger.warning("espeak_phonemizer not available, using IPA fallback")
    return _ipa_fallback_phonemes(text)


def phonemes_to_ids(phonemes: list[str], config: dict) -> list[int]:
    """Translate phoneme symbols into the flat ID sequence fed to the model.

    Args:
        phonemes: Phoneme symbols in order.
        config: Model configuration; its "phoneme_id_map" entry maps each
            symbol to a sequence of IDs. A missing entry is treated as empty.

    Returns:
        The concatenated IDs of all phonemes. A symbol that is absent from
        the map contributes a single pad ID (0).
    """
    lookup = config.get("phoneme_id_map", {})
    pad_only = (0,)  # stand-in for symbols the model does not know

    encoded = []
    for symbol in phonemes:
        encoded.extend(lookup.get(symbol, pad_only))
    return encoded


# Loaded ONNX sessions keyed by model path, so that each model file is read
# and initialised once per process instead of on every request.
_SESSION_CACHE: dict = {}


def _get_session(model_path: str):
    """Return the cached CPU inference session for *model_path*, loading it on first use."""
    session = _SESSION_CACHE.get(model_path)
    if session is None:
        sess_options = onnxruntime.SessionOptions()
        sess_options.inter_op_num_threads = 1
        sess_options.intra_op_num_threads = 1
        session = onnxruntime.InferenceSession(
            model_path,
            sess_options=sess_options,
            providers=["CPUExecutionProvider"],
        )
        # Only successful loads are stored; a failed load is retried next time.
        _SESSION_CACHE[model_path] = session
    return session


def synthesize_speech(
    text: str,
    model_name: str,
    speaker_id: int = 0,
    length_scale: float = 1.0,
    noise_scale: float = 0.667,
    noise_w: float = 0.8,
) -> tuple[int, np.ndarray]:
    """Generate speech from text using the selected model.

    Args:
        text: Text to synthesize; must contain non-whitespace characters.
        model_name: A key of ``MODELS``.
        speaker_id: Speaker index; only used when the model config reports
            more than one speaker.
        length_scale: Second entry of the model's "scales" input.
        noise_scale: First entry of the model's "scales" input.
        noise_w: Third entry of the model's "scales" input.

    Returns:
        ``(sample_rate, audio)`` where ``audio`` is an int16 array and
        ``sample_rate`` comes from the model config (22050 if unspecified).

    Raises:
        gr.Error: For empty text, an unknown model, an unreadable config, an
            out-of-range speaker ID, or a failure while loading or running
            the model.
    """
    # Guard against None as well as blank text
    if not text or not text.strip():
        raise gr.Error("Please enter some text")

    if model_name not in MODELS:
        raise gr.Error("Invalid model selected")

    model_info = MODELS[model_name]
    try:
        config = load_model_config(model_info["config"])
    except (OSError, ValueError) as e:
        # ValueError covers malformed JSON and undecodable bytes
        logger.exception("Failed to load model config")
        raise gr.Error(f"Failed to load model config: {e}") from e

    # Convert text to phoneme IDs
    phonemes = text_to_phonemes(text, model_info["language"])
    phoneme_ids = phonemes_to_ids(phonemes, config)

    if not phoneme_ids:
        raise gr.Error("Failed to convert text to phonemes")

    # Load the ONNX model (reused across calls)
    try:
        model = _get_session(model_info["path"])
    except Exception as e:
        logger.exception("Failed to load model")
        raise gr.Error(f"Failed to load model: {str(e)}") from e

    # Prepare inputs: a batch of one phoneme-ID sequence plus its length
    text_array = np.expand_dims(np.array(phoneme_ids, dtype=np.int64), 0)
    text_lengths = np.array([text_array.shape[1]], dtype=np.int64)
    scales = np.array([noise_scale, length_scale, noise_w], dtype=np.float32)

    # Handle speaker ID for multi-speaker models
    sid = None
    num_speakers = config.get("num_speakers") or 1
    if num_speakers > 1:
        # speaker_id may arrive as None (e.g. an emptied number field) or as
        # a float; normalise it before building the tensor.
        speaker_index = int(speaker_id) if speaker_id is not None else 0
        if not 0 <= speaker_index < num_speakers:
            raise gr.Error(f"Speaker ID must be between 0 and {num_speakers - 1}")
        sid = np.array([speaker_index], dtype=np.int64)

    # Run inference
    try:
        inputs = {
            "input": text_array,
            "input_lengths": text_lengths,
            "scales": scales,
        }

        if sid is not None:
            inputs["sid"] = sid

        audio = model.run(None, inputs)[0]

        # Remove batch and channel dimensions
        audio = audio.squeeze()

        # Convert to int16 (presumably the model emits floats in [-1, 1];
        # values outside that range are clipped)
        audio = np.clip(audio * 32767, -32768, 32767).astype(np.int16)

        sample_rate = config.get("audio", {}).get("sample_rate", 22050)

        return sample_rate, audio

    except Exception as e:
        logger.exception("Inference failed")
        raise gr.Error(f"Failed to generate speech: {str(e)}") from e


def create_interface():
    """Create the main Gradio interface.

    Layout: a header, then one row with two equally weighted columns. The left
    column holds the model selector, text box, advanced settings and the
    generate button; the right column holds the audio player and usage tips.
    Example inputs are listed below the row.

    Returns:
        The assembled ``gr.Blocks`` app (not yet launched).
    """
    model_names = list(MODELS)

    with gr.Blocks(title="Piper TTS Demo") as interface:
        gr.Markdown("""
            # 🎙️ Piper TTS Demo

            High-quality text-to-speech synthesis supporting Japanese and English.

            This demo uses ONNX models for fast CPU inference.
            """)

        # Both columns must live inside the same Row: Column.scale is only
        # meaningful relative to sibling columns.
        with gr.Row():
            with gr.Column(scale=2):
                model_dropdown = gr.Dropdown(
                    choices=model_names,
                    label="Select Model",
                    value=model_names[0],
                )

                text_input = gr.Textbox(
                    label="Text to synthesize",
                    placeholder="Enter text here...",
                    lines=3,
                )

                # Advanced Settings without Accordion (flattened)
                gr.Markdown("### Advanced Settings")

                speaker_id = gr.Number(
                    label="Speaker ID (for multi-speaker models)",
                    value=0,
                    precision=0,
                    minimum=0,
                    maximum=10,
                )

                length_scale = gr.Slider(
                    label="Speed (Lower = faster speech)",
                    minimum=0.5,
                    maximum=2.0,
                    value=1.0,
                    step=0.1,
                )

                noise_scale = gr.Slider(
                    label="Expressiveness",
                    minimum=0.0,
                    maximum=1.0,
                    value=0.667,
                    step=0.01,
                )

                noise_w = gr.Slider(
                    label="Phoneme Duration Variance",
                    minimum=0.0,
                    maximum=1.0,
                    value=0.8,
                    step=0.01,
                )

                # Kept with the inputs it submits
                synthesize_btn = gr.Button("Generate Speech", variant="primary")

            with gr.Column(scale=2):
                audio_output = gr.Audio(
                    label="Generated Speech",
                    type="numpy",
                    autoplay=True,
                )

                gr.Markdown("""
                ### Tips:
                - Japanese model expects hiragana/kanji text
                - English model works with standard text
                - Adjust speed for faster/slower speech
                - Higher expressiveness = more natural variation
                """)

        # Examples: each entry fills the text box and selects the matching model
        gr.Examples(
            examples=[
                ["こんにちは、世界！今日はいい天気ですね。", "Japanese (Medium)"],
                [
                    "おはようございます。本日の会議は午後3時から始まります。",
                    "Japanese (Medium)",
                ],
                ["Hello world! This is a text to speech demo.", "English (Test)"],
                [
                    "Welcome to Piper TTS. Enjoy high quality speech synthesis.",
                    "English (Test)",
                ],
            ],
            inputs=[text_input, model_dropdown],
        )

        # Event handlers: input order matches synthesize_speech's parameters
        synthesize_btn.click(
            fn=synthesize_speech,
            inputs=[
                text_input,
                model_dropdown,
                speaker_id,
                length_scale,
                noise_scale,
                noise_w,
            ],
            outputs=audio_output,
        )

    return interface


def create_minimal_interface():
    """Build a stripped-down fallback UI for when the main interface fails.

    Only the text box, model selector, generate button and audio player are
    shown; synthesis always runs with speaker 0 and the default scale values.

    Returns:
        The assembled ``gr.Blocks`` app (not yet launched).
    """
    model_names = list(MODELS)

    with gr.Blocks(title="Piper TTS Demo") as interface:
        gr.Markdown("# 🎙️ Piper TTS Demo")

        # Components are created in display order: text, model, button, audio
        text_input = gr.Textbox(
            lines=3,
            placeholder="Enter text here...",
            label="Text to synthesize",
        )
        model_dropdown = gr.Dropdown(
            value=model_names[0],
            label="Select Model",
            choices=model_names,
        )
        synthesize_btn = gr.Button("Generate Speech", variant="primary")
        audio_output = gr.Audio(type="numpy", label="Generated Speech")

        # No advanced controls here, so the remaining arguments are fixed
        synthesize_btn.click(
            fn=lambda text, model: synthesize_speech(
                text,
                model,
                speaker_id=0,
                length_scale=1.0,
                noise_scale=0.667,
                noise_w=0.8,
            ),
            inputs=[text_input, model_dropdown],
            outputs=audio_output,
        )

    return interface


# Create and launch the app.
# Interface creation happens inside the main block to avoid context issues.
# `interface` stays None when this module is imported rather than run as a
# script.
interface = None

if __name__ == "__main__":
    # Build the full interface (create_minimal_interface() is not used here)
    interface = create_interface()
    # Launch with minimal configuration for Hugging Face Spaces
    interface.launch()