#!/usr/bin/env python3
"""
Piper TTS Gradio Demo for Hugging Face Spaces
Supports Japanese and English text-to-speech using ONNX models
"""

import json
import logging

import gradio as gr
import numpy as np
import onnxruntime

from app_imports import ESPEAK_AVAILABLE, PYOPENJTALK_AVAILABLE

# Download models if not present
from download_models import download_models

# Ensure models are downloaded
download_models()

# Import optional dependencies
if PYOPENJTALK_AVAILABLE:
    import pyopenjtalk

if ESPEAK_AVAILABLE:
    from espeak_phonemizer import Phonemizer

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Model configurations
MODELS = {
    "Japanese (Medium)": {
        "path": "models/ja_JP-test-medium.onnx",
        "config": "models/ja_JP-test-medium.onnx.json",
        "language": "ja",
    },
    "English (Test)": {
        "path": "models/test_voice.onnx",
        "config": "models/test_voice.onnx.json",
        "language": "en",
    },
}

# Basic English word to IPA mapping for common words
# This is a simplified fallback when espeak-ng is not available
ENGLISH_IPA_MAP = {
    "hello": "hɛloʊ",
    "world": "wɜrld",
    "this": "ðɪs",
    "is": "ɪz",
    "a": "ə",
    "test": "tɛst",
    "text": "tɛkst",
    "to": "tu",
    "speech": "spitʃ",
    "demo": "dɛmoʊ",
    "welcome": "wɛlkəm",
    "piper": "paɪpər",
    "tts": "titiɛs",
    "enjoy": "ɛndʒɔɪ",
    "high": "haɪ",
    "quality": "kwɑləti",
    "synthesis": "sɪnθəsɪs",
    "the": "ðə",
    "and": "ænd",
    "for": "fɔr",
    "with": "wɪð",
    "you": "ju",
    "can": "kæn",
    "it": "ɪt",
    "that": "ðæt",
    "have": "hæv",
    "from": "frʌm",
    "or": "ɔr",
    "which": "wɪtʃ",
    "one": "wʌn",
    "would": "wʊd",
    "all": "ɔl",
    "will": "wɪl",
    "there": "ðɛr",
    "say": "seɪ",
    "who": "hu",
    "make": "meɪk",
    "when": "wɛn",
    "time": "taɪm",
    "if": "ɪf",
    "no": "noʊ",
    "way": "weɪ",
    "has": "hæz",
    "yes": "jɛs",
    "good": "gʊd",
    "very": "vɛri",
}

# Japanese multi-character phoneme to Unicode PUA mapping
# This mapping must match the C++ implementation and training data
PHONEME_TO_PUA = {
    # Long vowels
    "a:": "\ue000",
    "i:": "\ue001",
    "u:": "\ue002",
    "e:": "\ue003",
    "o:": "\ue004",
    # Special consonants
    "cl": "\ue005",  # Geminate/glottal stop
    # Palatalized consonants
    "ky": "\ue006",
    "kw": "\ue007",
    "gy": "\ue008",
    "gw": "\ue009",
    "ty": "\ue00a",
    "dy": "\ue00b",
    "py": "\ue00c",
    "by": "\ue00d",
    # Affricates and special sounds
    "ch": "\ue00e",
    "ts": "\ue00f",
    "sh": "\ue010",
    "zy": "\ue011",
    "hy": "\ue012",
    # Palatalized nasals/liquids
    "ny": "\ue013",
    "my": "\ue014",
    "ry": "\ue015",
}


def load_model_config(config_path: str) -> dict:
    """Load model configuration from JSON file"""
    with open(config_path, encoding="utf-8") as f:
        return json.load(f)


def map_phonemes(phonemes: list[str]) -> list[str]:
    """Map multi-character phonemes to Unicode PUA characters"""
    result = []
    for phoneme in phonemes:
        if phoneme in PHONEME_TO_PUA:
            result.append(PHONEME_TO_PUA[phoneme])
        else:
            result.append(phoneme)
    return result


def text_to_phonemes(text: str, language: str) -> list[str]:
    """Convert text to phoneme strings based on language"""
    if language == "ja":
        if PYOPENJTALK_AVAILABLE:
            # Get phonemes from OpenJTalk
            labels = pyopenjtalk.extract_fullcontext(text)
            phonemes = []
            for label in labels:
                # Extract phoneme from label
                if "-" in label and "+" in label:
                    phoneme = label.split("-")[1].split("+")[0]
                    if phoneme not in ["sil", "pau"]:
                        phonemes.append(phoneme)
            # Add sentence markers
            phonemes = ["^"] + phonemes + ["$"]
            # Convert multi-character phonemes to Unicode PUA
            phonemes = map_phonemes(phonemes)
        else:
            logger.warning("pyopenjtalk not available, using fallback")
            # Simple fallback - just use dummy phonemes
["^"] + list("aiueo") * 5 + ["$"] elif ESPEAK_AVAILABLE: # English phonemizer = Phonemizer("en-us") phoneme_str = phonemizer.phonemize(text) # Convert phoneme string to list phonemes = ["^"] + list(phoneme_str.replace(" ", "")) + ["$"] else: logger.warning("espeak_phonemizer not available, using IPA fallback") # IPA-based fallback for better English pronunciation words = text.lower().split() phonemes = ["^"] for i, word in enumerate(words): # Add space between words if i > 0: phonemes.append(" ") # Remove punctuation from word clean_word = "".join(c for c in word if c.isalpha()) if clean_word in ENGLISH_IPA_MAP: # Use IPA mapping if available ipa = ENGLISH_IPA_MAP[clean_word] phonemes.extend(list(ipa)) else: # Fall back to character-by-character for unknown words phonemes.extend(list(clean_word)) phonemes.append("$") return phonemes def phonemes_to_ids(phonemes: list[str], config: dict) -> list[int]: """Convert phonemes to model input IDs""" phoneme_id_map = config.get("phoneme_id_map", {}) ids = [] for phoneme in phonemes: if phoneme in phoneme_id_map: ids.extend(phoneme_id_map[phoneme]) else: # Use pad token for unknown phonemes ids.append(0) return ids def synthesize_speech( text: str, model_name: str, speaker_id: int = 0, length_scale: float = 1.0, noise_scale: float = 0.667, noise_w: float = 0.8, ) -> tuple[int, np.ndarray]: """Generate speech from text using selected model""" if not text.strip(): raise gr.Error("Please enter some text") if model_name not in MODELS: raise gr.Error("Invalid model selected") model_info = MODELS[model_name] config = load_model_config(model_info["config"]) # Convert text to phoneme IDs phonemes = text_to_phonemes(text, model_info["language"]) phoneme_ids = phonemes_to_ids(phonemes, config) if not phoneme_ids: raise gr.Error("Failed to convert text to phonemes") # Load ONNX model sess_options = onnxruntime.SessionOptions() sess_options.inter_op_num_threads = 1 sess_options.intra_op_num_threads = 1 try: model = onnxruntime.InferenceSession( model_info["path"], sess_options=sess_options, providers=["CPUExecutionProvider"], ) except Exception as e: logger.error(f"Failed to load model: {e}") raise gr.Error(f"Failed to load model: {str(e)}") from e # Prepare inputs text_array = np.expand_dims(np.array(phoneme_ids, dtype=np.int64), 0) text_lengths = np.array([text_array.shape[1]], dtype=np.int64) scales = np.array([noise_scale, length_scale, noise_w], dtype=np.float32) # Handle speaker ID for multi-speaker models sid = None if config.get("num_speakers", 1) > 1: sid = np.array([speaker_id], dtype=np.int64) # Run inference try: inputs = { "input": text_array, "input_lengths": text_lengths, "scales": scales, } if sid is not None: inputs["sid"] = sid audio = model.run(None, inputs)[0] # Remove batch and channel dimensions audio = audio.squeeze() # Convert to int16 audio = np.clip(audio * 32767, -32768, 32767).astype(np.int16) sample_rate = config.get("audio", {}).get("sample_rate", 22050) return sample_rate, audio except Exception as e: logger.error(f"Inference failed: {e}") raise gr.Error(f"Failed to generate speech: {str(e)}") from e def create_interface(): """Create Gradio interface""" with gr.Blocks(title="Piper TTS Demo") as interface: gr.Markdown(""" # 🎙️ Piper TTS Demo High-quality text-to-speech synthesis supporting Japanese and English. This demo uses ONNX models for fast CPU inference. 
""") with gr.Row(): with gr.Column(scale=2): model_dropdown = gr.Dropdown( choices=list(MODELS.keys()), label="Select Model", value=list(MODELS.keys())[0], ) text_input = gr.Textbox( label="Text to synthesize", placeholder="Enter text here...", lines=3, ) # Advanced Settings without Accordion (flattened) gr.Markdown("### Advanced Settings") speaker_id = gr.Number( label="Speaker ID (for multi-speaker models)", value=0, precision=0, minimum=0, maximum=10, ) length_scale = gr.Slider( label="Speed (Lower = faster speech)", minimum=0.5, maximum=2.0, value=1.0, step=0.1, ) noise_scale = gr.Slider( label="Expressiveness", minimum=0.0, maximum=1.0, value=0.667, step=0.01, ) noise_w = gr.Slider( label="Phoneme Duration Variance", minimum=0.0, maximum=1.0, value=0.8, step=0.01, ) synthesize_btn = gr.Button("Generate Speech", variant="primary") with gr.Column(scale=2): audio_output = gr.Audio( label="Generated Speech", type="numpy", autoplay=True, ) gr.Markdown(""" ### Tips: - Japanese model expects hiragana/kanji text - English model works with standard text - Adjust speed for faster/slower speech - Higher expressiveness = more natural variation """) # Examples gr.Examples( examples=[ ["こんにちは、世界!今日はいい天気ですね。", "Japanese (Medium)"], [ "おはようございます。本日の会議は午後3時から始まります。", "Japanese (Medium)", ], ["Hello world! This is a text to speech demo.", "English (Test)"], [ "Welcome to Piper TTS. Enjoy high quality speech synthesis.", "English (Test)", ], ], inputs=[text_input, model_dropdown], ) # Event handlers synthesize_btn.click( fn=synthesize_speech, inputs=[ text_input, model_dropdown, speaker_id, length_scale, noise_scale, noise_w, ], outputs=audio_output, ) return interface def create_minimal_interface(): """Create a minimal fallback interface if main interface fails""" with gr.Blocks(title="Piper TTS Demo") as interface: gr.Markdown("# 🎙️ Piper TTS Demo") text_input = gr.Textbox( label="Text to synthesize", placeholder="Enter text here...", lines=3, ) model_dropdown = gr.Dropdown( choices=list(MODELS.keys()), label="Select Model", value=list(MODELS.keys())[0], ) synthesize_btn = gr.Button("Generate Speech", variant="primary") audio_output = gr.Audio( label="Generated Speech", type="numpy", ) synthesize_btn.click( fn=lambda text, model: synthesize_speech(text, model, 0, 1.0, 0.667, 0.8), inputs=[text_input, model_dropdown], outputs=audio_output, ) return interface # Create and launch the app # Move interface creation inside main block to avoid context issues interface = None if __name__ == "__main__": # Create and launch interface interface = create_interface() # Launch with minimal configuration for Hugging Face Spaces interface.launch()