Commit a3cf4a0: Update with new models
Parent: 60c1f8a

app.py CHANGED
@@ -15,7 +15,7 @@ from transformers import (
 )
 from datasets import load_dataset
 from scipy.io.wavfile import write as wav_write
-from huggingface_hub import InferenceClient, snapshot_download
+from huggingface_hub import InferenceClient, snapshot_download, hf_hub_download
 from huggingface_hub.utils import HfHubHTTPError

 # Pre-selected Arabic-focused TTS models on Hugging Face (verified public repos)
@@ -26,14 +26,8 @@ ARABIC_TTS_MODELS = {
         "hosted": False,
         "description": "Official MMS checkpoint for Modern Standard Arabic",
     },
-    "
-        "repo_id": "
-        "engine": "vits",
-        "hosted": False,
-        "description": "MMS model for the Arabic (Arabela) locale",
-    },
-    "VITS (Community) — wasmdashai/vits-ar": {
-        "repo_id": "wasmdashai/vits-ar",
+    "VITS (Community) — wasmdashai/vits-ar-sa-A": {
+        "repo_id": "wasmdashai/vits-ar-sa-A",
         "engine": "vits",
         "hosted": False,
         "description": "Community-trained VITS voice focused on Arabic",
@@ -44,14 +38,17 @@ ARABIC_TTS_MODELS = {
         "hosted": False,
         "description": "MBZUAI SpeechT5 fine-tune for Classical Arabic",
     },
-    "
-        "repo_id": "
-        "engine": "
+    "Saudi TTS — AhmedEladl/saudi-tts": {
+        "repo_id": "AhmedEladl/saudi-tts",
+        "engine": "xtts",
         "hosted": False,
-        "description": "
-
-
-    "
+        "description": "Coqui XTTS-style Saudi Arabic model (.pth checkpoint). Provide local paths below.",
+    },
+    "XTTS v2 — coqui/XTTS-v2": {
+        "repo_id": "coqui/XTTS-v2",
+        "engine": "xtts",
+        "hosted": False,
+        "description": "Official Coqui XTTS v2. Use local snapshot and speaker WAV; supports synthesize().",
     },
 }

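A minimal sketch of how the app presumably consumes these registry entries, dispatching on the "engine" tag (the loader names come from later hunks in this same commit; the cache path is illustrative):

meta = ARABIC_TTS_MODELS["XTTS v2 — coqui/XTTS-v2"]
if meta["engine"] == "vits":
    # VITS entries go through the transformers-based loader
    model, tokenizer = load_local_model(meta["repo_id"], cache_dir="hf-cache")
elif meta["engine"] == "xtts":
    # XTTS entries are routed through load_xtts_model(); see the loader hunk below
    pass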
@@ -124,6 +121,47 @@ with st.sidebar.expander("Model assets", expanded=False):
         st.sidebar.error(f"Download failed: {dl_err}")
         logger.exception("Download failed for %s", model_id)

+# Remember last chosen download dir for defaults
+try:
+    st.session_state["_last_download_dir"] = download_dir
+except Exception:
+    pass
+
+# XTTS-specific path inputs now that download_dir is defined
+xtts_config_path = None
+xtts_vocab_path = None
+xtts_checkpoint_dir = None
+xtts_speaker_wav = None
+xtts_temperature = 0.75
+if model_meta["engine"] == "xtts":
+    with st.sidebar.expander("XTTS local paths", expanded=True):
+        base = Path(st.session_state.get("_last_download_dir", DEFAULT_DOWNLOAD_DIR)).expanduser()
+        xtts_config_path = st.text_input(
+            "config.json path",
+            value=str(base / "config.json"),
+            help="Absolute or relative path to XTTS config.json",
+        )
+        xtts_vocab_path = st.text_input(
+            "vocab.json path",
+            value=str(base / "vocab.json"),
+            help="Optional: path to vocab.json (if required by your checkpoint)",
+        )
+        xtts_checkpoint_dir = st.text_input(
+            "Checkpoint directory",
+            value=str(base),
+            help="Directory containing the model .pth checkpoint",
+        )
+        xtts_speaker_wav = st.text_input(
+            "Speaker WAV path",
+            value=str(base / "speaker.wav"),
+            help="Path to a short reference WAV for voice cloning",
+        )
+        xtts_temperature = st.slider("XTTS temperature", 0.1, 1.2, 0.75, 0.05)
+    with st.sidebar.expander("XTTS options", expanded=False):
+        xtts_language = st.text_input("Language code", value="ar", help="e.g., ar, en, fr…")
+        xtts_gpt_cond_len = st.slider("GPT conditioning length", 1, 10, 3, 1)
+        xtts_use_synthesize = st.checkbox("Use synthesize() if available", value=True)
+
 if LOG_FILE.exists():
     with open(LOG_FILE, "rb") as log_file:
         st.sidebar.download_button(
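The sidebar defaults above assume the XTTS assets already sit under the chosen download directory. A hedged sketch of pre-fetching them with the snapshot_download import from the first hunk (coqui/XTTS-v2 ships config.json, vocab.json and model.pth; the target directory is illustrative):

from huggingface_hub import snapshot_download

# Materialize config.json, vocab.json and model.pth locally so the
# sidebar's default paths resolve to real files.
local_dir = snapshot_download(repo_id="coqui/XTTS-v2", cache_dir="downloads")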
@@ -170,19 +208,29 @@ sample_rate = st.sidebar.number_input("Sample rate", value=16000, min_value=8000

 @st.cache_resource(show_spinner=False)
 def load_local_model(repo_id: str, cache_dir: str):
-
-
-
+    try:
+        model = VitsModel.from_pretrained(repo_id, cache_dir=cache_dir)
+        tokenizer = AutoTokenizer.from_pretrained(repo_id, cache_dir=cache_dir)
+        return model, tokenizer
+    except OSError as missing_weights:
+        raise RuntimeError(
+            f"Model {repo_id} does not ship a supported checkpoint (pytorch_model.bin/model.safetensors)."
+            " Download the raw .pth manually and convert it to HF format, or pick another model."
+        ) from missing_weights


 @st.cache_resource(show_spinner=False)
 def load_speecht5_bundle(repo_id: str, cache_dir: str):
-
-
-
-
-
-
+    try:
+        processor = SpeechT5Processor.from_pretrained(repo_id, cache_dir=cache_dir)
+        model = SpeechT5ForTextToSpeech.from_pretrained(repo_id, cache_dir=cache_dir)
+        vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan", cache_dir=cache_dir)
+        speaker_embedding = _load_speecht5_speaker_embedding(cache_dir)
+        return processor, model, vocoder, speaker_embedding
+    except ImportError as imp_err:
+        raise RuntimeError(
+            "SpeechT5 needs optional deps (sentencepiece). Run `pip install sentencepiece` then restart the app."
+        ) from imp_err


 @st.cache_resource(show_spinner=False)
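A short usage sketch for the VITS loader above, assuming the registry's MSA entry points at facebook/mms-tts-ara (the repo id is not visible in this hunk):

import torch

model, tokenizer = load_local_model("facebook/mms-tts-ara", cache_dir="hf-cache")
inputs = tokenizer("مرحبا بالعالم", return_tensors="pt")
with torch.no_grad():
    # VitsModel returns the waveform directly; rate is model.config.sampling_rate
    waveform = model(**inputs).waveform[0].numpy()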
@@ -190,6 +238,74 @@ def load_kokoro_pipeline(lang_code: str):
     return KPipeline(lang_code=lang_code)


+def _load_speecht5_speaker_embedding(cache_dir: str) -> torch.Tensor:
+    """Load a speaker embedding for SpeechT5 without using dataset scripts.
+
+    If remote assets are unavailable, return a neutral 512-dim embedding.
+    """
+    # Try a known xvector file if available (no trust_remote_code)
+    try:
+        xvector_path = hf_hub_download(
+            repo_id="Matthijs/cmu-arctic-xvectors",
+            filename="validation/000000.xvector.npy",
+            repo_type="dataset",
+            cache_dir=cache_dir,
+        )
+        arr = np.load(xvector_path)
+        vector = torch.from_numpy(arr)
+        if vector.ndim == 1:
+            vector = vector.unsqueeze(0)
+        return vector
+    except Exception as err:
+        logger.warning("Speaker xvector file not accessible (%s); using neutral embedding.", err)
+
+    # Fallback: neutral speaker embedding (512 dims expected by SpeechT5)
+    neutral = torch.zeros((1, 512), dtype=torch.float32)
+    return neutral
+
+
+@st.cache_resource(show_spinner=False)
+def load_xtts_model(config_path: str, checkpoint_dir: str, vocab_path: str | None, device: str):
+    try:
+        from TTS.tts.configs.xtts_config import XttsConfig
+        from TTS.tts.models.xtts import Xtts
+    except ImportError as e:
+        raise RuntimeError(
+            "XTTS requires the Coqui TTS library. Install via `pip install TTS` and restart the app."
+        ) from e
+
+    cfg_path = Path(config_path)
+    voc_path = Path(vocab_path) if vocab_path else None
+    ckpt_dir = Path(checkpoint_dir)
+    if not cfg_path.exists():
+        raise RuntimeError(f"XTTS config.json not found at {cfg_path}")
+    if voc_path is not None and not voc_path.exists():
+        raise RuntimeError(f"XTTS vocab.json not found at {voc_path}")
+    if not ckpt_dir.exists():
+        raise RuntimeError(f"XTTS checkpoint directory not found at {ckpt_dir}")
+
+    config = XttsConfig()
+    config.load_json(str(cfg_path))
+    model = Xtts.init_from_config(config)
+    if voc_path is not None:
+        model.load_checkpoint(
+            config,
+            checkpoint_dir=str(ckpt_dir),
+            eval=True,
+            vocab_path=str(voc_path),
+        )
+    else:
+        model.load_checkpoint(
+            config,
+            checkpoint_dir=str(ckpt_dir),
+            eval=True,
+        )
+    if device == "cuda":
+        model.cuda()
+    model.eval()
+    return model
+
+
 def ensure_valid_tokens(token_batch: dict):
     seq_len = token_batch["input_ids"].shape[-1]
     if seq_len < 2:
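How the bundle and the speaker embedding above are typically combined: generate_speech is the standard transformers SpeechT5 API, and SpeechT5 emits 16 kHz audio; the repo id here is an assumption based on the registry description:

processor, model, vocoder, speaker_embedding = load_speecht5_bundle(
    "MBZUAI/speecht5_tts_clartts_ar", "hf-cache"  # assumed repo id, not from this diff
)
inputs = processor(text="السلام عليكم", return_tensors="pt")
# float32 tensor at 16 kHz, decoded through the HiFi-GAN vocoder
speech = model.generate_speech(inputs["input_ids"], speaker_embedding, vocoder=vocoder)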
@@ -258,6 +374,44 @@ if generate:
                 raise RuntimeError("Kokoro pipeline returned no audio. Try a different voice or text.")
             waveform = np.concatenate(audio_chunks).astype(np.float32)
             sr = model_meta.get("sample_rate", 24000)
+        elif model_meta["engine"] == "xtts":
+            model = load_xtts_model(
+                str(Path(xtts_config_path).expanduser()),
+                str(Path(xtts_checkpoint_dir).expanduser()),
+                str(Path(xtts_vocab_path).expanduser()),
+                device,
+            )
+            spk_path = Path(xtts_speaker_wav).expanduser()
+            if not spk_path.exists():
+                raise RuntimeError(f"Speaker WAV not found at {spk_path}")
+            try:
+                if 'xtts_use_synthesize' in locals() and xtts_use_synthesize and hasattr(model, 'synthesize'):
+                    out = model.synthesize(
+                        text,
+                        model.config,
+                        speaker_wav=str(spk_path),
+                        gpt_cond_len=int(xtts_gpt_cond_len),
+                        language=xtts_language,
+                        temperature=float(xtts_temperature),
+                    )
+                    wav = out.get("wav") if isinstance(out, dict) else out
+                    waveform = np.asarray(wav, dtype=np.float32)
+                    sr = 24000
+                else:
+                    gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=[str(spk_path)])
+                    out = model.inference(
+                        text,
+                        xtts_language,
+                        gpt_cond_latent,
+                        speaker_embedding,
+                        temperature=float(xtts_temperature),
+                    )
+                    waveform = np.asarray(out["wav"], dtype=np.float32)
+                    sr = 24000
+            except Exception as xtts_err:
+                raise RuntimeError(
+                    f"XTTS inference failed. Ensure config, vocab, checkpoint (.pth) and speaker WAV are correct. Error: {xtts_err}"
+                ) from xtts_err
         else:
             raise RuntimeError(f"Engine {model_meta['engine']} not supported locally")

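Both XTTS paths leave waveform as float32 at sr = 24000; a sketch of persisting it with the wav_write import from the first hunk (the int16 conversion is a common convention, not something this commit does):

import numpy as np
from scipy.io.wavfile import write as wav_write

# Scale [-1, 1] float audio to 16-bit PCM before writing
pcm16 = (np.clip(waveform, -1.0, 1.0) * 32767).astype(np.int16)
wav_write("output.wav", 24000, pcm16)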
@@ -279,9 +433,11 @@ if generate:
         logger.exception("Local inference failed for %s", model_id)
         if hosted_available:
             should_run_hosted = True
-            status_placeholder.warning(
+            status_placeholder.warning(
+                f"Local inference failed ({local_err}). The hosted Hugging Face Inference API will be used automatically when available."
+            )
         else:
-            status_placeholder.error("Local inference failed. Check the logs or try another model.")
+            status_placeholder.error(f"Local inference failed: {local_err}. Check the logs or try another model.")

     if not success and should_run_hosted and hosted_available:
         try:
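The hosted fallback gated by should_run_hosted lives outside this diff; a hedged sketch of what that call would look like with the InferenceClient import from the first hunk (the model id is illustrative):

from huggingface_hub import InferenceClient

client = InferenceClient()
# Returns raw audio bytes from the hosted text-to-speech endpoint
audio_bytes = client.text_to_speech("مرحبا بالعالم", model="facebook/mms-tts-ara")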