tianfengping.tfp committed · Commit 1bd43c9 · Parent(s): 7ae3e9e
check prompt
app.py CHANGED
@@ -34,38 +34,54 @@ import numpy
 sys.path.append('third_party/Matcha-TTS')
 os.system('export PYTHONPATH=third_party/Matcha-TTS')
 
-assets_dir = snapshot_download(
-    repo_id="tienfeng/prompt",
-    repo_type="dataset",
-)
-
 from huggingface_hub import hf_hub_download
 
-
-
-
-
-
-)
-
-
-
-
-
-
-
-
-
-)
+# Download assets and logos first (these are small files)
+try:
+    assets_dir = snapshot_download(
+        repo_id="tienfeng/prompt",
+        repo_type="dataset",
+    )
+    logo_path = hf_hub_download(
+        repo_id="tienfeng/prompt",
+        filename="logo2.png",
+        repo_type="dataset",
+    )
+    logo_path2 = hf_hub_download(
+        repo_id="tienfeng/prompt",
+        filename="logo.png",
+        repo_type="dataset",
+    )
+except Exception as e:
+    print(f"Warning: Failed to download assets/logos: {e}")
+    assets_dir = None
+    logo_path = None
+    logo_path2 = None
 
-
-
-
-
-
+# Delay model download to avoid blocking startup
+model_repo_id = "AIDC-AI/Marco-Voice"
+local_model = None
+local_model_path = None
+local_model_path_enhenced = None
+
+def load_models():
+    """Load models lazily when needed"""
+    global local_model, local_model_path, local_model_path_enhenced
+    if local_model is None:
+        print("Downloading models...")
+        local_model = snapshot_download(
+            repo_id=model_repo_id,
+            repo_type="model"
+            # token=os.getenv("HF_TOKEN")
+        )
+        local_model_path = os.path.join(local_model, "marco_voice")
+        local_model_path_enhenced = os.path.join(local_model, "marco_voice_enhenced")
+        print("Models downloaded successfully")
 
-
-
+# Delay model loading to avoid blocking startup
+# Models will be loaded lazily when first used
+tts_speakerminus = None
+tts_sft = None
 
 text_prompt = {
     "翟佳宁": "这个节目就是把四个男嘉宾,四个女嘉宾放一个大别墅里让他们朝夕相处一整个月,月末选择心动的彼此。",
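Note: the hunk above wraps the asset and logo downloads in try/except and falls back to None, so a transient Hub failure no longer aborts Space startup; downstream code then has to tolerate the None values (the launch hunk below only passes the favicon when logo_path2 is set). A minimal standalone sketch of that pattern (the helper name and the example filename are illustrative, not from the commit):

from huggingface_hub import hf_hub_download

def fetch_optional_asset(repo_id: str, filename: str) -> str | None:
    """Download a small asset from a dataset repo; return None instead of raising."""
    try:
        return hf_hub_download(repo_id=repo_id, filename=filename, repo_type="dataset")
    except Exception as exc:
        print(f"Warning: could not download {filename} from {repo_id}: {exc}")
        return None

# Hypothetical usage:
banner_path = fetch_optional_asset("tienfeng/prompt", "banner.png")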
@@ -140,12 +156,17 @@ os.makedirs("./tmp", exist_ok=True)
 
 def generate_speech_speakerminus(tts_text, speed, speaker, key, ref_audio, ref_text):
     # import pdb;pdb.set_trace()
-    global tts_speakerminus_global
-
+    global tts_speakerminus_global, local_model_path
+    # Ensure models are downloaded
+    if local_model_path is None:
+        load_models()
+    if 'tts_speakerminus_global' not in globals() or tts_speakerminus_global is None:
         print("Loading CosyVoice (speakerminus) model...")
         tts_speakerminus_global = CosyVoiceTTS_speakerminus(model_dir=local_model_path)
 
     if not ref_audio and not ref_text:
+        if audio_prompt_path is None:
+            raise ValueError("Audio prompt path is not available. Please provide reference audio and text.")
         ref_text = text_prompt.get(speaker, "")
         speaker_audio_name = audio_prompt.get(speaker)
         if speaker_audio_name:
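Note: the new guard `'tts_speakerminus_global' not in globals() or tts_speakerminus_global is None` turns the model into a load-once singleton, and the globals() membership test protects the first call in case tts_speakerminus_global has not been bound at module level (the module-level defaults added above use the shorter names tts_speakerminus and tts_sft). A simpler equivalent, sketched with placeholder names, is to predefine the cache variable as None:

_tts_engine = None  # placeholder module-level cache

def get_tts_engine(factory, model_dir):
    """Build the TTS engine on first use; `factory` stands in for CosyVoiceTTS_speakerminus."""
    global _tts_engine
    if _tts_engine is None:
        _tts_engine = factory(model_dir=model_dir)
    return _tts_engine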
@@ -179,16 +200,20 @@ def generate_speech_speakerminus(tts_text, speed, speaker, key, ref_audio, ref_text):
     ref_audio = load_wav(ref_audio, 16000)
     emo = {"Sad": "伤心", "Fearful": "恐惧", "Happy": "快乐", "Surprise": "惊喜", "Angry": "生气", "Jolliest": "戏谑"}
     # key="快乐"
+    emotion_file = "./emotion_info.pt"
+    if not os.path.exists(emotion_file):
+        raise FileNotFoundError(f"Emotion info file not found: {emotion_file}. Please ensure this file exists in the workspace.")
+    emotion_data = torch.load(emotion_file)
     if key in ["Angry", "Surprise", "Happy"]:
-        emotion_info =
+        emotion_info = emotion_data["male005"][key]
     elif key in ["Sad"]:
-        emotion_info =
+        emotion_info = emotion_data["female005"][key]
     elif key in ["Fearful"]:
-        emotion_info =
+        emotion_info = emotion_data["female003"][key]
     else:
-        emotion_info =
+        emotion_info = emotion_data["male005"][key]
 
-    sample_rate, full_audio =
+    sample_rate, full_audio = tts_speakerminus_global.inference_zero_shot(
         tts_text,
         prompt_text = ref_text,
         # speaker=speaker,
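Note: the emotion branches now load their embedding from a local emotion_info.pt file. Judging from the indexing emotion_data["male005"][key], the file appears to hold a nested mapping of speaker id to emotion name to embedding; a hedged helper sketch under that assumption (names are illustrative):

import os
import torch

def load_emotion_embedding(key: str, speaker_id: str = "male005", path: str = "./emotion_info.pt"):
    """Fetch one emotion embedding; assumes a {speaker_id: {emotion_name: tensor}} layout."""
    if not os.path.exists(path):
        raise FileNotFoundError(f"Emotion info file not found: {path}")
    data = torch.load(path)
    return data[speaker_id][key]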
@@ -215,11 +240,16 @@ def generate_speech_speakerminus(tts_text, speed, speaker, key, ref_audio, ref_text):
 
 def generate_speech_sft(tts_text, speed, speaker, key, ref_audio, ref_text):
     # import pdb;pdb.set_trace()
-    global tts_sft_global
-
+    global tts_sft_global, local_model_path_enhenced
+    # Ensure models are downloaded
+    if local_model_path_enhenced is None:
+        load_models()
+    if 'tts_sft_global' not in globals() or tts_sft_global is None:
         print("Loading CosyVoice (SFT enhanced) model...")
         tts_sft_global = CosyVoiceTTS_speakerminus(model_dir=local_model_path_enhenced)
     if not ref_audio and not ref_text:
+        if audio_prompt_path is None:
+            raise ValueError("Audio prompt path is not available. Please provide reference audio and text.")
         ref_text = text_prompt.get(speaker, "")
         speaker_audio_name = audio_prompt.get(speaker)
         if speaker_audio_name:
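Note: with the lazy loading above, the first call to either generate function triggers the snapshot_download of AIDC-AI/Marco-Voice and the model construction, so it can take a while. A hypothetical call with values that appear in the file (the speaker name comes from text_prompt and the emotion key from the emo mapping; the speed value, sample text, and return shape are assumed, since none of them is shown in this diff):

# Hypothetical first call; blocks while the model snapshot is downloaded.
result = generate_speech_sft(
    tts_text="今天天气真好。",
    speed=1.0,
    speaker="翟佳宁",
    key="Happy",
    ref_audio=None,
    ref_text=None,
)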
@@ -252,14 +282,18 @@ def generate_speech_sft(tts_text, speed, speaker, key, ref_audio, ref_text):
 
     emo = {"Sad": "伤心", "Fearful": "恐惧", "Happy": "快乐", "Surprise": "惊喜", "Angry": "生气", "Jolliest": "戏谑"}
     # key="快乐"
+    emotion_file = "./emotion_info.pt"
+    if not os.path.exists(emotion_file):
+        raise FileNotFoundError(f"Emotion info file not found: {emotion_file}. Please ensure this file exists in the workspace.")
+    emotion_data = torch.load(emotion_file)
     if key in ["Angry", "Surprise", "Happy"]:
-        emotion_info =
+        emotion_info = emotion_data["male005"][key]
     elif key in ["Sad"]:
-        emotion_info =
+        emotion_info = emotion_data["female005"][key]
     elif key in ["Fearful"]:
-        emotion_info =
+        emotion_info = emotion_data["female003"][key]
     else:
-        emotion_info =
+        emotion_info = emotion_data["male005"][key]
 
     sample_rate, full_audio = tts_sft_global.inference_zero_shot(
         tts_text,
@@ -793,17 +827,26 @@ def preload_models():
     """Pre-download models to cache (non-blocking for launch)"""
     import threading
     def _download():
-
-
-
+        try:
+            print("Pre-downloading models to cache...")
+            load_models()
+            print("Model pre-download completed.")
+        except Exception as e:
+            print(f"Warning: Model pre-download failed: {e}. Models will be loaded on first use.")
     threading.Thread(target=_download, daemon=True).start()
 
+# Start preloading models in background (non-blocking)
 preload_models()
 
 if __name__ == "__main__":
-
-
-
-
-
-
+    # Use environment variable for port (Hugging Face Spaces uses 7860 by default)
+    server_port = int(os.environ.get("SERVER_PORT", 7860))
+    launch_kwargs = {
+        "server_name": "0.0.0.0",
+        "server_port": server_port,
+        "share": False,
+    }
+    # Only add favicon if it was successfully downloaded
+    if logo_path2 is not None:
+        launch_kwargs["favicon_path"] = logo_path2
+    demo.launch(**launch_kwargs)
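Note: the new __main__ block binds to 0.0.0.0 and reads the port from the SERVER_PORT environment variable, defaulting to 7860, the port a Hugging Face Space expects, and it only passes favicon_path when the logo download succeeded. The background preload follows the same defensive idea: the daemon thread keeps the download from delaying launch, and a failure merely logs a warning because load_models() will retry lazily on first use. A condensed sketch of the launch logic with the Blocks object passed in explicitly (the function name is illustrative):

import os
import gradio as gr

def launch_app(demo: gr.Blocks, favicon: str | None = None) -> None:
    """Launch on the address/port a Hugging Face Space expects; favicon is optional."""
    kwargs = {
        "server_name": "0.0.0.0",
        "server_port": int(os.environ.get("SERVER_PORT", 7860)),
        "share": False,
    }
    if favicon is not None:
        kwargs["favicon_path"] = favicon
    demo.launch(**kwargs)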