tianfengping.tfp committed
Commit 1bd43c9 · Parent: 7ae3e9e

check prompt

Files changed (1):
  1. app.py +93 -50
app.py CHANGED
@@ -34,38 +34,54 @@ import numpy
 sys.path.append('third_party/Matcha-TTS')
 os.system('export PYTHONPATH=third_party/Matcha-TTS')
 
-assets_dir = snapshot_download(
-    repo_id="tienfeng/prompt",
-    repo_type="dataset",
-)
-
 from huggingface_hub import hf_hub_download
 
-model_repo_id = "AIDC-AI/Marco-Voice"
-local_model = snapshot_download(
-    repo_id=model_repo_id,
-    repo_type="model"
-    # token=os.getenv("HF_TOKEN")
-)
-
-local_model_path = os.path.join(local_model, "marco_voice")
-local_model_path_enhenced = os.path.join(local_model, "marco_voice_enhenced")
-
-logo_path = hf_hub_download(
-    repo_id="tienfeng/prompt",
-    filename="logo2.png",
-    repo_type="dataset",
-)
-
-logo_path2 = hf_hub_download(
-    repo_id="tienfeng/prompt",
-    filename="logo.png",
-    repo_type="dataset",
-)
-
-tts_speakerminus = CosyVoiceTTS_speakerminus(model_dir=local_model_path)
-tts_sft = CosyVoiceTTS_speakerminus(model_dir=local_model_path_enhenced)
+# Download assets and logos first (these are small files)
+try:
+    assets_dir = snapshot_download(
+        repo_id="tienfeng/prompt",
+        repo_type="dataset",
+    )
+    logo_path = hf_hub_download(
+        repo_id="tienfeng/prompt",
+        filename="logo2.png",
+        repo_type="dataset",
+    )
+    logo_path2 = hf_hub_download(
+        repo_id="tienfeng/prompt",
+        filename="logo.png",
+        repo_type="dataset",
+    )
+except Exception as e:
+    print(f"Warning: Failed to download assets/logos: {e}")
+    assets_dir = None
+    logo_path = None
+    logo_path2 = None
+
+# Delay model download to avoid blocking startup
+model_repo_id = "AIDC-AI/Marco-Voice"
+local_model = None
+local_model_path = None
+local_model_path_enhenced = None
+
+def load_models():
+    """Load models lazily when needed"""
+    global local_model, local_model_path, local_model_path_enhenced
+    if local_model is None:
+        print("Downloading models...")
+        local_model = snapshot_download(
+            repo_id=model_repo_id,
+            repo_type="model"
+            # token=os.getenv("HF_TOKEN")
+        )
+        local_model_path = os.path.join(local_model, "marco_voice")
+        local_model_path_enhenced = os.path.join(local_model, "marco_voice_enhenced")
+        print("Models downloaded successfully")
+
+# Delay model loading to avoid blocking startup
+# Models will be loaded lazily when first used
+tts_speakerminus = None
+tts_sft = None
 
 text_prompt = {
     "翟佳宁": "这个节目就是把四个男嘉宾,四个女嘉宾放一个大别墅里让他们朝夕相处一整个月,月末选择心动的彼此。",
@@ -140,12 +156,17 @@ os.makedirs("./tmp", exist_ok=True)
 
 def generate_speech_speakerminus(tts_text, speed, speaker, key, ref_audio, ref_text):
     # import pdb;pdb.set_trace()
-    global tts_speakerminus_global
-    if 'tts_speakerminus_global' not in globals():
+    global tts_speakerminus_global, local_model_path
+    # Ensure models are downloaded
+    if local_model_path is None:
+        load_models()
+    if 'tts_speakerminus_global' not in globals() or tts_speakerminus_global is None:
         print("Loading CosyVoice (speakerminus) model...")
         tts_speakerminus_global = CosyVoiceTTS_speakerminus(model_dir=local_model_path)
 
     if not ref_audio and not ref_text:
+        if audio_prompt_path is None:
+            raise ValueError("Audio prompt path is not available. Please provide reference audio and text.")
         ref_text = text_prompt.get(speaker, "")
         speaker_audio_name = audio_prompt.get(speaker)
         if speaker_audio_name:
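As an aside on the `'tts_speakerminus_global' not in globals()` probe kept in this hunk: since the module now initialises its engine names to None, the cache could equally be a module-level name tested directly with `is None`. An illustrative alternative, reusing CosyVoiceTTS_speakerminus, load_models() and local_model_path from app.py (the helper name _get_speakerminus_tts is hypothetical, not the commit's code):

# Illustrative alternative, not the commit's code: keep the engine cache as a
# module-level name initialised to None instead of probing globals().
tts_speakerminus_global = None  # module-level cache

def _get_speakerminus_tts():
    """Return the cached TTS engine, constructing it on first use."""
    global tts_speakerminus_global
    if tts_speakerminus_global is None:
        if local_model_path is None:  # model paths are filled in by load_models()
            load_models()
        print("Loading CosyVoice (speakerminus) model...")
        tts_speakerminus_global = CosyVoiceTTS_speakerminus(model_dir=local_model_path)
    return tts_speakerminus_global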
@@ -179,16 +200,20 @@ def generate_speech_speakerminus(tts_text, speed, speaker, key, ref_audio, ref_text):
     ref_audio = load_wav(ref_audio, 16000)
     emo = {"Sad": "伤心", "Fearful": "恐惧", "Happy": "快乐", "Surprise": "惊喜", "Angry": "生气", "Jolliest": "戏谑"}
     # key="快乐"
+    emotion_file = "./emotion_info.pt"
+    if not os.path.exists(emotion_file):
+        raise FileNotFoundError(f"Emotion info file not found: {emotion_file}. Please ensure this file exists in the workspace.")
+    emotion_data = torch.load(emotion_file)
     if key in ["Angry", "Surprise", "Happy"]:
-        emotion_info = torch.load("./emotion_info.pt")["male005"][key]
+        emotion_info = emotion_data["male005"][key]
     elif key in ["Sad"]:
-        emotion_info = torch.load("./emotion_info.pt")["female005"][key]
+        emotion_info = emotion_data["female005"][key]
     elif key in ["Fearful"]:
-        emotion_info = torch.load("./emotion_info.pt")["female003"][key]
+        emotion_info = emotion_data["female003"][key]
     else:
-        emotion_info = torch.load("./emotion_info.pt")["male005"][key]
+        emotion_info = emotion_data["male005"][key]
 
-    sample_rate, full_audio = inference_zero_shot.inference_zero_shot(
+    sample_rate, full_audio = tts_speakerminus_global.inference_zero_shot(
         tts_text,
         prompt_text = ref_text,
         # speaker=speaker,
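The guarded torch.load added here still deserialises ./emotion_info.pt on every request. If that ever matters, the table can be memoised; a small sketch under that assumption (load_emotion_table is a hypothetical helper, not part of the commit):

# Illustrative sketch, not part of the commit: memoise the emotion embedding
# table so ./emotion_info.pt is deserialised once rather than per request.
import os
from functools import lru_cache

import torch

@lru_cache(maxsize=1)
def load_emotion_table(path="./emotion_info.pt"):
    """Load and cache the {speaker_id: {emotion: embedding}} table."""
    if not os.path.exists(path):
        raise FileNotFoundError(f"Emotion info file not found: {path}")
    return torch.load(path)

# usage inside a handler:
# emotion_info = load_emotion_table()["male005"][key]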
@@ -215,11 +240,16 @@ def generate_speech_speakerminus(tts_text, speed, speaker, key, ref_audio, ref_text):
 
 def generate_speech_sft(tts_text, speed, speaker, key, ref_audio, ref_text):
     # import pdb;pdb.set_trace()
-    global tts_sft_global
-    if 'tts_sft_global' not in globals():
+    global tts_sft_global, local_model_path_enhenced
+    # Ensure models are downloaded
+    if local_model_path_enhenced is None:
+        load_models()
+    if 'tts_sft_global' not in globals() or tts_sft_global is None:
         print("Loading CosyVoice (SFT enhanced) model...")
         tts_sft_global = CosyVoiceTTS_speakerminus(model_dir=local_model_path_enhenced)
     if not ref_audio and not ref_text:
+        if audio_prompt_path is None:
+            raise ValueError("Audio prompt path is not available. Please provide reference audio and text.")
         ref_text = text_prompt.get(speaker, "")
         speaker_audio_name = audio_prompt.get(speaker)
         if speaker_audio_name:
@@ -252,14 +282,18 @@ def generate_speech_sft(tts_text, speed, speaker, key, ref_audio, ref_text):
 
     emo = {"Sad": "伤心", "Fearful": "恐惧", "Happy": "快乐", "Surprise": "惊喜", "Angry": "生气", "Jolliest": "戏谑"}
     # key="快乐"
+    emotion_file = "./emotion_info.pt"
+    if not os.path.exists(emotion_file):
+        raise FileNotFoundError(f"Emotion info file not found: {emotion_file}. Please ensure this file exists in the workspace.")
+    emotion_data = torch.load(emotion_file)
     if key in ["Angry", "Surprise", "Happy"]:
-        emotion_info = torch.load("./emotion_info.pt")["male005"][key]
+        emotion_info = emotion_data["male005"][key]
     elif key in ["Sad"]:
-        emotion_info = torch.load("./emotion_info.pt")["female005"][key]
+        emotion_info = emotion_data["female005"][key]
     elif key in ["Fearful"]:
-        emotion_info = torch.load("./emotion_info.pt")["female003"][key]
+        emotion_info = emotion_data["female003"][key]
     else:
-        emotion_info = torch.load("./emotion_info.pt")["male005"][key]
+        emotion_info = emotion_data["male005"][key]
 
     sample_rate, full_audio = tts_sft_global.inference_zero_shot(
         tts_text,
@@ -793,17 +827,26 @@ def preload_models():
     """Pre-download models to cache (non-blocking for launch)"""
     import threading
     def _download():
-        print("Pre-downloading models to cache...")
-        snapshot_download(repo_id=model_repo_id, repo_type="model")
-        print("Model pre-download completed.")
+        try:
+            print("Pre-downloading models to cache...")
+            load_models()
+            print("Model pre-download completed.")
+        except Exception as e:
+            print(f"Warning: Model pre-download failed: {e}. Models will be loaded on first use.")
     threading.Thread(target=_download, daemon=True).start()
 
+# Start preloading models in background (non-blocking)
 preload_models()
 
 if __name__ == "__main__":
-    demo.launch(
-        server_name="0.0.0.0",
-        server_port=10163,
-        share=False,
-        favicon_path=logo_path2
-    )
+    # Use environment variable for port (Hugging Face Spaces uses 7860 by default)
+    server_port = int(os.environ.get("SERVER_PORT", 7860))
+    launch_kwargs = {
+        "server_name": "0.0.0.0",
+        "server_port": server_port,
+        "share": False,
+    }
+    # Only add favicon if it was successfully downloaded
+    if logo_path2 is not None:
+        launch_kwargs["favicon_path"] = logo_path2
+    demo.launch(**launch_kwargs)
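For the launch block above: SERVER_PORT is this app's own convention; Gradio itself also reads GRADIO_SERVER_NAME and GRADIO_SERVER_PORT from the environment when the corresponding arguments are omitted. A minimal sketch of the same keyword-argument assembly (build_launch_kwargs is a hypothetical helper, not part of the commit):

# Illustrative sketch, not part of the commit: the same launch-kwargs assembly
# as a helper. Gradio also honours GRADIO_SERVER_NAME / GRADIO_SERVER_PORT when
# server_name / server_port are not passed explicitly.
import os

def build_launch_kwargs(favicon_path=None):
    """Assemble demo.launch() keyword arguments, skipping a missing favicon."""
    kwargs = {
        "server_name": os.environ.get("GRADIO_SERVER_NAME", "0.0.0.0"),
        "server_port": int(os.environ.get("GRADIO_SERVER_PORT", 7860)),
        "share": False,
    }
    if favicon_path is not None:
        kwargs["favicon_path"] = favicon_path
    return kwargs

# demo.launch(**build_launch_kwargs(favicon_path=logo_path2))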
 
 
 
 
 
 