Spaces:
Configuration error
Configuration error
Update webui.py
Browse files
webui.py
CHANGED
|
@@ -66,6 +66,7 @@ def change_instruction(mode_checkbox_group):
|
|
| 66 |
@spaces.GPU
|
| 67 |
def generate_audio(tts_text, mode_checkbox_group, sft_dropdown, prompt_text, prompt_wav_upload, prompt_wav_record, instruct_text,
|
| 68 |
seed, stream, speed):
|
|
|
|
| 69 |
if prompt_wav_upload is not None:
|
| 70 |
prompt_wav = prompt_wav_upload
|
| 71 |
elif prompt_wav_record is not None:
|
|
@@ -76,31 +77,31 @@ def generate_audio(tts_text, mode_checkbox_group, sft_dropdown, prompt_text, pro
|
|
| 76 |
if mode_checkbox_group in ['自然语言控制']:
|
| 77 |
if get_cosyvoice().frontend.instruct is False:
|
| 78 |
gr.Warning('您正在使用自然语言控制模式, {}模型不支持此模式, 请使用iic/CosyVoice-300M-Instruct模型'.format(args.model_dir))
|
| 79 |
-
yield (target_sr, default_data)
|
| 80 |
if instruct_text == '':
|
| 81 |
gr.Warning('您正在使用自然语言控制模式, 请输入instruct文本')
|
| 82 |
-
yield (target_sr, default_data)
|
| 83 |
if prompt_wav is not None or prompt_text != '':
|
| 84 |
gr.Info('您正在使用自然语言控制模式, prompt音频/prompt文本会被忽略')
|
| 85 |
# if cross_lingual mode, please make sure that model is iic/CosyVoice-300M and tts_text prompt_text are different language
|
| 86 |
if mode_checkbox_group in ['跨语种复刻']:
|
| 87 |
if get_cosyvoice().frontend.instruct is True:
|
| 88 |
gr.Warning('您正在使用跨语种复刻模式, {}模型不支持此模式, 请使用iic/CosyVoice-300M模型'.format(args.model_dir))
|
| 89 |
-
yield (target_sr, default_data)
|
| 90 |
if instruct_text != '':
|
| 91 |
gr.Info('您正在使用跨语种复刻模式, instruct文本会被忽略')
|
| 92 |
if prompt_wav is None:
|
| 93 |
gr.Warning('您正在使用跨语种复刻模式, 请提供prompt音频')
|
| 94 |
-
yield (target_sr, default_data)
|
| 95 |
gr.Info('您正在使用跨语种复刻模式, 请确保合成文本和prompt文本为不同语言')
|
| 96 |
# if in zero_shot cross_lingual, please make sure that prompt_text and prompt_wav meets requirements
|
| 97 |
if mode_checkbox_group in ['3s极速复刻', '跨语种复刻']:
|
| 98 |
if prompt_wav is None:
|
| 99 |
gr.Warning('prompt音频为空,您是否忘记输入prompt音频?')
|
| 100 |
-
yield (target_sr, default_data)
|
| 101 |
if torchaudio.info(prompt_wav).sample_rate < prompt_sr:
|
| 102 |
gr.Warning('prompt音频采样率{}低于{}'.format(torchaudio.info(prompt_wav).sample_rate, prompt_sr))
|
| 103 |
-
yield (target_sr, default_data)
|
| 104 |
# sft mode only use sft_dropdown
|
| 105 |
if mode_checkbox_group in ['预训练音色']:
|
| 106 |
if instruct_text != '' or prompt_wav is not None or prompt_text != '':
|
|
@@ -109,7 +110,7 @@ def generate_audio(tts_text, mode_checkbox_group, sft_dropdown, prompt_text, pro
|
|
| 109 |
if mode_checkbox_group in ['3s极速复刻']:
|
| 110 |
if prompt_text == '':
|
| 111 |
gr.Warning('prompt文本为空,您是否忘记输入prompt文本?')
|
| 112 |
-
yield (target_sr, default_data)
|
| 113 |
if instruct_text != '':
|
| 114 |
gr.Info('您正在使用3s极速复刻模式,预训练音色/instruct文本会被忽略!')
|
| 115 |
|
|
@@ -117,24 +118,24 @@ def generate_audio(tts_text, mode_checkbox_group, sft_dropdown, prompt_text, pro
|
|
| 117 |
logging.info('get sft inference request')
|
| 118 |
set_all_random_seed(seed)
|
| 119 |
for i in get_cosyvoice().inference_sft(tts_text, sft_dropdown, stream=stream, speed=speed):
|
| 120 |
-
yield (target_sr, i['tts_speech'].numpy().flatten())
|
| 121 |
elif mode_checkbox_group == '3s极速复刻':
|
| 122 |
logging.info('get zero_shot inference request')
|
| 123 |
prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
|
| 124 |
set_all_random_seed(seed)
|
| 125 |
for i in get_cosyvoice().inference_zero_shot(tts_text, prompt_text, prompt_speech_16k, stream=stream, speed=speed):
|
| 126 |
-
yield (target_sr, i['tts_speech'].numpy().flatten())
|
| 127 |
elif mode_checkbox_group == '跨语种复刻':
|
| 128 |
logging.info('get cross_lingual inference request')
|
| 129 |
prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
|
| 130 |
set_all_random_seed(seed)
|
| 131 |
for i in get_cosyvoice().inference_cross_lingual(tts_text, prompt_speech_16k, stream=stream, speed=speed):
|
| 132 |
-
yield (target_sr, i['tts_speech'].numpy().flatten())
|
| 133 |
else:
|
| 134 |
logging.info('get instruct inference request')
|
| 135 |
set_all_random_seed(seed)
|
| 136 |
for i in get_cosyvoice().inference_instruct(tts_text, sft_dropdown, instruct_text, stream=stream, speed=speed):
|
| 137 |
-
yield (target_sr, i['tts_speech'].numpy().flatten())
|
| 138 |
|
| 139 |
# SDK模型下载
|
| 140 |
import platform
|
|
@@ -214,7 +215,7 @@ with gr.Blocks() as demo:
|
|
| 214 |
|
| 215 |
generate_button = gr.Button("生成音频")
|
| 216 |
|
| 217 |
-
audio_output = gr.Audio(label="合成音频", autoplay=True, streaming=True)
|
| 218 |
|
| 219 |
seed_button.click(generate_seed, inputs=[], outputs=seed)
|
| 220 |
generate_button.click(generate_audio,
|
|
|
|
| 66 |
@spaces.GPU
|
| 67 |
def generate_audio(tts_text, mode_checkbox_group, sft_dropdown, prompt_text, prompt_wav_upload, prompt_wav_record, instruct_text,
|
| 68 |
seed, stream, speed):
|
| 69 |
+
stream=False
|
| 70 |
if prompt_wav_upload is not None:
|
| 71 |
prompt_wav = prompt_wav_upload
|
| 72 |
elif prompt_wav_record is not None:
|
|
|
|
| 77 |
if mode_checkbox_group in ['自然语言控制']:
|
| 78 |
if get_cosyvoice().frontend.instruct is False:
|
| 79 |
gr.Warning('您正在使用自然语言控制模式, {}模型不支持此模式, 请使用iic/CosyVoice-300M-Instruct模型'.format(args.model_dir))
|
| 80 |
+
return (target_sr, default_data)
|
| 81 |
if instruct_text == '':
|
| 82 |
gr.Warning('您正在使用自然语言控制模式, 请输入instruct文本')
|
| 83 |
+
return (target_sr, default_data)
|
| 84 |
if prompt_wav is not None or prompt_text != '':
|
| 85 |
gr.Info('您正在使用自然语言控制模式, prompt音频/prompt文本会被忽略')
|
| 86 |
# if cross_lingual mode, please make sure that model is iic/CosyVoice-300M and tts_text prompt_text are different language
|
| 87 |
if mode_checkbox_group in ['跨语种复刻']:
|
| 88 |
if get_cosyvoice().frontend.instruct is True:
|
| 89 |
gr.Warning('您正在使用跨语种复刻模式, {}模型不支持此模式, 请使用iic/CosyVoice-300M模型'.format(args.model_dir))
|
| 90 |
+
return (target_sr, default_data)
|
| 91 |
if instruct_text != '':
|
| 92 |
gr.Info('您正在使用跨语种复刻模式, instruct文本会被忽略')
|
| 93 |
if prompt_wav is None:
|
| 94 |
gr.Warning('您正在使用跨语种复刻模式, 请提供prompt音频')
|
| 95 |
+
return (target_sr, default_data)
|
| 96 |
gr.Info('您正在使用跨语种复刻模式, 请确保合成文本和prompt文本为不同语言')
|
| 97 |
# if in zero_shot cross_lingual, please make sure that prompt_text and prompt_wav meets requirements
|
| 98 |
if mode_checkbox_group in ['3s极速复刻', '跨语种复刻']:
|
| 99 |
if prompt_wav is None:
|
| 100 |
gr.Warning('prompt音频为空,您是否忘记输入prompt音频?')
|
| 101 |
+
return (target_sr, default_data)
|
| 102 |
if torchaudio.info(prompt_wav).sample_rate < prompt_sr:
|
| 103 |
gr.Warning('prompt音频采样率{}低于{}'.format(torchaudio.info(prompt_wav).sample_rate, prompt_sr))
|
| 104 |
+
return (target_sr, default_data)
|
| 105 |
# sft mode only use sft_dropdown
|
| 106 |
if mode_checkbox_group in ['预训练音色']:
|
| 107 |
if instruct_text != '' or prompt_wav is not None or prompt_text != '':
|
|
|
|
| 110 |
if mode_checkbox_group in ['3s极速复刻']:
|
| 111 |
if prompt_text == '':
|
| 112 |
gr.Warning('prompt文本为空,您是否忘记输入prompt文本?')
|
| 113 |
+
return (target_sr, default_data)
|
| 114 |
if instruct_text != '':
|
| 115 |
gr.Info('您正在使用3s极速复刻模式,预训练音色/instruct文本会被忽略!')
|
| 116 |
|
|
|
|
| 118 |
logging.info('get sft inference request')
|
| 119 |
set_all_random_seed(seed)
|
| 120 |
for i in get_cosyvoice().inference_sft(tts_text, sft_dropdown, stream=stream, speed=speed):
|
| 121 |
+
return (target_sr, i['tts_speech'].numpy().flatten())
|
| 122 |
elif mode_checkbox_group == '3s极速复刻':
|
| 123 |
logging.info('get zero_shot inference request')
|
| 124 |
prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
|
| 125 |
set_all_random_seed(seed)
|
| 126 |
for i in get_cosyvoice().inference_zero_shot(tts_text, prompt_text, prompt_speech_16k, stream=stream, speed=speed):
|
| 127 |
+
return (target_sr, i['tts_speech'].numpy().flatten())
|
| 128 |
elif mode_checkbox_group == '跨语种复刻':
|
| 129 |
logging.info('get cross_lingual inference request')
|
| 130 |
prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
|
| 131 |
set_all_random_seed(seed)
|
| 132 |
for i in get_cosyvoice().inference_cross_lingual(tts_text, prompt_speech_16k, stream=stream, speed=speed):
|
| 133 |
+
return (target_sr, i['tts_speech'].numpy().flatten())
|
| 134 |
else:
|
| 135 |
logging.info('get instruct inference request')
|
| 136 |
set_all_random_seed(seed)
|
| 137 |
for i in get_cosyvoice().inference_instruct(tts_text, sft_dropdown, instruct_text, stream=stream, speed=speed):
|
| 138 |
+
return (target_sr, i['tts_speech'].numpy().flatten())
|
| 139 |
|
| 140 |
# SDK模型下载
|
| 141 |
import platform
|
|
|
|
| 215 |
|
| 216 |
generate_button = gr.Button("生成音频")
|
| 217 |
|
| 218 |
+
audio_output = gr.Audio(label="合成音频", autoplay=True, streaming=False)
|
| 219 |
|
| 220 |
seed_button.click(generate_seed, inputs=[], outputs=seed)
|
| 221 |
generate_button.click(generate_audio,
|