Spaces:
Running
on
L4
Running
on
L4
update words
Browse files
app.py
CHANGED
@@ -59,9 +59,9 @@ from cosyvoice.cli.cosyvoice import CosyVoice2
|
|
59 |
from cosyvoice.utils.file_utils import load_wav, logging
|
60 |
from cosyvoice.utils.common import set_all_random_seed
|
61 |
|
62 |
-
inference_mode_list = ['3s
|
63 |
-
instruct_dict = {'3s
|
64 |
-
'
|
65 |
stream_mode_list = [('No', False), ('Yes', True)]
|
66 |
max_val = 0.8
|
67 |
|
@@ -107,25 +107,25 @@ def generate_audio(tts_text, mode_checkbox_group, prompt_text, prompt_wav_upload
|
|
107 |
else:
|
108 |
prompt_wav = None
|
109 |
# if instruct mode, please make sure that model is iic/CosyVoice-300M-Instruct and not cross_lingual mode
|
110 |
-
if mode_checkbox_group in ['
|
111 |
if instruct_text == '':
|
112 |
-
gr.Warning('You are using
|
113 |
yield (target_sr, default_data)
|
114 |
if prompt_wav is None:
|
115 |
-
gr.Info('You are using
|
116 |
# if cross_lingual mode, please make sure that model is iic/CosyVoice-300M and tts_text prompt_text are different language
|
117 |
-
if mode_checkbox_group in ['Cross-lingual
|
118 |
if cosyvoice.frontend.instruct is True:
|
119 |
-
gr.Warning('You are using the cross-lingual
|
120 |
yield (target_sr, default_data)
|
121 |
if instruct_text != '':
|
122 |
-
gr.Info('You are using the cross-lingual
|
123 |
if prompt_wav is None:
|
124 |
-
gr.Warning('You are using the cross-lingual
|
125 |
yield (target_sr, default_data)
|
126 |
-
gr.Info('You are using the cross-lingual
|
127 |
# if in zero_shot cross_lingual, please make sure that prompt_text and prompt_wav meets requirements
|
128 |
-
if mode_checkbox_group in ['3s
|
129 |
if prompt_wav is None:
|
130 |
gr.Warning('Empty prompt found, please check the prompt text.')
|
131 |
yield (target_sr, default_data)
|
@@ -137,12 +137,12 @@ def generate_audio(tts_text, mode_checkbox_group, prompt_text, prompt_wav_upload
|
|
137 |
if instruct_text != '' or prompt_wav is not None or prompt_text != '':
|
138 |
gr.Info('You are using Pretrained Voice mode. Pretrained Voice/Instruct will be ingnored.')
|
139 |
# zero_shot mode only use prompt_wav prompt text
|
140 |
-
if mode_checkbox_group in ['3s
|
141 |
if prompt_text == '':
|
142 |
gr.Warning('Empty prompt found, please check the prompt text.')
|
143 |
yield (target_sr, default_data)
|
144 |
if instruct_text != '':
|
145 |
-
gr.Info('You are using 3s
|
146 |
info = torchaudio.info(prompt_wav)
|
147 |
if info.num_frames / info.sample_rate > 10:
|
148 |
gr.Warning('Please use prompt audio shorter than 10s.')
|
@@ -153,13 +153,13 @@ def generate_audio(tts_text, mode_checkbox_group, prompt_text, prompt_wav_upload
|
|
153 |
set_all_random_seed(seed)
|
154 |
for i in cosyvoice.inference_sft(tts_text, sft_dropdown, stream=stream, speed=speed):
|
155 |
yield (target_sr, i['tts_speech'].numpy().flatten())
|
156 |
-
elif mode_checkbox_group == '3s
|
157 |
logging.info('get zero_shot inference request')
|
158 |
prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
|
159 |
set_all_random_seed(seed)
|
160 |
for i in cosyvoice.inference_zero_shot(tts_text, prompt_text, prompt_speech_16k, stream=stream, speed=speed):
|
161 |
yield (target_sr, i['tts_speech'].numpy().flatten())
|
162 |
-
elif mode_checkbox_group == 'Cross-lingual
|
163 |
logging.info('get cross_lingual inference request')
|
164 |
prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
|
165 |
set_all_random_seed(seed)
|
|
|
59 |
from cosyvoice.utils.file_utils import load_wav, logging
|
60 |
from cosyvoice.utils.common import set_all_random_seed
|
61 |
|
62 |
+
inference_mode_list = ['3s Voice Clone', 'Instructed Voice Generation']
|
63 |
+
instruct_dict = {'3s Voice Clone': '1. Upload prompt wav file (or record from mic), no longer than 30s, wav file will be used if provided at the same time\n2. Input prompt transcription\n3. click \'Speech Synthesis\' button',
|
64 |
+
'Instructed Voice Generation': '1. Upload prompt wav file (or record from mic), no longer than 30s, wav file will be used if provided at the same time\n2. Input instruct\n3. click \'Speech Synthesis\' button'}
|
65 |
stream_mode_list = [('No', False), ('Yes', True)]
|
66 |
max_val = 0.8
|
67 |
|
|
|
107 |
else:
|
108 |
prompt_wav = None
|
109 |
# if instruct mode, please make sure that model is iic/CosyVoice-300M-Instruct and not cross_lingual mode
|
110 |
+
if mode_checkbox_group in ['Instructed Voice Generation']:
|
111 |
if instruct_text == '':
|
112 |
+
gr.Warning('You are using Instructed Voice Generation mode, please input the instruct.')
|
113 |
yield (target_sr, default_data)
|
114 |
if prompt_wav is None:
|
115 |
+
gr.Info('You are using Instructed Voice Generation mode, please upload the prompt audio.')
|
116 |
# if cross_lingual mode, please make sure that model is iic/CosyVoice-300M and tts_text prompt_text are different language
|
117 |
+
if mode_checkbox_group in ['Cross-lingual Clone']:
|
118 |
if cosyvoice.frontend.instruct is True:
|
119 |
+
gr.Warning('You are using the cross-lingual Clone mode. The {} model does not support this mode. Please use the iic/CosyVoice-300M model.'.format(args.model_dir))
|
120 |
yield (target_sr, default_data)
|
121 |
if instruct_text != '':
|
122 |
+
gr.Info('You are using the cross-lingual Clone mode. The instruct text will be ignored.')
|
123 |
if prompt_wav is None:
|
124 |
+
gr.Warning('You are using the cross-lingual Clone mode. Please provide the prompt audio.')
|
125 |
yield (target_sr, default_data)
|
126 |
+
gr.Info('You are using the cross-lingual Clone mode. Please ensure that the synthesis text and prompt text are in different languages.')
|
127 |
# if in zero_shot cross_lingual, please make sure that prompt_text and prompt_wav meets requirements
|
128 |
+
if mode_checkbox_group in ['3s Voice Clone', 'Cross-lingual Clone']:
|
129 |
if prompt_wav is None:
|
130 |
gr.Warning('Empty prompt found, please check the prompt text.')
|
131 |
yield (target_sr, default_data)
|
|
|
137 |
if instruct_text != '' or prompt_wav is not None or prompt_text != '':
|
138 |
gr.Info('You are using Pretrained Voice mode. Pretrained Voice/Instruct will be ingnored.')
|
139 |
# zero_shot mode only use prompt_wav prompt text
|
140 |
+
if mode_checkbox_group in ['3s Voice Clone']:
|
141 |
if prompt_text == '':
|
142 |
gr.Warning('Empty prompt found, please check the prompt text.')
|
143 |
yield (target_sr, default_data)
|
144 |
if instruct_text != '':
|
145 |
+
gr.Info('You are using 3s Voice Clone mode. Pretrained Voice/Instruct will be ingnored.')
|
146 |
info = torchaudio.info(prompt_wav)
|
147 |
if info.num_frames / info.sample_rate > 10:
|
148 |
gr.Warning('Please use prompt audio shorter than 10s.')
|
|
|
153 |
set_all_random_seed(seed)
|
154 |
for i in cosyvoice.inference_sft(tts_text, sft_dropdown, stream=stream, speed=speed):
|
155 |
yield (target_sr, i['tts_speech'].numpy().flatten())
|
156 |
+
elif mode_checkbox_group == '3s Voice Clone':
|
157 |
logging.info('get zero_shot inference request')
|
158 |
prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
|
159 |
set_all_random_seed(seed)
|
160 |
for i in cosyvoice.inference_zero_shot(tts_text, prompt_text, prompt_speech_16k, stream=stream, speed=speed):
|
161 |
yield (target_sr, i['tts_speech'].numpy().flatten())
|
162 |
+
elif mode_checkbox_group == 'Cross-lingual Clone':
|
163 |
logging.info('get cross_lingual inference request')
|
164 |
prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
|
165 |
set_all_random_seed(seed)
|