translate
app.py CHANGED

@@ -59,10 +59,10 @@ from cosyvoice.cli.cosyvoice import CosyVoice2
 from cosyvoice.utils.file_utils import load_wav, logging
 from cosyvoice.utils.common import set_all_random_seed
 
-inference_mode_list = ['3s
-instruct_dict = {'3s
-'
-stream_mode_list = [('
+inference_mode_list = ['3s Speedy Conversion', 'Natural Language Control']
+instruct_dict = {'3s Speedy Conversion': '1. Upload a prompt wav file (or record one from the microphone), no longer than 30s; the uploaded file is used if both are provided\n2. Input the prompt transcription\n3. Click the \'Speech Synthesis\' button',
+                 'Natural Language Control': '1. Upload a prompt wav file (or record one from the microphone), no longer than 30s; the uploaded file is used if both are provided\n2. Input the instruct text\n3. Click the \'Speech Synthesis\' button'}
+stream_mode_list = [('No', False), ('Yes', True)]
 max_val = 0.8
 
 
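Note that `instruct_dict` maps each entry of `inference_mode_list` to the step-by-step instructions shown in the Instructions textbox created further down; the callback that refreshes that textbox when the mode radio changes is outside this diff. A minimal sketch of how such a refresh is typically wired in Gradio (the helper name `change_instruction` is an assumption, not part of this change):

# Sketch only, not part of this diff: refresh the Instructions textbox
# whenever the inference-mode radio changes.
def change_instruction(mode_checkbox_group):
    # Look up the operation steps for the selected inference mode.
    return instruct_dict[mode_checkbox_group]

# Typical wiring inside main(), after the components below are created:
# mode_checkbox_group.change(fn=change_instruction,
#                            inputs=[mode_checkbox_group],
#                            outputs=[instruction_text])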
@@ -107,66 +107,65 @@ def generate_audio(tts_text, mode_checkbox_group, prompt_text, prompt_wav_upload
     else:
         prompt_wav = None
     # if instruct mode, please make sure that model is iic/CosyVoice-300M-Instruct and not cross_lingual mode
-    if mode_checkbox_group in ['
+    if mode_checkbox_group in ['Natural Language Control']:
         if instruct_text == '':
-            gr.Warning('
+            gr.Warning('You are using Natural Language Control mode, please input the instruct text.')
             yield (target_sr, default_data)
         if prompt_wav is None:
-            gr.Info('
+            gr.Info('You are using Natural Language Control mode, please upload the prompt audio.')
     # if cross_lingual mode, please make sure that model is iic/CosyVoice-300M and tts_text prompt_text are different language
-    if mode_checkbox_group in ['
+    if mode_checkbox_group in ['Cross-lingual Conversion']:
         if cosyvoice.frontend.instruct is True:
-            gr.Warning('
+            gr.Warning('You are using Cross-lingual Conversion mode. The {} model does not support this mode, please use the iic/CosyVoice-300M model.'.format(args.model_dir))
             yield (target_sr, default_data)
         if instruct_text != '':
-            gr.Info('
+            gr.Info('You are using Cross-lingual Conversion mode, the instruct text will be ignored.')
         if prompt_wav is None:
-            gr.Warning('
+            gr.Warning('You are using Cross-lingual Conversion mode, please provide the prompt audio.')
             yield (target_sr, default_data)
-        gr.Info('
+        gr.Info('You are using Cross-lingual Conversion mode, please ensure that the synthesis text and the prompt text are in different languages.')
     # if in zero_shot cross_lingual, please make sure that prompt_text and prompt_wav meets requirements
-    if mode_checkbox_group in ['3s
+    if mode_checkbox_group in ['3s Speedy Conversion', 'Cross-lingual Conversion']:
         if prompt_wav is None:
-            gr.Warning('prompt
+            gr.Warning('Empty prompt audio found, please provide the prompt audio.')
             yield (target_sr, default_data)
         if torchaudio.info(prompt_wav).sample_rate < prompt_sr:
-            gr.Warning('prompt
+            gr.Warning('Prompt wav sample rate {} is lower than {}.'.format(torchaudio.info(prompt_wav).sample_rate, prompt_sr))
             yield (target_sr, default_data)
     # sft mode only use sft_dropdown
-    if mode_checkbox_group in ['
+    if mode_checkbox_group in ['Pretrained Voice']:
         if instruct_text != '' or prompt_wav is not None or prompt_text != '':
-            gr.Info('
+            gr.Info('You are using Pretrained Voice mode, the prompt text/prompt audio/instruct text will be ignored.')
     # zero_shot mode only use prompt_wav prompt text
-    if mode_checkbox_group in ['3s
+    if mode_checkbox_group in ['3s Speedy Conversion']:
         if prompt_text == '':
-            gr.Warning('prompt
+            gr.Warning('Empty prompt text found, please check the prompt text.')
             yield (target_sr, default_data)
         if instruct_text != '':
-            gr.Info('
+            gr.Info('You are using 3s Speedy Conversion mode, the pretrained voice/instruct text will be ignored.')
         info = torchaudio.info(prompt_wav)
         if info.num_frames / info.sample_rate > 10:
-            gr.Warning('
+            gr.Warning('Please use a prompt audio shorter than 10s.')
             yield (target_sr, default_data)
 
-    if mode_checkbox_group == '
+    if mode_checkbox_group == 'Pretrained Voice':
         logging.info('get sft inference request')
         set_all_random_seed(seed)
         for i in cosyvoice.inference_sft(tts_text, sft_dropdown, stream=stream, speed=speed):
             yield (target_sr, i['tts_speech'].numpy().flatten())
-    elif mode_checkbox_group == '3s
+    elif mode_checkbox_group == '3s Speedy Conversion':
         logging.info('get zero_shot inference request')
         prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
         set_all_random_seed(seed)
         for i in cosyvoice.inference_zero_shot(tts_text, prompt_text, prompt_speech_16k, stream=stream, speed=speed):
             yield (target_sr, i['tts_speech'].numpy().flatten())
-    elif mode_checkbox_group == '
+    elif mode_checkbox_group == 'Cross-lingual Conversion':
         logging.info('get cross_lingual inference request')
         prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
         set_all_random_seed(seed)
         for i in cosyvoice.inference_cross_lingual(tts_text, prompt_speech_16k, stream=stream, speed=speed):
             yield (target_sr, i['tts_speech'].numpy().flatten())
     else:
-        logging.info('get instruct inference request')
         logging.info('get instruct inference request')
         prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
         set_all_random_seed(seed)
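The branches above call the CosyVoice2 inference generators directly; each yielded item is a dict whose 'tts_speech' tensor is flattened into the audio stream. For reference, a minimal sketch of driving the same zero-shot path outside the web UI (the model path, file names, 16 kHz prompt rate, and 24000 Hz output rate are assumptions, not taken from this diff):

import torchaudio
from cosyvoice.cli.cosyvoice import CosyVoice2
from cosyvoice.utils.file_utils import load_wav

cosyvoice = CosyVoice2('pretrained_models/CosyVoice2-0.5B')  # model directory is an assumption
prompt_speech_16k = load_wav('prompt.wav', 16000)            # prompts loaded at 16 kHz, as generate_audio does

# Mirrors the '3s Speedy Conversion' branch above.
for idx, out in enumerate(cosyvoice.inference_zero_shot('Text to synthesize.',
                                                        'Transcription of prompt.wav.',
                                                        prompt_speech_16k,
                                                        stream=False,
                                                        speed=1.0)):
    # Each item holds a 'tts_speech' tensor of shape (1, samples).
    torchaudio.save(f'zero_shot_{idx}.wav', out['tts_speech'], 24000)  # output rate assumed to be 24 kHz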
@@ -176,31 +175,31 @@ def generate_audio(tts_text, mode_checkbox_group, prompt_text, prompt_wav_upload
 
 def main():
     with gr.Blocks() as demo:
-        gr.Markdown("###
-
+        gr.Markdown("### Repo [CosyVoice](https://github.com/FunAudioLLM/CosyVoice) \
+                    Pretrained Model [CosyVoice2-0.5B](https://www.modelscope.cn/models/iic/CosyVoice2-0.5B) \
                    [CosyVoice-300M](https://www.modelscope.cn/models/iic/CosyVoice-300M) \
                    [CosyVoice-300M-Instruct](https://www.modelscope.cn/models/iic/CosyVoice-300M-Instruct) \
                    [CosyVoice-300M-SFT](https://www.modelscope.cn/models/iic/CosyVoice-300M-SFT)")
-        gr.Markdown("####
+        gr.Markdown("#### Please input the text to synthesize, choose the inference mode, and follow the steps below.")
 
-        tts_text = gr.Textbox(label="
+        tts_text = gr.Textbox(label="Text to synthesize", lines=1, value="CosyVoice is undergoing a comprehensive upgrade, providing more accurate, stable, faster, and better voice generation capabilities. CosyVoice迎来全面升级，提供更准、更稳、更快、 更好的语音生成能力。")
         with gr.Row():
-            mode_checkbox_group = gr.Radio(choices=inference_mode_list, label='
-            instruction_text = gr.Text(label="
-            stream = gr.Radio(choices=stream_mode_list, label='
+            mode_checkbox_group = gr.Radio(choices=inference_mode_list, label='Inference Mode', value=inference_mode_list[0])
+            instruction_text = gr.Text(label="Instructions", value=instruct_dict[inference_mode_list[0]], scale=0.5)
+            stream = gr.Radio(choices=stream_mode_list, label='Streaming inference', value=stream_mode_list[0][1])
             with gr.Column(scale=0.25):
                 seed_button = gr.Button(value="\U0001F3B2")
-                seed = gr.Number(value=0, label="
+                seed = gr.Number(value=0, label="Random Seed")
 
         with gr.Row():
-            prompt_wav_upload = gr.Audio(sources='upload', type='filepath', label='
-            prompt_wav_record = gr.Audio(sources='microphone', type='filepath', label='
-            prompt_text = gr.Textbox(label="
-            instruct_text = gr.Textbox(label="
+            prompt_wav_upload = gr.Audio(sources='upload', type='filepath', label='Prompt wav file (sample rate >= 16kHz)')
+            prompt_wav_record = gr.Audio(sources='microphone', type='filepath', label='Record prompt from your microphone')
+            prompt_text = gr.Textbox(label="Prompt Transcription", lines=1, placeholder="Prompt transcription (auto ASR; you can correct the recognition result)", value='')
+            instruct_text = gr.Textbox(label="Instruct", lines=1, placeholder="Instruct text, e.g. An old sea captain navigates life's storms with timeless wisdom and a heart of gold.", value='')
 
-        generate_button = gr.Button("
+        generate_button = gr.Button("Speech Synthesis")
 
-        audio_output = gr.Audio(label="
+        audio_output = gr.Audio(label="Audio Output", autoplay=True, streaming=True)
 
         seed_button.click(generate_seed, inputs=[], outputs=seed)
         generate_button.click(generate_audio,
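Because audio_output is declared with streaming=True and autoplay=True, Gradio plays the (sample_rate, numpy_array) chunks yielded by generate_audio as they arrive. A self-contained toy example of that output contract (the sine-wave source and all names here are invented for illustration and are not part of app.py):

import time
import numpy as np
import gradio as gr

def tone_chunks():
    # Yield (sample_rate, ndarray) tuples, the same contract generate_audio uses.
    sr = 16000
    for k in range(5):
        t = np.arange(sr // 2) / sr                            # 0.5 s per chunk
        chunk = 0.3 * np.sin(2 * np.pi * 440 * (k + 1) * t)    # rising tone per chunk
        yield (sr, (chunk * 32767).astype(np.int16))
        time.sleep(0.1)                                        # pretend each chunk took time to generate

with gr.Blocks() as demo:
    btn = gr.Button("Play")
    out = gr.Audio(label="Audio Output", autoplay=True, streaming=True)
    btn.click(tone_chunks, inputs=[], outputs=out)

# demo.launch()  # uncomment to serve the toy demo locally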
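The prompt checks in generate_audio rely on torchaudio.info for both the minimum sample rate and the 10-second length cap. The same checks, factored into a small standalone helper for reuse (the helper itself is not part of app.py; prompt_sr defaults to the 16 kHz prompts used above):

import torchaudio

def check_prompt_wav(path: str, prompt_sr: int = 16000, max_seconds: float = 10.0) -> bool:
    """Return True if a prompt wav satisfies the same constraints generate_audio enforces."""
    info = torchaudio.info(path)
    if info.sample_rate < prompt_sr:
        print(f'prompt wav sample rate {info.sample_rate} is lower than {prompt_sr}')
        return False
    if info.num_frames / info.sample_rate > max_seconds:
        print(f'please use a prompt audio shorter than {max_seconds:.0f}s')
        return False
    return True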