R1ckShi committed on
Commit
b1769de
1 Parent(s): da0a385

update words

Browse files
Files changed (1) hide show
  1. app.py +16 -16
app.py CHANGED
@@ -59,9 +59,9 @@ from cosyvoice.cli.cosyvoice import CosyVoice2
59
  from cosyvoice.utils.file_utils import load_wav, logging
60
  from cosyvoice.utils.common import set_all_random_seed
61
 
62
- inference_mode_list = ['3s Speedy Convertion', 'Natural Language Control']
63
- instruct_dict = {'3s Speedy Convertion': '1. Upload prompt wav file (or record from mic), no longer than 30s, wav file will be used if provided at the same time\n2. Input prompt transcription\n3. click \'Speech Synthesis\' button',
64
- 'Natural Language Control': '1. Upload prompt wav file (or record from mic), no longer than 30s, wav file will be used if provided at the same time\n2. Input instruct\n3. click \'Speech Synthesis\' button'}
65
  stream_mode_list = [('No', False), ('Yes', True)]
66
  max_val = 0.8
67
 
@@ -107,25 +107,25 @@ def generate_audio(tts_text, mode_checkbox_group, prompt_text, prompt_wav_upload
107
  else:
108
  prompt_wav = None
109
  # if instruct mode, please make sure that model is iic/CosyVoice-300M-Instruct and not cross_lingual mode
110
- if mode_checkbox_group in ['Natural Language Control']:
111
  if instruct_text == '':
112
- gr.Warning('You are using Natural Language Control mode, please input the instruct.')
113
  yield (target_sr, default_data)
114
  if prompt_wav is None:
115
- gr.Info('You are using Natural Language Control mode, please upload the prompt audio.')
116
  # if cross_lingual mode, please make sure that model is iic/CosyVoice-300M and tts_text prompt_text are different language
117
- if mode_checkbox_group in ['Cross-lingual Convertion']:
118
  if cosyvoice.frontend.instruct is True:
119
- gr.Warning('You are using the cross-lingual Convertion mode. The {} model does not support this mode. Please use the iic/CosyVoice-300M model.'.format(args.model_dir))
120
  yield (target_sr, default_data)
121
  if instruct_text != '':
122
- gr.Info('You are using the cross-lingual Convertion mode. The instruct text will be ignored.')
123
  if prompt_wav is None:
124
- gr.Warning('You are using the cross-lingual Convertion mode. Please provide the prompt audio.')
125
  yield (target_sr, default_data)
126
- gr.Info('You are using the cross-lingual Convertion mode. Please ensure that the synthesis text and prompt text are in different languages.')
127
  # if in zero_shot cross_lingual, please make sure that prompt_text and prompt_wav meets requirements
128
- if mode_checkbox_group in ['3s Speedy Convertion', 'Cross-lingual Convertion']:
129
  if prompt_wav is None:
130
  gr.Warning('Empty prompt found, please check the prompt text.')
131
  yield (target_sr, default_data)
@@ -137,12 +137,12 @@ def generate_audio(tts_text, mode_checkbox_group, prompt_text, prompt_wav_upload
137
  if instruct_text != '' or prompt_wav is not None or prompt_text != '':
138
  gr.Info('You are using Pretrained Voice mode. Pretrained Voice/Instruct will be ingnored.')
139
  # zero_shot mode only use prompt_wav prompt text
140
- if mode_checkbox_group in ['3s Speedy Convertion']:
141
  if prompt_text == '':
142
  gr.Warning('Empty prompt found, please check the prompt text.')
143
  yield (target_sr, default_data)
144
  if instruct_text != '':
145
- gr.Info('You are using 3s Speedy Convertion mode. Pretrained Voice/Instruct will be ingnored.')
146
  info = torchaudio.info(prompt_wav)
147
  if info.num_frames / info.sample_rate > 10:
148
  gr.Warning('Please use prompt audio shorter than 10s.')
@@ -153,13 +153,13 @@ def generate_audio(tts_text, mode_checkbox_group, prompt_text, prompt_wav_upload
153
  set_all_random_seed(seed)
154
  for i in cosyvoice.inference_sft(tts_text, sft_dropdown, stream=stream, speed=speed):
155
  yield (target_sr, i['tts_speech'].numpy().flatten())
156
- elif mode_checkbox_group == '3s Speedy Convertion':
157
  logging.info('get zero_shot inference request')
158
  prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
159
  set_all_random_seed(seed)
160
  for i in cosyvoice.inference_zero_shot(tts_text, prompt_text, prompt_speech_16k, stream=stream, speed=speed):
161
  yield (target_sr, i['tts_speech'].numpy().flatten())
162
- elif mode_checkbox_group == 'Cross-lingual Convertion':
163
  logging.info('get cross_lingual inference request')
164
  prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
165
  set_all_random_seed(seed)
 
59
  from cosyvoice.utils.file_utils import load_wav, logging
60
  from cosyvoice.utils.common import set_all_random_seed
61
 
62
+ inference_mode_list = ['3s Voice Clone', 'Instructed Voice Generation']
63
+ instruct_dict = {'3s Voice Clone': '1. Upload prompt wav file (or record from mic), no longer than 30s, wav file will be used if provided at the same time\n2. Input prompt transcription\n3. click \'Speech Synthesis\' button',
64
+ 'Instructed Voice Generation': '1. Upload prompt wav file (or record from mic), no longer than 30s, wav file will be used if provided at the same time\n2. Input instruct\n3. click \'Speech Synthesis\' button'}
65
  stream_mode_list = [('No', False), ('Yes', True)]
66
  max_val = 0.8
67
 
 
107
  else:
108
  prompt_wav = None
109
  # if instruct mode, please make sure that model is iic/CosyVoice-300M-Instruct and not cross_lingual mode
110
+ if mode_checkbox_group in ['Instructed Voice Generation']:
111
  if instruct_text == '':
112
+ gr.Warning('You are using Instructed Voice Generation mode, please input the instruct.')
113
  yield (target_sr, default_data)
114
  if prompt_wav is None:
115
+ gr.Info('You are using Instructed Voice Generation mode, please upload the prompt audio.')
116
  # if cross_lingual mode, please make sure that model is iic/CosyVoice-300M and tts_text prompt_text are different language
117
+ if mode_checkbox_group in ['Cross-lingual Clone']:
118
  if cosyvoice.frontend.instruct is True:
119
+ gr.Warning('You are using the cross-lingual Clone mode. The {} model does not support this mode. Please use the iic/CosyVoice-300M model.'.format(args.model_dir))
120
  yield (target_sr, default_data)
121
  if instruct_text != '':
122
+ gr.Info('You are using the cross-lingual Clone mode. The instruct text will be ignored.')
123
  if prompt_wav is None:
124
+ gr.Warning('You are using the cross-lingual Clone mode. Please provide the prompt audio.')
125
  yield (target_sr, default_data)
126
+ gr.Info('You are using the cross-lingual Clone mode. Please ensure that the synthesis text and prompt text are in different languages.')
127
  # if in zero_shot cross_lingual, please make sure that prompt_text and prompt_wav meets requirements
128
+ if mode_checkbox_group in ['3s Voice Clone', 'Cross-lingual Clone']:
129
  if prompt_wav is None:
130
  gr.Warning('Empty prompt found, please check the prompt text.')
131
  yield (target_sr, default_data)
 
137
  if instruct_text != '' or prompt_wav is not None or prompt_text != '':
138
  gr.Info('You are using Pretrained Voice mode. Pretrained Voice/Instruct will be ingnored.')
139
  # zero_shot mode only use prompt_wav prompt text
140
+ if mode_checkbox_group in ['3s Voice Clone']:
141
  if prompt_text == '':
142
  gr.Warning('Empty prompt found, please check the prompt text.')
143
  yield (target_sr, default_data)
144
  if instruct_text != '':
145
+ gr.Info('You are using 3s Voice Clone mode. Pretrained Voice/Instruct will be ingnored.')
146
  info = torchaudio.info(prompt_wav)
147
  if info.num_frames / info.sample_rate > 10:
148
  gr.Warning('Please use prompt audio shorter than 10s.')
 
153
  set_all_random_seed(seed)
154
  for i in cosyvoice.inference_sft(tts_text, sft_dropdown, stream=stream, speed=speed):
155
  yield (target_sr, i['tts_speech'].numpy().flatten())
156
+ elif mode_checkbox_group == '3s Voice Clone':
157
  logging.info('get zero_shot inference request')
158
  prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
159
  set_all_random_seed(seed)
160
  for i in cosyvoice.inference_zero_shot(tts_text, prompt_text, prompt_speech_16k, stream=stream, speed=speed):
161
  yield (target_sr, i['tts_speech'].numpy().flatten())
162
+ elif mode_checkbox_group == 'Cross-lingual Clone':
163
  logging.info('get cross_lingual inference request')
164
  prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
165
  set_all_random_seed(seed)