vitaliy-sharandin committed on
Commit
3fd57b8
1 Parent(s): 2353dd0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +13 -9
app.py CHANGED
@@ -161,10 +161,7 @@ def voice_cloning_translation(translated_transcription, speakers_voice_clips, ta
161
  final_audio_track = None
162
 
163
  try:
164
- # TODO uncomment when https://github.com/coqui-ai/TTS/issues/3224 is resolved
165
- # tts = TTS(selected_model).to(device)
166
-
167
- # Generate and concatenate voice clips per speaker
168
 
169
  last_end_time = 0
170
  clips = []
@@ -184,9 +181,18 @@ def voice_cloning_translation(translated_transcription, speakers_voice_clips, ta
184
 
185
  # Generate speech
186
  print(f"[{speech_item['speaker']}]")
187
- tts = TTS(selected_model).to(device)
188
- audio = tts.tts_with_vc(text=speech_item['text'], speaker_wav=speakers_voice_clips[speech_item['speaker']], language=target_language)
189
- sample_rate = tts.voice_converter.vc_config.audio.output_sample_rate
 
 
 
 
 
 
 
 
 
190
 
191
  # Adjust pace to fit the speech timeframe if translated audio is longer than phrase
192
  audio_duration = len(audio) / sample_rate
@@ -209,8 +215,6 @@ def voice_cloning_translation(translated_transcription, speakers_voice_clips, ta
209
 
210
  last_end_time = speech_item['start'] + audio_clip.duration
211
 
212
- del tts; import gc; gc.collect(); torch.cuda.empty_cache()
213
-
214
  # Merge sentences
215
  final_audio_track = mp.concatenate_audioclips(clips)
216
 
 
161
  final_audio_track = None
162
 
163
  try:
164
+ tts = TTS(selected_model).to(device)
 
 
 
165
 
166
  last_end_time = 0
167
  clips = []
 
181
 
182
  # Generate speech
183
  print(f"[{speech_item['speaker']}]")
184
+
185
+ sample_rate = None
186
+ audio = None
187
+ if 'vits' in selected_model:
188
+ audio = tts.tts_with_vc(text=speech_item['text'], speaker_wav=speakers_voice_clips[speech_item['speaker']])
189
+ sample_rate = tts.voice_converter.vc_config.audio.output_sample_rate
190
+ else:
191
+ # TODO remove when https://github.com/coqui-ai/TTS/issues/3224 is resolved
192
+ tts = TTS(selected_model).to(device)
193
+ audio = tts.tts_with_vc(text=speech_item['text'], speaker_wav=speakers_voice_clips[speech_item['speaker']], language=target_language)
194
+ sample_rate = tts.voice_converter.vc_config.audio.output_sample_rate
195
+ del tts; import gc; gc.collect(); torch.cuda.empty_cache()
196
 
197
  # Adjust pace to fit the speech timeframe if translated audio is longer than phrase
198
  audio_duration = len(audio) / sample_rate
 
215
 
216
  last_end_time = speech_item['start'] + audio_clip.duration
217
 
 
 
218
  # Merge sentences
219
  final_audio_track = mp.concatenate_audioclips(clips)
220