vitaliy-sharandin
committed on
Commit
•
3fd57b8
1
Parent(s):
2353dd0
Update app.py
Browse files
app.py
CHANGED
@@ -161,10 +161,7 @@ def voice_cloning_translation(translated_transcription, speakers_voice_clips, ta
|
|
161 |
final_audio_track = None
|
162 |
|
163 |
try:
|
164 |
-
|
165 |
-
# tts = TTS(selected_model).to(device)
|
166 |
-
|
167 |
-
# Generate and concatenate voice clips per speaker
|
168 |
|
169 |
last_end_time = 0
|
170 |
clips = []
|
@@ -184,9 +181,18 @@ def voice_cloning_translation(translated_transcription, speakers_voice_clips, ta
|
|
184 |
|
185 |
# Generate speech
|
186 |
print(f"[{speech_item['speaker']}]")
|
187 |
-
|
188 |
-
|
189 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
190 |
|
191 |
# Adjust pace to fit the speech timeframe if translated audio is longer than phrase
|
192 |
audio_duration = len(audio) / sample_rate
|
@@ -209,8 +215,6 @@ def voice_cloning_translation(translated_transcription, speakers_voice_clips, ta
|
|
209 |
|
210 |
last_end_time = speech_item['start'] + audio_clip.duration
|
211 |
|
212 |
-
del tts; import gc; gc.collect(); torch.cuda.empty_cache()
|
213 |
-
|
214 |
# Merge sentences
|
215 |
final_audio_track = mp.concatenate_audioclips(clips)
|
216 |
|
|
|
161 |
final_audio_track = None
|
162 |
|
163 |
try:
|
164 |
+
tts = TTS(selected_model).to(device)
|
|
|
|
|
|
|
165 |
|
166 |
last_end_time = 0
|
167 |
clips = []
|
|
|
181 |
|
182 |
# Generate speech
|
183 |
print(f"[{speech_item['speaker']}]")
|
184 |
+
|
185 |
+
sample_rate = None
|
186 |
+
audio = None
|
187 |
+
if 'vits' in selected_model:
|
188 |
+
audio = tts.tts_with_vc(text=speech_item['text'], speaker_wav=speakers_voice_clips[speech_item['speaker']])
|
189 |
+
sample_rate = tts.voice_converter.vc_config.audio.output_sample_rate
|
190 |
+
else:
|
191 |
+
# TODO remove when https://github.com/coqui-ai/TTS/issues/3224 is resolved
|
192 |
+
tts = TTS(selected_model).to(device)
|
193 |
+
audio = tts.tts_with_vc(text=speech_item['text'], speaker_wav=speakers_voice_clips[speech_item['speaker']], language=target_language)
|
194 |
+
sample_rate = tts.voice_converter.vc_config.audio.output_sample_rate
|
195 |
+
del tts; import gc; gc.collect(); torch.cuda.empty_cache()
|
196 |
|
197 |
# Adjust pace to fit the speech timeframe if translated audio is longer than phrase
|
198 |
audio_duration = len(audio) / sample_rate
|
|
|
215 |
|
216 |
last_end_time = speech_item['start'] + audio_clip.duration
|
217 |
|
|
|
|
|
218 |
# Merge sentences
|
219 |
final_audio_track = mp.concatenate_audioclips(clips)
|
220 |
|