barreloflube
committed on
Commit
•
89b8fd1
1
Parent(s):
aab35a3
Refactor gen_audio function to use CosyVoice TTS instead of SFT
Browse files
- tabs/audios/events.py +3 -2
- tabs/audios/load_models.py +5 -5
tabs/audios/events.py
CHANGED
@@ -168,8 +168,9 @@ def gen_audio(text, mode, sft_speaker = None, speaker_audio = None, voice_instru
|
|
168 |
if not speaker_audio_file:
|
169 |
raise gr.Error('Please upload an audio')
|
170 |
|
171 |
-
for i, j in enumerate(
|
172 |
tts_text=text,
|
|
|
173 |
prompt_speech_16k=prompt_speech_16k,
|
174 |
)):
|
175 |
torchaudio.save(
|
@@ -181,7 +182,7 @@ def gen_audio(text, mode, sft_speaker = None, speaker_audio = None, voice_instru
|
|
181 |
if not speaker_audio_file:
|
182 |
raise gr.Error('Please upload an audio')
|
183 |
|
184 |
-
for i, j in enumerate(
|
185 |
tts_text=text,
|
186 |
prompt_speech_16k=prompt_speech_16k,
|
187 |
)):
|
|
|
168 |
if not speaker_audio_file:
|
169 |
raise gr.Error('Please upload an audio')
|
170 |
|
171 |
+
for i, j in enumerate(cv_vc.inference_zero_shot(
|
172 |
tts_text=text,
|
173 |
+
prompt_text=voice_instructions,
|
174 |
prompt_speech_16k=prompt_speech_16k,
|
175 |
)):
|
176 |
torchaudio.save(
|
|
|
182 |
if not speaker_audio_file:
|
183 |
raise gr.Error('Please upload an audio')
|
184 |
|
185 |
+
for i, j in enumerate(cv_vc.inference_cross_lingual(
|
186 |
tts_text=text,
|
187 |
prompt_speech_16k=prompt_speech_16k,
|
188 |
)):
|
tabs/audios/load_models.py
CHANGED
@@ -23,16 +23,16 @@ def init_sys():
|
|
23 |
# Add `tabs/audios/modules/CosyVoice/third_party/Matcha-TTS` to your `PYTHONPATH`
|
24 |
os.environ['PYTHONPATH'] = f'{os.path.dirname(__file__)}/modules/CosyVoice/third_party/Matcha-TTS:{os.environ.get("PYTHONPATH", "")}'
|
25 |
|
26 |
-
# Load CosyVoice TTS
|
27 |
-
cv_base = CosyVoice('pretrained_models/CosyVoice-300M')
|
28 |
-
|
29 |
# Load CosyVoice SFT
|
30 |
cv_sft = CosyVoice('pretrained_models/CosyVoice-300M-SFT')
|
31 |
sft_speakers = cv_sft.list_avaliable_spks()
|
32 |
|
|
|
|
|
|
|
33 |
# Load CosyVoice Instruct
|
34 |
cv_instruct = CosyVoice('pretrained_models/CosyVoice-300M-Instruct')
|
35 |
|
36 |
-
return device, df_model, df_state,
|
37 |
|
38 |
-
device, df_model, df_state,
|
|
|
23 |
# Add `tabs/audios/modules/CosyVoice/third_party/Matcha-TTS` to your `PYTHONPATH`
|
24 |
os.environ['PYTHONPATH'] = f'{os.path.dirname(__file__)}/modules/CosyVoice/third_party/Matcha-TTS:{os.environ.get("PYTHONPATH", "")}'
|
25 |
|
|
|
|
|
|
|
26 |
# Load CosyVoice SFT
|
27 |
cv_sft = CosyVoice('pretrained_models/CosyVoice-300M-SFT')
|
28 |
sft_speakers = cv_sft.list_avaliable_spks()
|
29 |
|
30 |
+
# Load CosyVoice TTS
|
31 |
+
cv_vc = CosyVoice('pretrained_models/CosyVoice-300M')
|
32 |
+
|
33 |
# Load CosyVoice Instruct
|
34 |
cv_instruct = CosyVoice('pretrained_models/CosyVoice-300M-Instruct')
|
35 |
|
36 |
+
return device, df_model, df_state, cv_vc, cv_sft, sft_speakers, cv_instruct
|
37 |
|
38 |
+
device, df_model, df_state, cv_vc, cv_sft, sft_speakers, cv_instruct = init_sys()
|