Update app.py
app.py CHANGED
@@ -1,3 +1,4 @@
+import spaces
 import gradio as gr
 import torch
 from TTS.api import TTS
@@ -11,9 +12,6 @@ from TTS.tts.models.xtts import Xtts
 # Accept the COQUI terms of service
 os.environ["COQUI_TOS_AGREED"] = "1"
 
-# Set reduced precision to speed up CPU inference
-torch.set_default_dtype(torch.float16)
-
 # Set the device to CPU
 device = "cpu"
 
@@ -22,24 +20,20 @@ model_path = hf_hub_download(repo_id="RedSparkie/danielmula", filename="model.pt
 config_path = hf_hub_download(repo_id="RedSparkie/danielmula", filename="config.json")
 vocab_path = hf_hub_download(repo_id="RedSparkie/danielmula", filename="vocab.json")
 
-# Function to preprocess the audio
-def preprocess_audio(audio_path, target_sr=24000):
-    # Load the reference audio
-    waveform, original_sr = torchaudio.load(audio_path)
-    # Resample if the sampling rate differs
-    if original_sr != target_sr:
-        resampler = torchaudio.transforms.Resample(orig_freq=original_sr, new_freq=target_sr)
-        waveform = resampler(waveform)
-
-    # Convert to 16-bit
-    waveform = waveform * (2**15)  # Scale to the 16-bit range
-    waveform = waveform.to(torch.int16)  # Convert to 16-bit format
-    return waveform, target_sr
+# Function to clear the GPU cache (in case a GPU is used in the future)
+def clear_gpu_cache():
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
 
 # Load the XTTS model
 XTTS_MODEL = None
 def load_model(xtts_checkpoint, xtts_config, xtts_vocab):
     global XTTS_MODEL
+    clear_gpu_cache()
+    if not xtts_checkpoint or not xtts_config or not xtts_vocab:
+        return "You need to run the previous steps or manually set the `XTTS checkpoint path`, `XTTS config path`, and `XTTS vocab path` fields !!"
+
+    # Model configuration
     config = XttsConfig()
     config.load_json(xtts_config)
 
@@ -48,7 +42,8 @@ def load_model(xtts_checkpoint, xtts_config, xtts_vocab):
     print("Loading XTTS model!")
 
     # Load the model checkpoint
-    XTTS_MODEL.load_checkpoint(config, checkpoint_path=xtts_checkpoint, vocab_path=xtts_vocab, use_deepspeed=False)
+    XTTS_MODEL.load_checkpoint(config, checkpoint_path=xtts_checkpoint, vocab_path=xtts_vocab, use_deepspeed=False, weights_only=True)
+
     print("Model Loaded!")
 
 # Function to run TTS
@@ -56,25 +51,14 @@ def run_tts(lang, tts_text, speaker_audio_file):
     if XTTS_MODEL is None or not speaker_audio_file:
         return "You need to run the previous step to load the model !!", None, None
 
-    # Preprocess the audio (resample to 24000 Hz and convert to 16-bit)
-    waveform, sr = preprocess_audio(speaker_audio_file)
-
-    # Save the processed audio to a temporary file for the model
-    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
-        torchaudio.save(fp.name, waveform, sr)
-        processed_audio_path = fp.name
-
     # Use inference_mode to improve performance
     with torch.inference_mode():
         gpt_cond_latent, speaker_embedding = XTTS_MODEL.get_conditioning_latents(
-            audio_path=processed_audio_path,
+            audio_path=speaker_audio_file,
             gpt_cond_len=XTTS_MODEL.config.gpt_cond_len,
             max_ref_length=XTTS_MODEL.config.max_ref_len,
             sound_norm_refs=XTTS_MODEL.config.sound_norm_refs
         )
-
-    if gpt_cond_latent is None or speaker_embedding is None:
-        return "Failed to process the audio file.", None, None
 
     out = XTTS_MODEL.inference(
         text=tts_text,
@@ -98,6 +82,7 @@ def run_tts(lang, tts_text, speaker_audio_file):
     return out_path, speaker_audio_file
 
 # Define the function for Gradio
+@spaces.GPU
 def generate(text, audio):
     load_model(model_path, config_path, vocab_path)
     out_path, speaker_audio_file = run_tts(lang='es', tts_text=text, speaker_audio_file=audio)
@@ -110,5 +95,5 @@ demo = gr.Interface(
     outputs=gr.Audio(type='filepath')
 )
 
-# Launch the interface
-demo.launch(
+# Launch the interface
+demo.launch()
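For context on the `@spaces.GPU` decorator this commit adds: it is the Hugging Face ZeroGPU entry point, which attaches a GPU to the Space only while the decorated function runs. A minimal sketch of the pattern, assuming the `spaces` package that Hugging Face provides on ZeroGPU hardware (the `greet` function and its body are illustrative, not part of this Space):

import spaces
import torch

@spaces.GPU  # a GPU is attached only while this call is executing
def greet(text):
    # On ZeroGPU hardware, CUDA typically becomes visible inside the
    # decorated call even if the module-level device was set to CPU
    device = "cuda" if torch.cuda.is_available() else "cpu"
    return f"processed {text!r} on {device}"

Under this pattern the model is usually loaded inside the decorated function, as `generate` does via `load_model`, so the weights land on whichever device is actually available at call time.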