Approximetal committed
Commit 4508345 · verified · Parent: c7f69cf

Update gradio_mix.py

Files changed (1): gradio_mix.py (+16 -49)
gradio_mix.py CHANGED
@@ -68,15 +68,7 @@ def _pick_device():
     return "cuda" if torch.cuda.is_available() else "cpu"
 
 device = _pick_device()
-# For WhisperX ASR:
-# - On Spaces we always construct the pipeline lazily inside @spaces.GPU
-#   functions, so keep the default "cpu" here to avoid touching CUDA in
-#   the main process.
-# - Elsewhere prefer CUDA if available.
-if IS_SPACES:
-    ASR_DEVICE = "cpu"
-else:
-    ASR_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+
 whisper_model, align_model = None, None
 tts_edit_model = None
 
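The comments deleted above describe the constraint behind most of this commit: on ZeroGPU Spaces the main process must never initialize CUDA, so anything CUDA-related is built lazily inside functions decorated with @spaces.GPU. A minimal sketch of that pattern, assuming the Hugging Face `spaces` package and a hypothetical `build_model` loader:

    import spaces
    import torch

    _model = None  # created lazily; nothing touches CUDA at import time

    @spaces.GPU  # ZeroGPU attaches a GPU to the process only for this call
    def infer(audio):
        global _model
        if _model is None:
            _model = build_model().to("cuda")  # hypothetical loader
        return _model(audio)
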
@@ -139,31 +131,16 @@ class UVR5:
         )
 
         uvr5_model = Inference(model_data, device)
-        # On HF Spaces with stateless GPU, we must not initialize CUDA in the
-        # main process. The heavy UVR5 loading happens lazily inside
-        # @spaces.GPU functions; this guard is kept only for the CPU path to
-        # avoid any accidental CUDA init.
-        if IS_SPACES and device == "cpu":
-            orig_is_available = torch.cuda.is_available
-            torch.cuda.is_available = lambda: False
-            try:
-                uvr5_model.load_model(model_path, 1)
-            finally:
-                torch.cuda.is_available = orig_is_available
-        else:
-            uvr5_model.load_model(model_path, 1)
+        uvr5_model.load_model(model_path, 1)
 
         self.model = uvr5_model
         self.device = device
         return self.model
 
     def denoise(self, audio_info):
-        # Prefer GPU if available; on Spaces this runs inside @spaces.GPU so
-        # CUDA can be safely initialized here.
-        device = "cuda" if torch.cuda.is_available() else "cpu"
-        model = self.load_model(device=device)
+        model = self.load_model(device="cpu")
         input_audio = load_wav(audio_info, sr=44100, channel=2)
-        output_audio = model.demix_base({0:input_audio.squeeze()}, is_match_mix=False, device=device)
+        output_audio = model.demix_base({0:input_audio.squeeze()}, is_match_mix=False, device="cpu")
         # transform = torchaudio.transforms.Resample(44100, 16000)
         # output_audio = transform(output_audio)
         return output_audio.squeeze().T.cpu().numpy(), 44100
 
@@ -450,9 +427,13 @@ class MMSAlignModel:
 class WhisperxModel:
     def __init__(self, model_name):
         # Lazily construct the WhisperX pipeline so that on Spaces we only
-        # touch CUDA inside @spaces.GPU workers.
+        # touch CUDA inside spaces.GPU workers.
         self.model_name = model_name
-        self.model = None
+        self.model = None
+        if IS_SPACES and torch.cuda.is_available():
+            self.device = "cuda"
+        else:
+            self.device = "cpu"
 
     def _ensure_model(self):
         if self.model is not None:
 
@@ -461,19 +442,11 @@ class WhisperxModel:
 
         prompt = None # "This might be a blend of Simplified Chinese and English speech, do not translate, only transcription be allowed."
 
-        # On Spaces, this will be called from within @spaces.GPU so we can
-        # safely move the ASR to CUDA if available. Locally we respect the
-        # ASR_DEVICE hint.
-        if IS_SPACES:
-            asr_device = "cuda" if torch.cuda.is_available() else "cpu"
-        else:
-            asr_device = ASR_DEVICE
-
         # Use the lighter Silero VAD backend to avoid pyannote checkpoints
         # and their PyTorch 2.6 `weights_only` pickling issues.
         self.model = load_model(
             self.model_name,
-            asr_device,
+            self.device,
             compute_type="float32",
             asr_options={
                 "suppress_numerals": False,
 
@@ -700,7 +673,7 @@ def get_transcribe_state(segments):
         "word_bounds": [f"{word['start']} {word['word']} {word['end']}" for word in segments["words"]]
     }
 
-@spaces.GPU(duration=240)
+@spaces.GPU
 @torch.no_grad()
 @torch.inference_mode()
 def transcribe(seed, audio_info):
 
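Dropping duration=240 here (and in `run` below) falls back to ZeroGPU's default GPU allocation window, 60 seconds at the time of writing, so calls that previously fit in 240 s can now hit the worker time limit. The two forms side by side:

    @spaces.GPU                # default window (60 s at the time of writing)
    def transcribe(seed, audio_info): ...

    @spaces.GPU(duration=240)  # previous form: request up to 240 s per call
    def transcribe(seed, audio_info): ...
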
@@ -719,9 +692,6 @@ def transcribe(seed, audio_info):
         state
     ]
 
-@spaces.GPU(duration=240)
-@torch.no_grad()
-@torch.inference_mode()
 def align(transcript, audio_info, state):
     lang = state["segments"]["lang"]
     # print("realign: ", transcript, state)
 
@@ -747,9 +717,6 @@ def align(transcript, audio_info, state):
     ]
 
 
-@spaces.GPU(duration=240)
-@torch.no_grad()
-@torch.inference_mode()
 def denoise(audio_info):
     # Denoiser can be relatively heavy (especially UVR5), so schedule it on
     # GPU workers when running on HF Spaces.
 
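Note that the comment kept inside `denoise` still says the denoiser is scheduled on GPU workers, but with the decorators removed it now runs undecorated in the main process; combined with the UVR5 hunk above, which pins loading and demixing to "cpu", the denoise path is CPU-only after this commit.
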
@@ -769,7 +736,7 @@ def get_output_audio(audio_tensors, sr):
     print("save result:", result.shape)
     # wavfile.write(os.path.join(TMP_PATH, "output.wav"), sr, result)
     return (int(sr), result)
-
+
 
 def get_edit_audio_part(audio_info, edit_start, edit_end):
     sr, raw_wav = audio_info
 
@@ -796,7 +763,7 @@ def replace_numbers_with_words(sentence, lang="en"):
     return re.sub(r'\b\d+\b', replace_with_words, sentence) # Regular expression that matches numbers
 
 
-@spaces.GPU(duration=240)
+@spaces.GPU
 @torch.no_grad()
 @torch.inference_mode()
 def run(seed, nfe_step, speed, cfg_strength, sway_sampling_coef, ref_ratio,
 
@@ -1069,7 +1036,7 @@ def get_app():
             )
             denoise_model_choice = gr.Radio(label="Denoise Model", scale=2, value="UVR5", choices=["UVR5", "DeepFilterNet"]) # "830M", "330M_TTSEnhanced", "830M_TTSEnhanced"])
             # whisper_backend_choice = gr.Radio(label="Whisper backend", value="", choices=["whisperX", "whisper"])
-            whisper_model_choice = gr.Radio(label="Whisper model", scale=3, value="small", choices=["base", "small", "medium", "large"])
+            whisper_model_choice = gr.Radio(label="Whisper model", scale=3, value="medium", choices=["base", "small", "medium", "large"])
             align_model_choice = gr.Radio(label="Forced alignment model", scale=2, value="MMS", choices=["whisperX", "MMS"], visible=False)
 
             with gr.Row():
 
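For scale, OpenAI's Whisper checkpoints are roughly 74M (base), 244M (small), 769M (medium), and 1.55B (large) parameters, so the new `medium` default spends about three times the ASR compute of `small` in exchange for noticeably better multilingual accuracy.
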
@@ -1174,7 +1141,7 @@ def get_app():
             with gr.Row():
                 nfe_step = gr.Number(
                     label="NFE Step",
-                    value=32,
+                    value=64,
                     precision=0,
                     info="Number of function evaluations (sampling steps).",
                 )
 
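On the NFE default: in a flow-matching sampler, nfe_step is the number of ODE integration steps, so sampling cost grows linearly with it; 64 steps cost roughly twice as much as 32. A minimal sketch of the idea, with `velocity_model` as a hypothetical stand-in for the learned network:

    import torch

    def sample(velocity_model, noise, nfe_step=64):
        # Plain Euler integration of the learned velocity field from t=0 to t=1;
        # each iteration is one "function evaluation" of the network.
        x = noise
        dt = 1.0 / nfe_step
        for i in range(nfe_step):
            t = torch.full((x.shape[0],), i * dt)
            x = x + dt * velocity_model(x, t)
        return x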