vaevox

Sleeping

App Files Files Community

Humair332 committed 17 days ago

Commit

da16507

verified ·

1 Parent(s): f8f2ee2

Update app.py

Browse files

Files changed (1) hide show

app.py +17 -14

app.py CHANGED Viewed

@@ -49,12 +49,10 @@ class CausalTransposeConv1d(nn.ConvTranspose1d):
         return super().forward(x)[..., : -(self.__padding * 2 - self.__output_padding)]
 def WNCausalConv1d(*args, **kwargs):
     """Build a ``CausalConv1d`` from the forwarded arguments and return it wrapped by ``weight_norm``."""
     conv = CausalConv1d(*args, **kwargs)
     return weight_norm(conv)
 def WNCausalTransposeConv1d(*args, **kwargs):
     """Build a ``CausalTransposeConv1d`` from the forwarded arguments and return it wrapped by ``weight_norm``."""
     transpose_conv = CausalTransposeConv1d(*args, **kwargs)
     return weight_norm(transpose_conv)
@@ -521,6 +519,10 @@ class LoadedCodec:
     def sample_rate(self) -> int:
         return int(self.model.sample_rate)
     @property
     def hop_length(self) -> int:
         """Return the wrapped model's ``hop_length`` attribute converted to ``int``."""
         raw_hop = self.model.hop_length
         return int(raw_hop)
@@ -532,7 +534,6 @@ class LoadedCodec:
         return self.model.decode(z)
 def _pick_state_dict(obj):
     if isinstance(obj, dict):
         for key in ("state_dict", "model", "vae", "audio_vae", "module"):
@@ -574,7 +575,6 @@ def load_audio_file(path: str) -> Tuple[np.ndarray, int]:
     return audio.astype(np.float32), int(sr)
 def resample_audio(audio: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
     if orig_sr == target_sr:
         return audio
@@ -582,12 +582,10 @@ def resample_audio(audio: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarra
     return scipy_resample(audio, num_samples).astype(np.float32)
 def to_tensor(audio: np.ndarray, device: str) -> torch.Tensor:
     """Convert a NumPy array into a tensor with two leading singleton dimensions, placed on ``device``."""
     tensor = torch.from_numpy(audio)
     # Indexing with ``None`` twice prepends the same two size-1 axes as two unsqueeze(0) calls.
     tensor = tensor[None, None]
     return tensor.to(device)
 def save_wav_temp(wav: np.ndarray, sr: int) -> str:
     fd, path = tempfile.mkstemp(suffix=".wav")
     os.close(fd)
@@ -595,7 +593,6 @@ def save_wav_temp(wav: np.ndarray, sr: int) -> str:
     return path
 def fmt_stats(kv: dict) -> str:
     lines = ["| Property | Value |", "|---|---|"]
     for k, v in kv.items():
@@ -624,7 +621,8 @@ def encode_audio(file_path):
     stats = {
         "Original SR": f"{sr} Hz",
-        "Model SR": f"{codec.sample_rate} Hz",
         "Original samples": f"{orig_len:,}",
         "Resampled samples": f"{len(audio):,}",
         "Latent shape": str(tuple(latent.shape)),
@@ -639,7 +637,6 @@ def encode_audio(file_path):
     return latent.tolist(), latent.tolist(), fmt_stats(stats)
 def decode_audio(latent_list, current_stats):
     if latent_list is None:
         return None, (current_stats or "") + "\n\nNo latent found. Encode first."
@@ -659,15 +656,20 @@ def decode_audio(latent_list, current_stats):
     wav = np.nan_to_num(wav)
     wav = np.clip(wav, -1.0, 1.0)
     stats = {
         "Decoded samples": f"{len(wav):,}",
-        "Output SR": f"{codec.sample_rate} Hz",
-        "Duration": f"{len(wav) / codec.sample_rate:.4f} s",
         "Wave min/max": f"{wav.min():.4f} / {wav.max():.4f}",
     }
     merged = (current_stats or "") + "\n\n### Decode Stats\n" + fmt_stats(stats)
-    return (codec.sample_rate, wav), merged
 # =========================================================
@@ -697,7 +699,8 @@ with gr.Blocks(css=CSS, title="AudioVAE Encode / Decode") as demo:
 Standalone one-file app for `audiovae.pth`.
 **Repo:** `{REPO_ID}`
-**Model SR:** `{codec.sample_rate} Hz`
 **Hop length:** `{codec.hop_length}`
 """
     )
@@ -731,4 +734,4 @@ Standalone one-file app for `audiovae.pth`.
 if __name__ == "__main__":
-    demo.launch()

         return super().forward(x)[..., : -(self.__padding * 2 - self.__output_padding)]
 def WNCausalConv1d(*args, **kwargs):
     """Build a ``CausalConv1d`` from the forwarded arguments and return it wrapped by ``weight_norm``."""
     conv = CausalConv1d(*args, **kwargs)
     return weight_norm(conv)
 def WNCausalTransposeConv1d(*args, **kwargs):
     """Build a ``CausalTransposeConv1d`` from the forwarded arguments and return it wrapped by ``weight_norm``."""
     transpose_conv = CausalTransposeConv1d(*args, **kwargs)
     return weight_norm(transpose_conv)
     def sample_rate(self) -> int:
         return int(self.model.sample_rate)
+    @property
+    def out_sample_rate(self) -> int:                  # ✅ NEW: expose out_sample_rate
+        return int(self.model.out_sample_rate)
     @property
     def hop_length(self) -> int:
         """Return the wrapped model's ``hop_length`` attribute converted to ``int``."""
         raw_hop = self.model.hop_length
         return int(raw_hop)
         return self.model.decode(z)
 def _pick_state_dict(obj):
     if isinstance(obj, dict):
         for key in ("state_dict", "model", "vae", "audio_vae", "module"):
     return audio.astype(np.float32), int(sr)
 def resample_audio(audio: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
     if orig_sr == target_sr:
         return audio
     return scipy_resample(audio, num_samples).astype(np.float32)
 def to_tensor(audio: np.ndarray, device: str) -> torch.Tensor:
     """Convert a NumPy array into a tensor with two leading singleton dimensions, placed on ``device``."""
     tensor = torch.from_numpy(audio)
     # Indexing with ``None`` twice prepends the same two size-1 axes as two unsqueeze(0) calls.
     tensor = tensor[None, None]
     return tensor.to(device)
 def save_wav_temp(wav: np.ndarray, sr: int) -> str:
     fd, path = tempfile.mkstemp(suffix=".wav")
     os.close(fd)
     return path
 def fmt_stats(kv: dict) -> str:
     lines = ["| Property | Value |", "|---|---|"]
     for k, v in kv.items():
     stats = {
         "Original SR": f"{sr} Hz",
+        "Model input SR": f"{codec.sample_rate} Hz",
+        "Model output SR": f"{codec.out_sample_rate} Hz",   # ✅ shown for clarity
         "Original samples": f"{orig_len:,}",
         "Resampled samples": f"{len(audio):,}",
         "Latent shape": str(tuple(latent.shape)),
     return latent.tolist(), latent.tolist(), fmt_stats(stats)
 def decode_audio(latent_list, current_stats):
     if latent_list is None:
         return None, (current_stats or "") + "\n\nNo latent found. Encode first."
     wav = np.nan_to_num(wav)
     wav = np.clip(wav, -1.0, 1.0)
+    # ✅ FIX: use out_sample_rate (48000), NOT sample_rate (16000).
+    # The decoder upsamples by prod(decoder_rates) = 8×6×5×2×2×2 = 1920,
+    # so the output SR is 48000 Hz, not 16000 Hz.
+    out_sr = codec.out_sample_rate
     stats = {
         "Decoded samples": f"{len(wav):,}",
+        "Output SR": f"{out_sr} Hz",                        # ✅ 48000
+        "Duration": f"{len(wav) / out_sr:.4f} s",           # ✅ correct duration
         "Wave min/max": f"{wav.min():.4f} / {wav.max():.4f}",
     }
     merged = (current_stats or "") + "\n\n### Decode Stats\n" + fmt_stats(stats)
+    return (out_sr, wav), merged                             # ✅ tell Gradio correct SR
 # =========================================================
 Standalone one-file app for `audiovae.pth`.
 **Repo:** `{REPO_ID}`
+**Model input SR:** `{codec.sample_rate} Hz`
+**Model output SR:** `{codec.out_sample_rate} Hz`
 **Hop length:** `{codec.hop_length}`
 """
     )
 if __name__ == "__main__":
+    demo.launch()