Humair332 committed on
Commit
da16507
·
verified ·
1 Parent(s): f8f2ee2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +17 -14
app.py CHANGED
@@ -49,12 +49,10 @@ class CausalTransposeConv1d(nn.ConvTranspose1d):
49
  return super().forward(x)[..., : -(self.__padding * 2 - self.__output_padding)]
50
 
51
 
52
-
53
  def WNCausalConv1d(*args, **kwargs):
54
  return weight_norm(CausalConv1d(*args, **kwargs))
55
 
56
 
57
-
58
  def WNCausalTransposeConv1d(*args, **kwargs):
59
  return weight_norm(CausalTransposeConv1d(*args, **kwargs))
60
 
@@ -521,6 +519,10 @@ class LoadedCodec:
521
  def sample_rate(self) -> int:
522
  return int(self.model.sample_rate)
523
 
 
 
 
 
524
  @property
525
  def hop_length(self) -> int:
526
  return int(self.model.hop_length)
@@ -532,7 +534,6 @@ class LoadedCodec:
532
  return self.model.decode(z)
533
 
534
 
535
-
536
  def _pick_state_dict(obj):
537
  if isinstance(obj, dict):
538
  for key in ("state_dict", "model", "vae", "audio_vae", "module"):
@@ -574,7 +575,6 @@ def load_audio_file(path: str) -> Tuple[np.ndarray, int]:
574
  return audio.astype(np.float32), int(sr)
575
 
576
 
577
-
578
  def resample_audio(audio: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
579
  if orig_sr == target_sr:
580
  return audio
@@ -582,12 +582,10 @@ def resample_audio(audio: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarra
582
  return scipy_resample(audio, num_samples).astype(np.float32)
583
 
584
 
585
-
586
  def to_tensor(audio: np.ndarray, device: str) -> torch.Tensor:
587
  return torch.from_numpy(audio).unsqueeze(0).unsqueeze(0).to(device)
588
 
589
 
590
-
591
  def save_wav_temp(wav: np.ndarray, sr: int) -> str:
592
  fd, path = tempfile.mkstemp(suffix=".wav")
593
  os.close(fd)
@@ -595,7 +593,6 @@ def save_wav_temp(wav: np.ndarray, sr: int) -> str:
595
  return path
596
 
597
 
598
-
599
  def fmt_stats(kv: dict) -> str:
600
  lines = ["| Property | Value |", "|---|---|"]
601
  for k, v in kv.items():
@@ -624,7 +621,8 @@ def encode_audio(file_path):
624
 
625
  stats = {
626
  "Original SR": f"{sr} Hz",
627
- "Model SR": f"{codec.sample_rate} Hz",
 
628
  "Original samples": f"{orig_len:,}",
629
  "Resampled samples": f"{len(audio):,}",
630
  "Latent shape": str(tuple(latent.shape)),
@@ -639,7 +637,6 @@ def encode_audio(file_path):
639
  return latent.tolist(), latent.tolist(), fmt_stats(stats)
640
 
641
 
642
-
643
  def decode_audio(latent_list, current_stats):
644
  if latent_list is None:
645
  return None, (current_stats or "") + "\n\nNo latent found. Encode first."
@@ -659,15 +656,20 @@ def decode_audio(latent_list, current_stats):
659
  wav = np.nan_to_num(wav)
660
  wav = np.clip(wav, -1.0, 1.0)
661
 
 
 
 
 
 
662
  stats = {
663
  "Decoded samples": f"{len(wav):,}",
664
- "Output SR": f"{codec.sample_rate} Hz",
665
- "Duration": f"{len(wav) / codec.sample_rate:.4f} s",
666
  "Wave min/max": f"{wav.min():.4f} / {wav.max():.4f}",
667
  }
668
 
669
  merged = (current_stats or "") + "\n\n### Decode Stats\n" + fmt_stats(stats)
670
- return (codec.sample_rate, wav), merged
671
 
672
 
673
  # =========================================================
@@ -697,7 +699,8 @@ with gr.Blocks(css=CSS, title="AudioVAE Encode / Decode") as demo:
697
  Standalone one-file app for `audiovae.pth`.
698
 
699
  **Repo:** `{REPO_ID}`
700
- **Model SR:** `{codec.sample_rate} Hz`
 
701
  **Hop length:** `{codec.hop_length}`
702
  """
703
  )
@@ -731,4 +734,4 @@ Standalone one-file app for `audiovae.pth`.
731
 
732
 
733
  if __name__ == "__main__":
734
- demo.launch()
 
49
  return super().forward(x)[..., : -(self.__padding * 2 - self.__output_padding)]
50
 
51
 
 
52
  def WNCausalConv1d(*args, **kwargs):
53
  return weight_norm(CausalConv1d(*args, **kwargs))
54
 
55
 
 
56
  def WNCausalTransposeConv1d(*args, **kwargs):
57
  return weight_norm(CausalTransposeConv1d(*args, **kwargs))
58
 
 
519
  def sample_rate(self) -> int:
520
  return int(self.model.sample_rate)
521
 
522
+ @property
523
+ def out_sample_rate(self) -> int: # ✅ NEW: expose out_sample_rate
524
+ return int(self.model.out_sample_rate)
525
+
526
  @property
527
  def hop_length(self) -> int:
528
  return int(self.model.hop_length)
 
534
  return self.model.decode(z)
535
 
536
 
 
537
  def _pick_state_dict(obj):
538
  if isinstance(obj, dict):
539
  for key in ("state_dict", "model", "vae", "audio_vae", "module"):
 
575
  return audio.astype(np.float32), int(sr)
576
 
577
 
 
578
  def resample_audio(audio: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
579
  if orig_sr == target_sr:
580
  return audio
 
582
  return scipy_resample(audio, num_samples).astype(np.float32)
583
 
584
 
 
585
  def to_tensor(audio: np.ndarray, device: str) -> torch.Tensor:
586
  return torch.from_numpy(audio).unsqueeze(0).unsqueeze(0).to(device)
587
 
588
 
 
589
  def save_wav_temp(wav: np.ndarray, sr: int) -> str:
590
  fd, path = tempfile.mkstemp(suffix=".wav")
591
  os.close(fd)
 
593
  return path
594
 
595
 
 
596
  def fmt_stats(kv: dict) -> str:
597
  lines = ["| Property | Value |", "|---|---|"]
598
  for k, v in kv.items():
 
621
 
622
  stats = {
623
  "Original SR": f"{sr} Hz",
624
+ "Model input SR": f"{codec.sample_rate} Hz",
625
+ "Model output SR": f"{codec.out_sample_rate} Hz", # ✅ shown for clarity
626
  "Original samples": f"{orig_len:,}",
627
  "Resampled samples": f"{len(audio):,}",
628
  "Latent shape": str(tuple(latent.shape)),
 
637
  return latent.tolist(), latent.tolist(), fmt_stats(stats)
638
 
639
 
 
640
  def decode_audio(latent_list, current_stats):
641
  if latent_list is None:
642
  return None, (current_stats or "") + "\n\nNo latent found. Encode first."
 
656
  wav = np.nan_to_num(wav)
657
  wav = np.clip(wav, -1.0, 1.0)
658
 
659
+ # ✅ FIX: use out_sample_rate (48000), NOT sample_rate (16000).
660
+ # The decoder upsamples by prod(decoder_rates) = 8×6×5×2×2×2 = 1920,
661
+ # so the output SR is 48000 Hz, not 16000 Hz.
662
+ out_sr = codec.out_sample_rate
663
+
664
  stats = {
665
  "Decoded samples": f"{len(wav):,}",
666
+ "Output SR": f"{out_sr} Hz", # ✅ 48000
667
+ "Duration": f"{len(wav) / out_sr:.4f} s", # ✅ correct duration
668
  "Wave min/max": f"{wav.min():.4f} / {wav.max():.4f}",
669
  }
670
 
671
  merged = (current_stats or "") + "\n\n### Decode Stats\n" + fmt_stats(stats)
672
+ return (out_sr, wav), merged # ✅ tell Gradio correct SR
673
 
674
 
675
  # =========================================================
 
699
  Standalone one-file app for `audiovae.pth`.
700
 
701
  **Repo:** `{REPO_ID}`
702
+ **Model input SR:** `{codec.sample_rate} Hz`
703
+ **Model output SR:** `{codec.out_sample_rate} Hz`
704
  **Hop length:** `{codec.hop_length}`
705
  """
706
  )
 
734
 
735
 
736
  if __name__ == "__main__":
737
+ demo.launch()