Spaces:

gaunernst
/

AudioMAE-AudioSet20k

Sleeping

App Files Files Community

gaunernst committed on Nov 25, 2023

Commit

5b04966

•

1 Parent(s): bcc0935

fix stereo audio

Browse files

Files changed (2) hide show

app.py +9 -13
packages.txt +0 -1

app.py CHANGED Viewed

@@ -1,14 +1,12 @@
 import json
-import shlex
-import subprocess
 import gradio as gr
-import numpy as np
 import requests
 import timm
 import torch
 import torch.nn.functional as F
 from torchaudio.compliance import kaldi
 TAG = "gaunernst/vit_base_patch16_1024_128.audiomae_as2m_ft_as20k"
 MODEL = timm.create_model(f"hf_hub:{TAG}", pretrained=True).eval()
@@ -21,12 +19,6 @@ MEAN = -4.2677393
 STD = 4.5689974
-def resample(x: np.ndarray, sr: int):
-    cmd = f"ffmpeg -ar {sr} -f s16le -i - -ar {SAMPLING_RATE} -f f32le -"
-    proc = subprocess.run(shlex.split(cmd), capture_output=True, input=x.tobytes())
-    return np.frombuffer(proc.stdout, dtype=np.float32)
 def preprocess(x: torch.Tensor):
     x = x - x.mean()
     melspec = kaldi.fbank(x.unsqueeze(0), htk_compat=True, window_type="hanning", num_mel_bins=128)
@@ -35,7 +27,7 @@ def preprocess(x: torch.Tensor):
     else:
         melspec = melspec[:1024]
     melspec = (melspec - MEAN) / (STD * 2)
-    return melspec.view(1, 1, 1024, 128)
 def predict(audio, start):
@@ -43,11 +35,15 @@ def predict(audio, start):
     if x.shape[0] < start * sr:
         raise gr.Error(f"`start` ({start}) must be smaller than audio duration ({x.shape[0] / sr:.0f}s)")
-    x = resample(x[int(start * sr) :], sr)
-    x = torch.from_numpy(x)
     with torch.inference_mode():
-        logits = MODEL(preprocess(x)).squeeze(0)
     topk_probs, topk_classes = logits.sigmoid().topk(10)
     return [[AUDIOSET_LABELS[cls], prob.item() * 100] for cls, prob in zip(topk_classes, topk_probs)]

 import json
 import gradio as gr
 import requests
 import timm
 import torch
 import torch.nn.functional as F
 from torchaudio.compliance import kaldi
+from torchaudio.functional import resample
 TAG = "gaunernst/vit_base_patch16_1024_128.audiomae_as2m_ft_as20k"
 MODEL = timm.create_model(f"hf_hub:{TAG}", pretrained=True).eval()
 STD = 4.5689974
 def preprocess(x: torch.Tensor):
     x = x - x.mean()
     melspec = kaldi.fbank(x.unsqueeze(0), htk_compat=True, window_type="hanning", num_mel_bins=128)
     else:
         melspec = melspec[:1024]
     melspec = (melspec - MEAN) / (STD * 2)
+    return melspec.view(1, 1024, 128)
 def predict(audio, start):
     if x.shape[0] < start * sr:
         raise gr.Error(f"`start` ({start}) must be smaller than audio duration ({x.shape[0] / sr:.0f}s)")
+    x = torch.from_numpy(x) / (1 << 15)
+    if x.ndim > 1:
+        x = x.mean(-1)
+    assert x.ndim == 1
+    x = resample(x[int(start * sr) :], sr, SAMPLING_RATE)
+    x = preprocess(x)
     with torch.inference_mode():
+        logits = MODEL(x.unsqueeze(0)).squeeze(0)
     topk_probs, topk_classes = logits.sigmoid().topk(10)
     return [[AUDIOSET_LABELS[cls], prob.item() * 100] for cls, prob in zip(topk_classes, topk_probs)]

packages.txt DELETED Viewed

	@@ -1 +0,0 @@
1	- ffmpeg