Spaces:

VOJ
/

voj

Sleeping

amroa committed on Jun 1, 2024

Commit

79fcc82

1 Parent(s): 4b48e6e

add audio MAE

Files changed (4) hide show

__pycache__/app.cpython-311.pyc CHANGED Viewed

Binary files a/__pycache__/app.cpython-311.pyc and b/__pycache__/app.cpython-311.pyc differ

__pycache__/classpred.cpython-311.pyc ADDED Viewed

Binary file (3.47 kB). View file

app.py CHANGED Viewed

@@ -9,7 +9,9 @@ from model import BirdAST
 import torch
 import librosa
 import noisereduce as nr
 import pandas as pd
 import torch.nn.functional as F
 import random
 from torchaudio.compliance import kaldi
@@ -56,7 +58,7 @@ def predict(audio, start, end):
     sr, x = audio
     x = np.array(x, dtype=np.float32)/32768.0
-    x = x[start*sr : end*sr]
     res = preprocess_for_inference(x, sr)
     if start >= end:
@@ -72,7 +74,7 @@ def predict(audio, start, end):
     fig2 = plot_wave(sr, x)
-    return res, res, fig1, fig2
 def download_model(url, model_path):
     if not os.path.exists(model_path):

 import torch
 import librosa
 import noisereduce as nr
+import timm
 import pandas as pd
+from classpred import predict_class
 import torch.nn.functional as F
 import random
 from torchaudio.compliance import kaldi
     sr, x = audio
     x = np.array(x, dtype=np.float32)/32768.0
+    x = x[int(start*sr) : int(end*sr)]
     res = preprocess_for_inference(x, sr)
     if start >= end:
     fig2 = plot_wave(sr, x)
+    return predict_class(x, sr, start, end), res, fig1, fig2
 def download_model(url, model_path):
     if not os.path.exists(model_path):

classpred.py ADDED Viewed

+import timm
+import json
+import torch
+from torchaudio.functional import resample
+import numpy as np
+from torchaudio.compliance import kaldi
+import torch.nn.functional as F
+import requests
+TAG = "gaunernst/vit_base_patch16_1024_128.audiomae_as2m_ft_as20k"
+MODEL = timm.create_model(f"hf_hub:{TAG}", pretrained=True).eval()
+LABEL_URL = "https://huggingface.co/datasets/huggingface/label-files/raw/main/audioset-id2label.json"
+AUDIOSET_LABELS = list(json.loads(requests.get(LABEL_URL).content).values())
+SAMPLING_RATE = 16_000
+MEAN = -4.2677393
+STD = 4.5689974
+def preprocess(x: torch.Tensor):
+    x = x - x.mean()
+    melspec = kaldi.fbank(x.unsqueeze(0), htk_compat=True, window_type="hanning", num_mel_bins=128)
+    if melspec.shape[0] < 1024:
+        melspec = F.pad(melspec, (0, 0, 0, 1024 - melspec.shape[0]))
+    else:
+        melspec = melspec[:1024]
+    melspec = (melspec - MEAN) / (STD * 2)
+    return melspec
+def predict_class(x, sr, start, end):
+    x = torch.from_numpy(x) / (1 << 15)
+    if x.ndim > 1:
+        x = x.mean(-1)
+    assert x.ndim == 1
+    x = resample(x[int(start * sr) : int(end * sr)], sr, SAMPLING_RATE)
+    x = preprocess(x)
+    with torch.inference_mode():
+        logits = MODEL(x.view(1, 1, 1024, 128)).squeeze(0)
+    topk_probs, topk_classes = logits.sigmoid().topk(10)
+    preds = [[AUDIOSET_LABELS[cls], prob.item() * 100] for cls, prob in zip(topk_classes, topk_probs)]
+    return preds