Spaces:
Sleeping
Sleeping
fix stereo audio
Browse files- app.py +9 -13
- packages.txt +0 -1
app.py
CHANGED
@@ -1,14 +1,12 @@
|
|
1 |
import json
|
2 |
-
import shlex
|
3 |
-
import subprocess
|
4 |
|
5 |
import gradio as gr
|
6 |
-
import numpy as np
|
7 |
import requests
|
8 |
import timm
|
9 |
import torch
|
10 |
import torch.nn.functional as F
|
11 |
from torchaudio.compliance import kaldi
|
|
|
12 |
|
13 |
TAG = "gaunernst/vit_base_patch16_1024_128.audiomae_as2m_ft_as20k"
|
14 |
MODEL = timm.create_model(f"hf_hub:{TAG}", pretrained=True).eval()
|
@@ -21,12 +19,6 @@ MEAN = -4.2677393
|
|
21 |
STD = 4.5689974
|
22 |
|
23 |
|
24 |
-
def resample(x: np.ndarray, sr: int):
|
25 |
-
cmd = f"ffmpeg -ar {sr} -f s16le -i - -ar {SAMPLING_RATE} -f f32le -"
|
26 |
-
proc = subprocess.run(shlex.split(cmd), capture_output=True, input=x.tobytes())
|
27 |
-
return np.frombuffer(proc.stdout, dtype=np.float32)
|
28 |
-
|
29 |
-
|
30 |
def preprocess(x: torch.Tensor):
|
31 |
x = x - x.mean()
|
32 |
melspec = kaldi.fbank(x.unsqueeze(0), htk_compat=True, window_type="hanning", num_mel_bins=128)
|
@@ -35,7 +27,7 @@ def preprocess(x: torch.Tensor):
|
|
35 |
else:
|
36 |
melspec = melspec[:1024]
|
37 |
melspec = (melspec - MEAN) / (STD * 2)
|
38 |
-
return melspec.view(1,
|
39 |
|
40 |
|
41 |
def predict(audio, start):
|
@@ -43,11 +35,15 @@ def predict(audio, start):
|
|
43 |
if x.shape[0] < start * sr:
|
44 |
raise gr.Error(f"`start` ({start}) must be smaller than audio duration ({x.shape[0] / sr:.0f}s)")
|
45 |
|
46 |
-
x =
|
47 |
-
x
|
|
|
|
|
|
|
|
|
48 |
|
49 |
with torch.inference_mode():
|
50 |
-
logits = MODEL(
|
51 |
|
52 |
topk_probs, topk_classes = logits.sigmoid().topk(10)
|
53 |
return [[AUDIOSET_LABELS[cls], prob.item() * 100] for cls, prob in zip(topk_classes, topk_probs)]
|
|
|
1 |
import json
|
|
|
|
|
2 |
|
3 |
import gradio as gr
|
|
|
4 |
import requests
|
5 |
import timm
|
6 |
import torch
|
7 |
import torch.nn.functional as F
|
8 |
from torchaudio.compliance import kaldi
|
9 |
+
from torchaudio.functional import resample
|
10 |
|
11 |
TAG = "gaunernst/vit_base_patch16_1024_128.audiomae_as2m_ft_as20k"
|
12 |
MODEL = timm.create_model(f"hf_hub:{TAG}", pretrained=True).eval()
|
|
|
19 |
STD = 4.5689974
|
20 |
|
21 |
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
def preprocess(x: torch.Tensor):
|
23 |
x = x - x.mean()
|
24 |
melspec = kaldi.fbank(x.unsqueeze(0), htk_compat=True, window_type="hanning", num_mel_bins=128)
|
|
|
27 |
else:
|
28 |
melspec = melspec[:1024]
|
29 |
melspec = (melspec - MEAN) / (STD * 2)
|
30 |
+
return melspec.view(1, 1024, 128)
|
31 |
|
32 |
|
33 |
def predict(audio, start):
|
|
|
35 |
if x.shape[0] < start * sr:
|
36 |
raise gr.Error(f"`start` ({start}) must be smaller than audio duration ({x.shape[0] / sr:.0f}s)")
|
37 |
|
38 |
+
x = torch.from_numpy(x) / (1 << 15)
|
39 |
+
if x.ndim > 1:
|
40 |
+
x = x.mean(-1)
|
41 |
+
assert x.ndim == 1
|
42 |
+
x = resample(x[int(start * sr) :], sr, SAMPLING_RATE)
|
43 |
+
x = preprocess(x)
|
44 |
|
45 |
with torch.inference_mode():
|
46 |
+
logits = MODEL(x.unsqueeze(0)).squeeze(0)
|
47 |
|
48 |
topk_probs, topk_classes = logits.sigmoid().topk(10)
|
49 |
return [[AUDIOSET_LABELS[cls], prob.item() * 100] for cls, prob in zip(topk_classes, topk_probs)]
|
packages.txt
DELETED
@@ -1 +0,0 @@
|
|
1 |
-
ffmpeg
|
|
|
|