import gradio
import torchaudio
from fastai.vision.all import *
from fastai.learner import load_learner
from torchvision.utils import save_image
from huggingface_hub import hf_hub_download


# Download the exported fastai learner from the Hugging Face Hub and load it
model = load_learner(
    hf_hub_download("kurianbenoy/music_genre_classification_baseline", "model.pkl")
)

EXAMPLES_PATH = Path("./examples")
# Genre labels the model was trained on, taken from the DataLoaders vocab
labels = model.dls.vocab

# Long-form write-up rendered below the demo
with open("article.md") as f:
    article = f.read()

interface_options = {
    "title": "Music Genre Classification",
    "description": "A simple baseline model for classifying music genres with fast.ai on [Kaggle competition data](https://www.kaggle.com/competitions/kaggle-pog-series-s01e02/data)",
    "article": article,
    "interpretation": "default",
    "layout": "horizontal",
    # Audio clips from the validation set, stored under ./examples
    "examples": [
        str(EXAMPLES_PATH / f)
        for f in ["000003.ogg", "000032.ogg", "000038.ogg", "000050.ogg", "000103.ogg"]
    ],
    "allow_flagging": "never",
}

# Spectrogram code from Dien Hoa Truong's inference notebook:
# https://www.kaggle.com/code/dienhoa/inference-submission-music-genre
N_FFT = 2048  # FFT window size
HOP_LEN = 1024  # samples between successive STFT frames


def create_spectrogram(filename):
    audio, sr = torchaudio.load(filename)
    # Compute a mel spectrogram per channel, then average the channels to mono
    specgram = torchaudio.transforms.MelSpectrogram(
        sample_rate=sr,
        n_fft=N_FFT,
        win_length=N_FFT,
        hop_length=HOP_LEN,
        center=True,
        pad_mode="reflect",
        power=2.0,
        norm="slaney",
        onesided=True,
        n_mels=224,
        mel_scale="htk",
    )(audio).mean(axis=0)
    # Convert power to decibels, then min-max normalize to [0, 1] for imaging
    specgram = torchaudio.transforms.AmplitudeToDB()(specgram)
    specgram = specgram - specgram.min()
    specgram = specgram / specgram.max()

    return specgram
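

# Sanity check (illustrative numbers, not from the original source): a 30 s
# clip at 44.1 kHz gives roughly 44100 * 30 / HOP_LEN ≈ 1292 time frames, so
# with n_mels=224 the tensor is about 224 x 1292 before being saved as an image.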


def create_image(filename):
    # Render the normalized spectrogram to a PNG the image model can consume
    specgram = create_spectrogram(filename)
    dest = Path("temp.png")
    save_image(specgram, dest)
    return dest


# Code from: https://huggingface.co/spaces/suvash/food-101-resnet50
def predict(img):
    img = PILImage.create(img)
    _pred, _pred_w_idx, probs = model.predict(img)
    # Gradio can't serialize tensors, so convert each probability to a plain float
    labels_probs = {labels[i]: float(probs[i]) for i, _ in enumerate(labels)}
    return labels_probs
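

# Example output (illustrative values only):
# predict("temp.png") -> {"rock": 0.61, "electronic": 0.17, "jazz": 0.05, ...}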


def end2endpipeline(filename):
    # Full pipeline: audio file -> spectrogram PNG on disk -> genre probabilities
    return predict(create_image(filename))


# Note: this targets the pre-4.0 Gradio API (gradio.inputs / gradio.outputs);
# newer releases expose these components as gradio.Audio and gradio.Label.
demo = gradio.Interface(
    fn=end2endpipeline,
    inputs=gradio.inputs.Audio(source="upload", type="filepath"),
    outputs=gradio.outputs.Label(num_top_classes=5),
    **interface_options,
)

launch_options = {
    "enable_queue": True,
    "share": False,
    # thanks to Alex for pointing out the option to cache examples
    "cache_examples": True,
}

demo.launch(**launch_options)
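
# To try the demo locally (assuming this file is saved as app.py and the
# dependencies imported above are installed): run `python app.py` and open
# the local URL that Gradio prints to the console.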