aps committed on
Commit 4848335
1 Parent(s): 9463f01

Commit efficientat

.gitattributes CHANGED
@@ -31,4 +31,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.xz filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
+ *.wav filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ efficientat/resources/metro_station-paris.wav filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1 @@
+ __pycache__
app.py CHANGED
@@ -1,4 +1,53 @@
  import gradio as gr
+ import torch
+ import numpy as np
+ import librosa
+
+ from efficientat.models.MobileNetV3 import get_model as get_mobilenet, get_ensemble_model
+ from efficientat.models.preprocess import AugmentMelSTFT
+ from efficientat.helpers.utils import NAME_TO_WIDTH, labels
+
+ from torch import autocast
+ from contextlib import nullcontext
+
+ MODEL_NAME = "mn40_as"
+
+ device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+ model = get_mobilenet(width_mult=NAME_TO_WIDTH(MODEL_NAME), pretrained_name=MODEL_NAME)
+ model.to(device)
+ model.eval()
+
+
+ def audio_tag(
+         audio_path,
+         sample_rate=32000,
+         window_size=800,
+         hop_size=320,
+         n_mels=128,
+         cuda=True,
+ ):
+
+     (waveform, _) = librosa.core.load(audio_path, sr=sample_rate, mono=True)
+     mel = AugmentMelSTFT(n_mels=n_mels, sr=sample_rate, win_length=window_size, hopsize=hop_size)
+     mel.to(device)
+     mel.eval()
+     waveform = torch.from_numpy(waveform[None, :]).to(device)
+
+     # our models are trained in half precision mode (torch.float16)
+     # run on cuda with torch.float16 to get the best performance
+     # running on cpu with torch.float32 gives similar performance, using torch.bfloat16 is worse
+     with torch.no_grad(), autocast(device_type=device.type) if cuda and torch.cuda.is_available() else nullcontext():
+         spec = mel(waveform)
+         preds, features = model(spec.unsqueeze(0))
+     preds = torch.sigmoid(preds.float()).squeeze().cpu().numpy()
+
+     sorted_indexes = np.argsort(preds)[::-1]
+     output = {}
+     # collect the top-10 audio tagging labels
+     for k in range(10):
+         output[sorted_indexes[k]] = labels[sorted_indexes[k]]
+
+     return "\n".join(output.values())
 
  def formatted_message(audio_length, audio_class, userText):
      prefix = '''You are going to act as a magical tool that allows for humans to communicate with non-human entities like
@@ -33,7 +82,7 @@ def call_api(message):
 
 
  demo = gr.Interface(
-     call_api,
-     gr.Audio(source="microphone"),
-     gr.Audio(),
+     audio_tag,
+     gr.Audio(source="upload", type="filepath", label="Your audio"),
+     gr.Textbox(),
  ).launch(debug=True)
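
For reference, a minimal sketch of exercising the new tagging path outside the Gradio UI (hypothetical; since app.py calls .launch() at import time, this assumes that line is disabled or audio_tag is copied elsewhere):

    from app import audio_tag

    # example clip added under efficientat/resources in this commit
    print(audio_tag("efficientat/resources/metro_station-paris.wav"))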
efficientat/helpers/flop_count.py ADDED
@@ -0,0 +1,162 @@
+ import torch
+ import torch.nn as nn
+
+
+ # adapted from PANNs (https://github.com/qiuqiangkong/audioset_tagging_cnn)
+
+ def count_macs(model, spec_size):
+     list_conv2d = []
+
+     def conv2d_hook(self, input, output):
+         batch_size, input_channels, input_height, input_width = input[0].size()
+         assert batch_size == 1
+         output_channels, output_height, output_width = output[0].size()
+
+         kernel_ops = self.kernel_size[0] * self.kernel_size[1] * (self.in_channels / self.groups)
+         bias_ops = 1 if self.bias is not None else 0
+
+         params = output_channels * (kernel_ops + bias_ops)
+         # overall macs count is:
+         # kernel**2 * in_channels/groups * out_channels * out_width * out_height
+         macs = batch_size * params * output_height * output_width
+
+         list_conv2d.append(macs)
+
+     list_linear = []
+
+     def linear_hook(self, input, output):
+         batch_size = input[0].size(0) if input[0].dim() == 2 else 1
+         assert batch_size == 1
+         weight_ops = self.weight.nelement()
+         bias_ops = self.bias.nelement()
+
+         # overall macs count is equal to the number of parameters in layer
+         macs = batch_size * (weight_ops + bias_ops)
+         list_linear.append(macs)
+
+     def foo(net):
+         if net.__class__.__name__ == 'Conv2dStaticSamePadding':
+             net.register_forward_hook(conv2d_hook)
+         childrens = list(net.children())
+         if not childrens:
+             if isinstance(net, nn.Conv2d):
+                 net.register_forward_hook(conv2d_hook)
+             elif isinstance(net, nn.Linear):
+                 net.register_forward_hook(linear_hook)
+             else:
+                 print('Warning: flop of module {} is not counted!'.format(net))
+             return
+         for c in childrens:
+             foo(c)
+
+     # Register hook
+     foo(model)
+
+     device = next(model.parameters()).device
+     input = torch.rand(spec_size).to(device)
+     with torch.no_grad():
+         model(input)
+
+     total_macs = sum(list_conv2d) + sum(list_linear)
+
+     print("*************Computational Complexity (multiply-adds) **************")
+     print("Number of Convolutional Layers: ", len(list_conv2d))
+     print("Number of Linear Layers: ", len(list_linear))
+     print("Relative Share of Convolutional Layers: {:.2f}".format((sum(list_conv2d) / total_macs)))
+     print("Relative Share of Linear Layers: {:.2f}".format(sum(list_linear) / total_macs))
+     print("Total MACs (multiply-accumulate operations in Billions): {:.2f}".format(total_macs / 10 ** 9))
+     print("********************************************************************")
+     return total_macs
+
+
+ def count_macs_transformer(model, spec_size):
+     """Count macs. Code modified from others' implementation.
+     """
+     list_conv2d = []
+
+     def conv2d_hook(self, input, output):
+         batch_size, input_channels, input_height, input_width = input[0].size()
+         assert batch_size == 1
+         output_channels, output_height, output_width = output[0].size()
+
+         kernel_ops = self.kernel_size[0] * self.kernel_size[1] * (self.in_channels / self.groups)
+         bias_ops = 1 if self.bias is not None else 0
+
+         params = output_channels * (kernel_ops + bias_ops)
+         # overall macs count is:
+         # kernel**2 * in_channels/groups * out_channels * out_width * out_height
+         macs = batch_size * params * output_height * output_width
+
+         list_conv2d.append(macs)
+
+     list_linear = []
+
+     def linear_hook(self, input, output):
+         batch_size = input[0].size(0) if input[0].dim() >= 2 else 1
+         assert batch_size == 1
+         if input[0].dim() == 3:
+             # (batch size, sequence length, embeddings size)
+             batch_size, seq_len, embed_size = input[0].size()
+
+             weight_ops = self.weight.nelement()
+             bias_ops = self.bias.nelement() if self.bias is not None else 0
+             # linear layer applied position-wise, multiply with sequence length
+             macs = batch_size * (weight_ops + bias_ops) * seq_len
+         else:
+             # classification head
+             # (batch size, embeddings size)
+             batch_size, embed_size = input[0].size()
+             weight_ops = self.weight.nelement()
+             bias_ops = self.bias.nelement() if self.bias is not None else 0
+             # overall macs count is equal to the number of parameters in layer
+             macs = batch_size * (weight_ops + bias_ops)
+         list_linear.append(macs)
+
+     list_att = []
+
+     def attention_hook(self, input, output):
+         # here we only calculate the attention macs; linear layers are processed in linear_hook
+         batch_size, seq_len, embed_size = input[0].size()
+
+         # 2 times embed_size * seq_len**2
+         # - computing the attention matrix: embed_size * seq_len**2
+         # - multiply attention matrix with value matrix: embed_size * seq_len**2
+         macs = batch_size * embed_size * seq_len * seq_len * 2
+         list_att.append(macs)
+
+     def foo(net):
+         childrens = list(net.children())
+         if net.__class__.__name__ == "MultiHeadAttention":
+             net.register_forward_hook(attention_hook)
+         if not childrens:
+             if isinstance(net, nn.Conv2d):
+                 net.register_forward_hook(conv2d_hook)
+             elif isinstance(net, nn.Linear):
+                 net.register_forward_hook(linear_hook)
+             else:
+                 print('Warning: flop of module {} is not counted!'.format(net))
+             return
+         for c in childrens:
+             foo(c)
+
+     # Register hook
+     foo(model)
+
+     device = next(model.parameters()).device
+     input = torch.rand(spec_size).to(device)
+
+     with torch.no_grad():
+         model(input)
+
+     total_macs = sum(list_conv2d) + sum(list_linear) + sum(list_att)
+
+     print("*************Computational Complexity (multiply-adds) **************")
+     print("Number of Convolutional Layers: ", len(list_conv2d))
+     print("Number of Linear Layers: ", len(list_linear))
+     print("Number of Attention Layers: ", len(list_att))
+     print("Relative Share of Convolutional Layers: {:.2f}".format((sum(list_conv2d) / total_macs)))
+     print("Relative Share of Linear Layers: {:.2f}".format(sum(list_linear) / total_macs))
+     print("Relative Share of Attention Layers: {:.2f}".format(sum(list_att) / total_macs))
+     print("Total MACs (multiply-accumulate operations in Billions): {:.2f}".format(total_macs / 10 ** 9))
+     print("********************************************************************")
+     return total_macs
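
A hedged usage sketch (not part of the commit): counting MACs for a randomly initialized width-1.0 model on a single 10-second spectrogram, assuming the default 128 mel bands and 1000 frames:

    from efficientat.models.MobileNetV3 import get_model
    from efficientat.helpers.flop_count import count_macs

    model = get_model(width_mult=1.0)
    macs = count_macs(model, (1, 1, 128, 1000))  # (batch, channels, mels, frames); the hooks assert batch == 1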
efficientat/helpers/init.py ADDED
@@ -0,0 +1,33 @@
+ import torch
+ import numpy as np
+ import random
+
+
+ def worker_init_fn(wid):
+     seed_sequence = np.random.SeedSequence(
+         [torch.initial_seed(), wid]
+     )
+
+     to_seed = spawn_get(seed_sequence, 2, dtype=int)
+     torch.random.manual_seed(to_seed)
+
+     np_seed = spawn_get(seed_sequence, 2, dtype=np.ndarray)
+     np.random.seed(np_seed)
+
+     py_seed = spawn_get(seed_sequence, 2, dtype=int)
+     random.seed(py_seed)
+
+
+ def spawn_get(seedseq, n_entropy, dtype):
+     child = seedseq.spawn(1)[0]
+     state = child.generate_state(n_entropy, dtype=np.uint32)
+
+     if dtype == np.ndarray:
+         return state
+     elif dtype == int:
+         state_as_int = 0
+         for shift, s in enumerate(state):
+             state_as_int = state_as_int + int((2 ** (32 * shift) * s))
+         return state_as_int
+     else:
+         raise ValueError(f'not a valid dtype "{dtype}"')
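
worker_init_fn is not referenced elsewhere in this commit; a sketch of the usual wiring, assuming dataset is any torch Dataset:

    from torch.utils.data import DataLoader
    from efficientat.helpers.init import worker_init_fn

    # each worker derives torch/numpy/random seeds from torch.initial_seed() and its worker id,
    # so augmentations differ across workers while staying reproducible for a fixed global seed
    loader = DataLoader(dataset, batch_size=32, num_workers=4, worker_init_fn=worker_init_fn)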
efficientat/helpers/utils.py ADDED
@@ -0,0 +1,104 @@
+ def NAME_TO_WIDTH(name):
+     map = {
+         'mn04': 0.4,
+         'mn05': 0.5,
+         'mn10': 1.0,
+         'mn20': 2.0,
+         'mn30': 3.0,
+         'mn40': 4.0
+     }
+     try:
+         w = map[name[:4]]
+     except KeyError:
+         w = 1.0
+
+     return w
+
+
+ import csv
+
+ # Load labels
+ with open('efficientat/metadata/class_labels_indices.csv', 'r') as f:
+     reader = csv.reader(f, delimiter=',')
+     lines = list(reader)
+
+ labels = []
+ ids = []    # Each label has a unique id such as "/m/068hy"
+ for i1 in range(1, len(lines)):
+     id = lines[i1][1]
+     label = lines[i1][2]
+     ids.append(id)
+     labels.append(label)
+
+ classes_num = len(labels)
+
+
+ import numpy as np
+
+
+ def exp_warmup_linear_down(warmup, rampdown_length, start_rampdown, last_value):
+     rampup = exp_rampup(warmup)
+     rampdown = linear_rampdown(rampdown_length, start_rampdown, last_value)
+
+     def wrapper(epoch):
+         return rampup(epoch) * rampdown(epoch)
+     return wrapper
+
+
+ def exp_rampup(rampup_length):
+     """Exponential rampup from https://arxiv.org/abs/1610.02242"""
+     def wrapper(epoch):
+         if epoch < rampup_length:
+             epoch = np.clip(epoch, 0.5, rampup_length)
+             phase = 1.0 - epoch / rampup_length
+             return float(np.exp(-5.0 * phase * phase))
+         else:
+             return 1.0
+     return wrapper
+
+
+ def linear_rampdown(rampdown_length, start=0, last_value=0):
+     def wrapper(epoch):
+         if epoch <= start:
+             return 1.
+         elif epoch - start < rampdown_length:
+             return last_value + (1. - last_value) * (rampdown_length - epoch + start) / rampdown_length
+         else:
+             return last_value
+     return wrapper
+
+
+ import torch
+
+
+ def mixup(size, alpha):
+     rn_indices = torch.randperm(size)
+     lambd = np.random.beta(alpha, alpha, size).astype(np.float32)
+     lambd = np.concatenate([lambd[:, None], 1 - lambd[:, None]], 1).max(1)
+     lam = torch.FloatTensor(lambd)
+     return rn_indices, lam
+
+
+ from torch.distributions.beta import Beta
+
+
+ def mixstyle(x, p=0.4, alpha=0.4, eps=1e-6, mix_labels=False):
+     if np.random.rand() > p:
+         return x
+     batch_size = x.size(0)
+
+     # changed from dim=[2,3] to dim=[1,3] - from channel-wise statistics to frequency-wise statistics
+     f_mu = x.mean(dim=[1, 3], keepdim=True)
+     f_var = x.var(dim=[1, 3], keepdim=True)
+
+     f_sig = (f_var + eps).sqrt()  # compute instance standard deviation
+     f_mu, f_sig = f_mu.detach(), f_sig.detach()  # block gradients
+     x_normed = (x - f_mu) / f_sig  # normalize input
+     lmda = Beta(alpha, alpha).sample((batch_size, 1, 1, 1)).to(x.device)  # sample instance-wise convex weights
+     perm = torch.randperm(batch_size).to(x.device)  # generate shuffling indices
+     f_mu_perm, f_sig_perm = f_mu[perm], f_sig[perm]  # shuffling
+     mu_mix = f_mu * lmda + f_mu_perm * (1 - lmda)  # generate mixed mean
+     sig_mix = f_sig * lmda + f_sig_perm * (1 - lmda)  # generate mixed standard deviation
+     x = x_normed * sig_mix + mu_mix  # denormalize input using the mixed statistics
+     if mix_labels:
+         return x, perm, lmda
+     return x
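
mixup only returns permutation indices and convex weights; applying them is left to the caller. A sketch of the usual pattern (x is a spectrogram batch, y a multi-hot label tensor, both placeholders, and alpha=0.3 is an arbitrary choice):

    rn_indices, lam = mixup(x.size(0), alpha=0.3)
    lam = lam.to(x.device)
    x = x * lam.reshape(-1, 1, 1, 1) + x[rn_indices] * (1. - lam.reshape(-1, 1, 1, 1))
    y = y * lam.reshape(-1, 1) + y[rn_indices] * (1. - lam.reshape(-1, 1))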
efficientat/metadata/class_labels_indices.csv ADDED
@@ -0,0 +1,528 @@
+ index,mid,display_name
+ 0,/m/09x0r,"Speech"
+ 1,/m/05zppz,"Male speech, man speaking"
+ 2,/m/02zsn,"Female speech, woman speaking"
+ 3,/m/0ytgt,"Child speech, kid speaking"
+ 4,/m/01h8n0,"Conversation"
+ 5,/m/02qldy,"Narration, monologue"
+ 6,/m/0261r1,"Babbling"
+ 7,/m/0brhx,"Speech synthesizer"
+ 8,/m/07p6fty,"Shout"
+ 9,/m/07q4ntr,"Bellow"
+ 10,/m/07rwj3x,"Whoop"
+ 11,/m/07sr1lc,"Yell"
+ 12,/m/04gy_2,"Battle cry"
+ 13,/t/dd00135,"Children shouting"
+ 14,/m/03qc9zr,"Screaming"
+ 15,/m/02rtxlg,"Whispering"
+ 16,/m/01j3sz,"Laughter"
+ 17,/t/dd00001,"Baby laughter"
+ 18,/m/07r660_,"Giggle"
+ 19,/m/07s04w4,"Snicker"
+ 20,/m/07sq110,"Belly laugh"
+ 21,/m/07rgt08,"Chuckle, chortle"
+ 22,/m/0463cq4,"Crying, sobbing"
+ 23,/t/dd00002,"Baby cry, infant cry"
+ 24,/m/07qz6j3,"Whimper"
+ 25,/m/07qw_06,"Wail, moan"
+ 26,/m/07plz5l,"Sigh"
+ 27,/m/015lz1,"Singing"
+ 28,/m/0l14jd,"Choir"
+ 29,/m/01swy6,"Yodeling"
+ 30,/m/02bk07,"Chant"
+ 31,/m/01c194,"Mantra"
+ 32,/t/dd00003,"Male singing"
+ 33,/t/dd00004,"Female singing"
+ 34,/t/dd00005,"Child singing"
+ 35,/t/dd00006,"Synthetic singing"
+ 36,/m/06bxc,"Rapping"
+ 37,/m/02fxyj,"Humming"
+ 38,/m/07s2xch,"Groan"
+ 39,/m/07r4k75,"Grunt"
+ 40,/m/01w250,"Whistling"
+ 41,/m/0lyf6,"Breathing"
+ 42,/m/07mzm6,"Wheeze"
+ 43,/m/01d3sd,"Snoring"
+ 44,/m/07s0dtb,"Gasp"
+ 45,/m/07pyy8b,"Pant"
+ 46,/m/07q0yl5,"Snort"
+ 47,/m/01b_21,"Cough"
+ 48,/m/0dl9sf8,"Throat clearing"
+ 49,/m/01hsr_,"Sneeze"
+ 50,/m/07ppn3j,"Sniff"
+ 51,/m/06h7j,"Run"
+ 52,/m/07qv_x_,"Shuffle"
+ 53,/m/07pbtc8,"Walk, footsteps"
+ 54,/m/03cczk,"Chewing, mastication"
+ 55,/m/07pdhp0,"Biting"
+ 56,/m/0939n_,"Gargling"
+ 57,/m/01g90h,"Stomach rumble"
+ 58,/m/03q5_w,"Burping, eructation"
+ 59,/m/02p3nc,"Hiccup"
+ 60,/m/02_nn,"Fart"
+ 61,/m/0k65p,"Hands"
+ 62,/m/025_jnm,"Finger snapping"
+ 63,/m/0l15bq,"Clapping"
+ 64,/m/01jg02,"Heart sounds, heartbeat"
+ 65,/m/01jg1z,"Heart murmur"
+ 66,/m/053hz1,"Cheering"
+ 67,/m/028ght,"Applause"
+ 68,/m/07rkbfh,"Chatter"
+ 69,/m/03qtwd,"Crowd"
+ 70,/m/07qfr4h,"Hubbub, speech noise, speech babble"
+ 71,/t/dd00013,"Children playing"
+ 72,/m/0jbk,"Animal"
+ 73,/m/068hy,"Domestic animals, pets"
+ 74,/m/0bt9lr,"Dog"
+ 75,/m/05tny_,"Bark"
+ 76,/m/07r_k2n,"Yip"
+ 77,/m/07qf0zm,"Howl"
+ 78,/m/07rc7d9,"Bow-wow"
+ 79,/m/0ghcn6,"Growling"
+ 80,/t/dd00136,"Whimper (dog)"
+ 81,/m/01yrx,"Cat"
+ 82,/m/02yds9,"Purr"
+ 83,/m/07qrkrw,"Meow"
+ 84,/m/07rjwbb,"Hiss"
+ 85,/m/07r81j2,"Caterwaul"
+ 86,/m/0ch8v,"Livestock, farm animals, working animals"
+ 87,/m/03k3r,"Horse"
+ 88,/m/07rv9rh,"Clip-clop"
+ 89,/m/07q5rw0,"Neigh, whinny"
+ 90,/m/01xq0k1,"Cattle, bovinae"
+ 91,/m/07rpkh9,"Moo"
+ 92,/m/0239kh,"Cowbell"
+ 93,/m/068zj,"Pig"
+ 94,/t/dd00018,"Oink"
+ 95,/m/03fwl,"Goat"
+ 96,/m/07q0h5t,"Bleat"
+ 97,/m/07bgp,"Sheep"
+ 98,/m/025rv6n,"Fowl"
+ 99,/m/09b5t,"Chicken, rooster"
+ 100,/m/07st89h,"Cluck"
+ 101,/m/07qn5dc,"Crowing, cock-a-doodle-doo"
+ 102,/m/01rd7k,"Turkey"
+ 103,/m/07svc2k,"Gobble"
+ 104,/m/09ddx,"Duck"
+ 105,/m/07qdb04,"Quack"
+ 106,/m/0dbvp,"Goose"
+ 107,/m/07qwf61,"Honk"
+ 108,/m/01280g,"Wild animals"
+ 109,/m/0cdnk,"Roaring cats (lions, tigers)"
+ 110,/m/04cvmfc,"Roar"
+ 111,/m/015p6,"Bird"
+ 112,/m/020bb7,"Bird vocalization, bird call, bird song"
+ 113,/m/07pggtn,"Chirp, tweet"
+ 114,/m/07sx8x_,"Squawk"
+ 115,/m/0h0rv,"Pigeon, dove"
+ 116,/m/07r_25d,"Coo"
+ 117,/m/04s8yn,"Crow"
+ 118,/m/07r5c2p,"Caw"
+ 119,/m/09d5_,"Owl"
+ 120,/m/07r_80w,"Hoot"
+ 121,/m/05_wcq,"Bird flight, flapping wings"
+ 122,/m/01z5f,"Canidae, dogs, wolves"
+ 123,/m/06hps,"Rodents, rats, mice"
+ 124,/m/04rmv,"Mouse"
+ 125,/m/07r4gkf,"Patter"
+ 126,/m/03vt0,"Insect"
+ 127,/m/09xqv,"Cricket"
+ 128,/m/09f96,"Mosquito"
+ 129,/m/0h2mp,"Fly, housefly"
+ 130,/m/07pjwq1,"Buzz"
+ 131,/m/01h3n,"Bee, wasp, etc."
+ 132,/m/09ld4,"Frog"
+ 133,/m/07st88b,"Croak"
+ 134,/m/078jl,"Snake"
+ 135,/m/07qn4z3,"Rattle"
+ 136,/m/032n05,"Whale vocalization"
+ 137,/m/04rlf,"Music"
+ 138,/m/04szw,"Musical instrument"
+ 139,/m/0fx80y,"Plucked string instrument"
+ 140,/m/0342h,"Guitar"
+ 141,/m/02sgy,"Electric guitar"
+ 142,/m/018vs,"Bass guitar"
+ 143,/m/042v_gx,"Acoustic guitar"
+ 144,/m/06w87,"Steel guitar, slide guitar"
+ 145,/m/01glhc,"Tapping (guitar technique)"
+ 146,/m/07s0s5r,"Strum"
+ 147,/m/018j2,"Banjo"
+ 148,/m/0jtg0,"Sitar"
+ 149,/m/04rzd,"Mandolin"
+ 150,/m/01bns_,"Zither"
+ 151,/m/07xzm,"Ukulele"
+ 152,/m/05148p4,"Keyboard (musical)"
+ 153,/m/05r5c,"Piano"
+ 154,/m/01s0ps,"Electric piano"
+ 155,/m/013y1f,"Organ"
+ 156,/m/03xq_f,"Electronic organ"
+ 157,/m/03gvt,"Hammond organ"
+ 158,/m/0l14qv,"Synthesizer"
+ 159,/m/01v1d8,"Sampler"
+ 160,/m/03q5t,"Harpsichord"
+ 161,/m/0l14md,"Percussion"
+ 162,/m/02hnl,"Drum kit"
+ 163,/m/0cfdd,"Drum machine"
+ 164,/m/026t6,"Drum"
+ 165,/m/06rvn,"Snare drum"
+ 166,/m/03t3fj,"Rimshot"
+ 167,/m/02k_mr,"Drum roll"
+ 168,/m/0bm02,"Bass drum"
+ 169,/m/011k_j,"Timpani"
+ 170,/m/01p970,"Tabla"
+ 171,/m/01qbl,"Cymbal"
+ 172,/m/03qtq,"Hi-hat"
+ 173,/m/01sm1g,"Wood block"
+ 174,/m/07brj,"Tambourine"
+ 175,/m/05r5wn,"Rattle (instrument)"
+ 176,/m/0xzly,"Maraca"
+ 177,/m/0mbct,"Gong"
+ 178,/m/016622,"Tubular bells"
+ 179,/m/0j45pbj,"Mallet percussion"
+ 180,/m/0dwsp,"Marimba, xylophone"
+ 181,/m/0dwtp,"Glockenspiel"
+ 182,/m/0dwt5,"Vibraphone"
+ 183,/m/0l156b,"Steelpan"
+ 184,/m/05pd6,"Orchestra"
+ 185,/m/01kcd,"Brass instrument"
+ 186,/m/0319l,"French horn"
+ 187,/m/07gql,"Trumpet"
+ 188,/m/07c6l,"Trombone"
+ 189,/m/0l14_3,"Bowed string instrument"
+ 190,/m/02qmj0d,"String section"
+ 191,/m/07y_7,"Violin, fiddle"
+ 192,/m/0d8_n,"Pizzicato"
+ 193,/m/01xqw,"Cello"
+ 194,/m/02fsn,"Double bass"
+ 195,/m/085jw,"Wind instrument, woodwind instrument"
+ 196,/m/0l14j_,"Flute"
+ 197,/m/06ncr,"Saxophone"
+ 198,/m/01wy6,"Clarinet"
+ 199,/m/03m5k,"Harp"
+ 200,/m/0395lw,"Bell"
+ 201,/m/03w41f,"Church bell"
+ 202,/m/027m70_,"Jingle bell"
+ 203,/m/0gy1t2s,"Bicycle bell"
+ 204,/m/07n_g,"Tuning fork"
+ 205,/m/0f8s22,"Chime"
+ 206,/m/026fgl,"Wind chime"
+ 207,/m/0150b9,"Change ringing (campanology)"
+ 208,/m/03qjg,"Harmonica"
+ 209,/m/0mkg,"Accordion"
+ 210,/m/0192l,"Bagpipes"
+ 211,/m/02bxd,"Didgeridoo"
+ 212,/m/0l14l2,"Shofar"
+ 213,/m/07kc_,"Theremin"
+ 214,/m/0l14t7,"Singing bowl"
+ 215,/m/01hgjl,"Scratching (performance technique)"
+ 216,/m/064t9,"Pop music"
+ 217,/m/0glt670,"Hip hop music"
+ 218,/m/02cz_7,"Beatboxing"
+ 219,/m/06by7,"Rock music"
+ 220,/m/03lty,"Heavy metal"
+ 221,/m/05r6t,"Punk rock"
+ 222,/m/0dls3,"Grunge"
+ 223,/m/0dl5d,"Progressive rock"
+ 224,/m/07sbbz2,"Rock and roll"
+ 225,/m/05w3f,"Psychedelic rock"
+ 226,/m/06j6l,"Rhythm and blues"
+ 227,/m/0gywn,"Soul music"
+ 228,/m/06cqb,"Reggae"
+ 229,/m/01lyv,"Country"
+ 230,/m/015y_n,"Swing music"
+ 231,/m/0gg8l,"Bluegrass"
+ 232,/m/02x8m,"Funk"
+ 233,/m/02w4v,"Folk music"
+ 234,/m/06j64v,"Middle Eastern music"
+ 235,/m/03_d0,"Jazz"
+ 236,/m/026z9,"Disco"
+ 237,/m/0ggq0m,"Classical music"
+ 238,/m/05lls,"Opera"
+ 239,/m/02lkt,"Electronic music"
+ 240,/m/03mb9,"House music"
+ 241,/m/07gxw,"Techno"
+ 242,/m/07s72n,"Dubstep"
+ 243,/m/0283d,"Drum and bass"
+ 244,/m/0m0jc,"Electronica"
+ 245,/m/08cyft,"Electronic dance music"
+ 246,/m/0fd3y,"Ambient music"
+ 247,/m/07lnk,"Trance music"
+ 248,/m/0g293,"Music of Latin America"
+ 249,/m/0ln16,"Salsa music"
+ 250,/m/0326g,"Flamenco"
+ 251,/m/0155w,"Blues"
+ 252,/m/05fw6t,"Music for children"
+ 253,/m/02v2lh,"New-age music"
+ 254,/m/0y4f8,"Vocal music"
+ 255,/m/0z9c,"A capella"
+ 256,/m/0164x2,"Music of Africa"
+ 257,/m/0145m,"Afrobeat"
+ 258,/m/02mscn,"Christian music"
+ 259,/m/016cjb,"Gospel music"
+ 260,/m/028sqc,"Music of Asia"
+ 261,/m/015vgc,"Carnatic music"
+ 262,/m/0dq0md,"Music of Bollywood"
+ 263,/m/06rqw,"Ska"
+ 264,/m/02p0sh1,"Traditional music"
+ 265,/m/05rwpb,"Independent music"
+ 266,/m/074ft,"Song"
+ 267,/m/025td0t,"Background music"
+ 268,/m/02cjck,"Theme music"
+ 269,/m/03r5q_,"Jingle (music)"
+ 270,/m/0l14gg,"Soundtrack music"
+ 271,/m/07pkxdp,"Lullaby"
+ 272,/m/01z7dr,"Video game music"
+ 273,/m/0140xf,"Christmas music"
+ 274,/m/0ggx5q,"Dance music"
+ 275,/m/04wptg,"Wedding music"
+ 276,/t/dd00031,"Happy music"
+ 277,/t/dd00032,"Funny music"
+ 278,/t/dd00033,"Sad music"
+ 279,/t/dd00034,"Tender music"
+ 280,/t/dd00035,"Exciting music"
+ 281,/t/dd00036,"Angry music"
+ 282,/t/dd00037,"Scary music"
+ 283,/m/03m9d0z,"Wind"
+ 284,/m/09t49,"Rustling leaves"
+ 285,/t/dd00092,"Wind noise (microphone)"
+ 286,/m/0jb2l,"Thunderstorm"
+ 287,/m/0ngt1,"Thunder"
+ 288,/m/0838f,"Water"
+ 289,/m/06mb1,"Rain"
+ 290,/m/07r10fb,"Raindrop"
+ 291,/t/dd00038,"Rain on surface"
+ 292,/m/0j6m2,"Stream"
+ 293,/m/0j2kx,"Waterfall"
+ 294,/m/05kq4,"Ocean"
+ 295,/m/034srq,"Waves, surf"
+ 296,/m/06wzb,"Steam"
+ 297,/m/07swgks,"Gurgling"
+ 298,/m/02_41,"Fire"
+ 299,/m/07pzfmf,"Crackle"
+ 300,/m/07yv9,"Vehicle"
+ 301,/m/019jd,"Boat, Water vehicle"
+ 302,/m/0hsrw,"Sailboat, sailing ship"
+ 303,/m/056ks2,"Rowboat, canoe, kayak"
+ 304,/m/02rlv9,"Motorboat, speedboat"
+ 305,/m/06q74,"Ship"
+ 306,/m/012f08,"Motor vehicle (road)"
+ 307,/m/0k4j,"Car"
+ 308,/m/0912c9,"Vehicle horn, car horn, honking"
+ 309,/m/07qv_d5,"Toot"
+ 310,/m/02mfyn,"Car alarm"
+ 311,/m/04gxbd,"Power windows, electric windows"
+ 312,/m/07rknqz,"Skidding"
+ 313,/m/0h9mv,"Tire squeal"
+ 314,/t/dd00134,"Car passing by"
+ 315,/m/0ltv,"Race car, auto racing"
+ 316,/m/07r04,"Truck"
+ 317,/m/0gvgw0,"Air brake"
+ 318,/m/05x_td,"Air horn, truck horn"
+ 319,/m/02rhddq,"Reversing beeps"
+ 320,/m/03cl9h,"Ice cream truck, ice cream van"
+ 321,/m/01bjv,"Bus"
+ 322,/m/03j1ly,"Emergency vehicle"
+ 323,/m/04qvtq,"Police car (siren)"
+ 324,/m/012n7d,"Ambulance (siren)"
+ 325,/m/012ndj,"Fire engine, fire truck (siren)"
+ 326,/m/04_sv,"Motorcycle"
+ 327,/m/0btp2,"Traffic noise, roadway noise"
+ 328,/m/06d_3,"Rail transport"
+ 329,/m/07jdr,"Train"
+ 330,/m/04zmvq,"Train whistle"
+ 331,/m/0284vy3,"Train horn"
+ 332,/m/01g50p,"Railroad car, train wagon"
+ 333,/t/dd00048,"Train wheels squealing"
+ 334,/m/0195fx,"Subway, metro, underground"
+ 335,/m/0k5j,"Aircraft"
+ 336,/m/014yck,"Aircraft engine"
+ 337,/m/04229,"Jet engine"
+ 338,/m/02l6bg,"Propeller, airscrew"
+ 339,/m/09ct_,"Helicopter"
+ 340,/m/0cmf2,"Fixed-wing aircraft, airplane"
+ 341,/m/0199g,"Bicycle"
+ 342,/m/06_fw,"Skateboard"
+ 343,/m/02mk9,"Engine"
+ 344,/t/dd00065,"Light engine (high frequency)"
+ 345,/m/08j51y,"Dental drill, dentist's drill"
+ 346,/m/01yg9g,"Lawn mower"
+ 347,/m/01j4z9,"Chainsaw"
+ 348,/t/dd00066,"Medium engine (mid frequency)"
+ 349,/t/dd00067,"Heavy engine (low frequency)"
+ 350,/m/01h82_,"Engine knocking"
+ 351,/t/dd00130,"Engine starting"
+ 352,/m/07pb8fc,"Idling"
+ 353,/m/07q2z82,"Accelerating, revving, vroom"
+ 354,/m/02dgv,"Door"
+ 355,/m/03wwcy,"Doorbell"
+ 356,/m/07r67yg,"Ding-dong"
+ 357,/m/02y_763,"Sliding door"
+ 358,/m/07rjzl8,"Slam"
+ 359,/m/07r4wb8,"Knock"
+ 360,/m/07qcpgn,"Tap"
+ 361,/m/07q6cd_,"Squeak"
+ 362,/m/0642b4,"Cupboard open or close"
+ 363,/m/0fqfqc,"Drawer open or close"
+ 364,/m/04brg2,"Dishes, pots, and pans"
+ 365,/m/023pjk,"Cutlery, silverware"
+ 366,/m/07pn_8q,"Chopping (food)"
+ 367,/m/0dxrf,"Frying (food)"
+ 368,/m/0fx9l,"Microwave oven"
+ 369,/m/02pjr4,"Blender"
+ 370,/m/02jz0l,"Water tap, faucet"
+ 371,/m/0130jx,"Sink (filling or washing)"
+ 372,/m/03dnzn,"Bathtub (filling or washing)"
+ 373,/m/03wvsk,"Hair dryer"
+ 374,/m/01jt3m,"Toilet flush"
+ 375,/m/012xff,"Toothbrush"
+ 376,/m/04fgwm,"Electric toothbrush"
+ 377,/m/0d31p,"Vacuum cleaner"
+ 378,/m/01s0vc,"Zipper (clothing)"
+ 379,/m/03v3yw,"Keys jangling"
+ 380,/m/0242l,"Coin (dropping)"
+ 381,/m/01lsmm,"Scissors"
+ 382,/m/02g901,"Electric shaver, electric razor"
+ 383,/m/05rj2,"Shuffling cards"
+ 384,/m/0316dw,"Typing"
+ 385,/m/0c2wf,"Typewriter"
+ 386,/m/01m2v,"Computer keyboard"
+ 387,/m/081rb,"Writing"
+ 388,/m/07pp_mv,"Alarm"
+ 389,/m/07cx4,"Telephone"
+ 390,/m/07pp8cl,"Telephone bell ringing"
+ 391,/m/01hnzm,"Ringtone"
+ 392,/m/02c8p,"Telephone dialing, DTMF"
+ 393,/m/015jpf,"Dial tone"
+ 394,/m/01z47d,"Busy signal"
+ 395,/m/046dlr,"Alarm clock"
+ 396,/m/03kmc9,"Siren"
+ 397,/m/0dgbq,"Civil defense siren"
+ 398,/m/030rvx,"Buzzer"
+ 399,/m/01y3hg,"Smoke detector, smoke alarm"
+ 400,/m/0c3f7m,"Fire alarm"
+ 401,/m/04fq5q,"Foghorn"
+ 402,/m/0l156k,"Whistle"
+ 403,/m/06hck5,"Steam whistle"
+ 404,/t/dd00077,"Mechanisms"
+ 405,/m/02bm9n,"Ratchet, pawl"
+ 406,/m/01x3z,"Clock"
+ 407,/m/07qjznt,"Tick"
+ 408,/m/07qjznl,"Tick-tock"
+ 409,/m/0l7xg,"Gears"
+ 410,/m/05zc1,"Pulleys"
+ 411,/m/0llzx,"Sewing machine"
+ 412,/m/02x984l,"Mechanical fan"
+ 413,/m/025wky1,"Air conditioning"
+ 414,/m/024dl,"Cash register"
+ 415,/m/01m4t,"Printer"
+ 416,/m/0dv5r,"Camera"
+ 417,/m/07bjf,"Single-lens reflex camera"
+ 418,/m/07k1x,"Tools"
+ 419,/m/03l9g,"Hammer"
+ 420,/m/03p19w,"Jackhammer"
+ 421,/m/01b82r,"Sawing"
+ 422,/m/02p01q,"Filing (rasp)"
+ 423,/m/023vsd,"Sanding"
+ 424,/m/0_ksk,"Power tool"
+ 425,/m/01d380,"Drill"
+ 426,/m/014zdl,"Explosion"
+ 427,/m/032s66,"Gunshot, gunfire"
+ 428,/m/04zjc,"Machine gun"
+ 429,/m/02z32qm,"Fusillade"
+ 430,/m/0_1c,"Artillery fire"
+ 431,/m/073cg4,"Cap gun"
+ 432,/m/0g6b5,"Fireworks"
+ 433,/g/122z_qxw,"Firecracker"
+ 434,/m/07qsvvw,"Burst, pop"
+ 435,/m/07pxg6y,"Eruption"
+ 436,/m/07qqyl4,"Boom"
+ 437,/m/083vt,"Wood"
+ 438,/m/07pczhz,"Chop"
+ 439,/m/07pl1bw,"Splinter"
+ 440,/m/07qs1cx,"Crack"
+ 441,/m/039jq,"Glass"
+ 442,/m/07q7njn,"Chink, clink"
+ 443,/m/07rn7sz,"Shatter"
+ 444,/m/04k94,"Liquid"
+ 445,/m/07rrlb6,"Splash, splatter"
+ 446,/m/07p6mqd,"Slosh"
+ 447,/m/07qlwh6,"Squish"
+ 448,/m/07r5v4s,"Drip"
+ 449,/m/07prgkl,"Pour"
+ 450,/m/07pqc89,"Trickle, dribble"
+ 451,/t/dd00088,"Gush"
+ 452,/m/07p7b8y,"Fill (with liquid)"
+ 453,/m/07qlf79,"Spray"
+ 454,/m/07ptzwd,"Pump (liquid)"
+ 455,/m/07ptfmf,"Stir"
+ 456,/m/0dv3j,"Boiling"
+ 457,/m/0790c,"Sonar"
+ 458,/m/0dl83,"Arrow"
+ 459,/m/07rqsjt,"Whoosh, swoosh, swish"
+ 460,/m/07qnq_y,"Thump, thud"
+ 461,/m/07rrh0c,"Thunk"
+ 462,/m/0b_fwt,"Electronic tuner"
+ 463,/m/02rr_,"Effects unit"
+ 464,/m/07m2kt,"Chorus effect"
+ 465,/m/018w8,"Basketball bounce"
+ 466,/m/07pws3f,"Bang"
+ 467,/m/07ryjzk,"Slap, smack"
+ 468,/m/07rdhzs,"Whack, thwack"
+ 469,/m/07pjjrj,"Smash, crash"
+ 470,/m/07pc8lb,"Breaking"
+ 471,/m/07pqn27,"Bouncing"
+ 472,/m/07rbp7_,"Whip"
+ 473,/m/07pyf11,"Flap"
+ 474,/m/07qb_dv,"Scratch"
+ 475,/m/07qv4k0,"Scrape"
+ 476,/m/07pdjhy,"Rub"
+ 477,/m/07s8j8t,"Roll"
+ 478,/m/07plct2,"Crushing"
+ 479,/t/dd00112,"Crumpling, crinkling"
+ 480,/m/07qcx4z,"Tearing"
+ 481,/m/02fs_r,"Beep, bleep"
+ 482,/m/07qwdck,"Ping"
+ 483,/m/07phxs1,"Ding"
+ 484,/m/07rv4dm,"Clang"
+ 485,/m/07s02z0,"Squeal"
+ 486,/m/07qh7jl,"Creak"
+ 487,/m/07qwyj0,"Rustle"
+ 488,/m/07s34ls,"Whir"
+ 489,/m/07qmpdm,"Clatter"
+ 490,/m/07p9k1k,"Sizzle"
+ 491,/m/07qc9xj,"Clicking"
+ 492,/m/07rwm0c,"Clickety-clack"
+ 493,/m/07phhsh,"Rumble"
+ 494,/m/07qyrcz,"Plop"
+ 495,/m/07qfgpx,"Jingle, tinkle"
+ 496,/m/07rcgpl,"Hum"
+ 497,/m/07p78v5,"Zing"
+ 498,/t/dd00121,"Boing"
+ 499,/m/07s12q4,"Crunch"
+ 500,/m/028v0c,"Silence"
+ 501,/m/01v_m0,"Sine wave"
+ 502,/m/0b9m1,"Harmonic"
+ 503,/m/0hdsk,"Chirp tone"
+ 504,/m/0c1dj,"Sound effect"
+ 505,/m/07pt_g0,"Pulse"
+ 506,/t/dd00125,"Inside, small room"
+ 507,/t/dd00126,"Inside, large room or hall"
+ 508,/t/dd00127,"Inside, public space"
+ 509,/t/dd00128,"Outside, urban or manmade"
+ 510,/t/dd00129,"Outside, rural or natural"
+ 511,/m/01b9nn,"Reverberation"
+ 512,/m/01jnbd,"Echo"
+ 513,/m/096m7z,"Noise"
+ 514,/m/06_y0by,"Environmental noise"
+ 515,/m/07rgkc5,"Static"
+ 516,/m/06xkwv,"Mains hum"
+ 517,/m/0g12c5,"Distortion"
+ 518,/m/08p9q4,"Sidetone"
+ 519,/m/07szfh9,"Cacophony"
+ 520,/m/0chx_,"White noise"
+ 521,/m/0cj0r,"Pink noise"
+ 522,/m/07p_0gm,"Throbbing"
+ 523,/m/01jwx6,"Vibration"
+ 524,/m/07c52,"Television"
+ 525,/m/06bz3,"Radio"
+ 526,/m/07hvw1,"Field recording"
efficientat/models/MobileNetV3.py ADDED
@@ -0,0 +1,349 @@
+ from functools import partial
+ from typing import Any, Callable, List, Optional, Sequence, Tuple
+ from torch import nn, Tensor
+ import torch.nn.functional as F
+ from torchvision.ops.misc import ConvNormActivation
+ from torch.hub import load_state_dict_from_url
+ import urllib.parse
+
+
+ from efficientat.models.utils import cnn_out_size
+ from efficientat.models.block_types import InvertedResidualConfig, InvertedResidual
+ from efficientat.models.attention_pooling import MultiHeadAttentionPooling
+ from efficientat.helpers.utils import NAME_TO_WIDTH
+
+ # Adapted version of the MobileNetV3 pytorch implementation
+ # https://github.com/pytorch/vision/blob/main/torchvision/models/mobilenetv3.py
+
+ # points to github releases
+ model_url = "https://github.com/fschmid56/EfficientAT/releases/download/v0.0.1/"
+ # folder to store downloaded models in
+ model_dir = "resources"
+
+
+ pretrained_models = {
+     # pytorch ImageNet pre-trained model
+     # own ImageNet pre-trained models will follow
+     # NOTE: for easy loading we provide the adapted state dict ready for AudioSet training (1 input channel,
+     # 527 output classes)
+     # NOTE: the classifier is just a random initialization, feature extractor (conv layers) is pre-trained
+     "mn10_im_pytorch": urllib.parse.urljoin(model_url, "mn10_im_pytorch.pt"),
+     # Models trained on AudioSet
+     "mn04_as": urllib.parse.urljoin(model_url, "mn04_as_mAP_432.pt"),
+     "mn05_as": urllib.parse.urljoin(model_url, "mn05_as_mAP_443.pt"),
+     "mn10_as": urllib.parse.urljoin(model_url, "mn10_as_mAP_471.pt"),
+     "mn20_as": urllib.parse.urljoin(model_url, "mn20_as_mAP_478.pt"),
+     "mn30_as": urllib.parse.urljoin(model_url, "mn30_as_mAP_482.pt"),
+     "mn40_as": urllib.parse.urljoin(model_url, "mn40_as_mAP_484.pt"),
+     "mn40_as(2)": urllib.parse.urljoin(model_url, "mn40_as_mAP_483.pt"),
+     "mn40_as(3)": urllib.parse.urljoin(model_url, "mn40_as_mAP_483(2).pt"),
+     "mn40_as_no_im_pre": urllib.parse.urljoin(model_url, "mn40_as_no_im_pre_mAP_483.pt"),
+     "mn40_as_no_im_pre(2)": urllib.parse.urljoin(model_url, "mn40_as_no_im_pre_mAP_483(2).pt"),
+     "mn40_as_no_im_pre(3)": urllib.parse.urljoin(model_url, "mn40_as_no_im_pre_mAP_482.pt"),
+     "mn40_as_ext": urllib.parse.urljoin(model_url, "mn40_as_ext_mAP_487.pt"),
+     "mn40_as_ext(2)": urllib.parse.urljoin(model_url, "mn40_as_ext_mAP_486.pt"),
+     "mn40_as_ext(3)": urllib.parse.urljoin(model_url, "mn40_as_ext_mAP_485.pt"),
+     # varying hop size (time resolution)
+     "mn10_as_hop_15": urllib.parse.urljoin(model_url, "mn10_as_hop_15_mAP_463.pt"),
+     "mn10_as_hop_20": urllib.parse.urljoin(model_url, "mn10_as_hop_20_mAP_456.pt"),
+     "mn10_as_hop_25": urllib.parse.urljoin(model_url, "mn10_as_hop_25_mAP_447.pt"),
+     # varying n_mels (frequency resolution)
+     "mn10_as_mels_40": urllib.parse.urljoin(model_url, "mn10_as_mels_40_mAP_453.pt"),
+     "mn10_as_mels_64": urllib.parse.urljoin(model_url, "mn10_as_mels_64_mAP_461.pt"),
+     "mn10_as_mels_256": urllib.parse.urljoin(model_url, "mn10_as_mels_256_mAP_474.pt"),
+ }
+
+
+ class MobileNetV3(nn.Module):
+     def __init__(
+             self,
+             inverted_residual_setting: List[InvertedResidualConfig],
+             last_channel: int,
+             num_classes: int = 1000,
+             block: Optional[Callable[..., nn.Module]] = None,
+             norm_layer: Optional[Callable[..., nn.Module]] = None,
+             dropout: float = 0.2,
+             in_conv_kernel: int = 3,
+             in_conv_stride: int = 2,
+             in_channels: int = 1,
+             **kwargs: Any,
+     ) -> None:
+         """
+         MobileNet V3 main class
+
+         Args:
+             inverted_residual_setting (List[InvertedResidualConfig]): Network structure
+             last_channel (int): The number of channels on the penultimate layer
+             num_classes (int): Number of classes
+             block (Optional[Callable[..., nn.Module]]): Module specifying inverted residual building block for models
+             norm_layer (Optional[Callable[..., nn.Module]]): Module specifying the normalization layer to use
+             dropout (float): The dropout probability
+             in_conv_kernel (int): Size of kernel for first convolution
+             in_conv_stride (int): Size of stride for first convolution
+             in_channels (int): Number of input channels
+         """
+         super(MobileNetV3, self).__init__()
+
+         if not inverted_residual_setting:
+             raise ValueError("The inverted_residual_setting should not be empty")
+         elif not (
+             isinstance(inverted_residual_setting, Sequence)
+             and all([isinstance(s, InvertedResidualConfig) for s in inverted_residual_setting])
+         ):
+             raise TypeError("The inverted_residual_setting should be List[InvertedResidualConfig]")
+
+         if block is None:
+             block = InvertedResidual
+
+         depthwise_norm_layer = norm_layer = \
+             norm_layer if norm_layer is not None else partial(nn.BatchNorm2d, eps=0.001, momentum=0.01)
+
+         layers: List[nn.Module] = []
+
+         kernel_sizes = [in_conv_kernel]
+         strides = [in_conv_stride]
+
+         # building first layer
+         firstconv_output_channels = inverted_residual_setting[0].input_channels
+         layers.append(
+             ConvNormActivation(
+                 in_channels,
+                 firstconv_output_channels,
+                 kernel_size=in_conv_kernel,
+                 stride=in_conv_stride,
+                 norm_layer=norm_layer,
+                 activation_layer=nn.Hardswish,
+             )
+         )
+
+         # get squeeze excitation config
+         se_cnf = kwargs.get('se_conf', None)
+
+         # building inverted residual blocks
+         # - keep track of size of frequency and time dimensions for possible application of Squeeze-and-Excitation
+         # on the frequency/time dimension
+         # - applying Squeeze-and-Excitation on the time dimension is not recommended as this constrains the network to
+         # a particular length of the audio clip, whereas Squeeze-and-Excitation on the frequency bands is fine,
+         # as the number of frequency bands is usually not changing
+         f_dim, t_dim = kwargs.get('input_dims', (128, 1000))
+         # take into account first conv layer
+         f_dim = cnn_out_size(f_dim, 1, 1, 3, 2)
+         t_dim = cnn_out_size(t_dim, 1, 1, 3, 2)
+         for cnf in inverted_residual_setting:
+             f_dim = cnf.out_size(f_dim)
+             t_dim = cnf.out_size(t_dim)
+             cnf.f_dim, cnf.t_dim = f_dim, t_dim  # update dimensions in block config
+             layers.append(block(cnf, se_cnf, norm_layer, depthwise_norm_layer))
+             kernel_sizes.append(cnf.kernel)
+             strides.append(cnf.stride)
+
+         # building last several layers
+         lastconv_input_channels = inverted_residual_setting[-1].out_channels
+         lastconv_output_channels = 6 * lastconv_input_channels
+         layers.append(
+             ConvNormActivation(
+                 lastconv_input_channels,
+                 lastconv_output_channels,
+                 kernel_size=1,
+                 norm_layer=norm_layer,
+                 activation_layer=nn.Hardswish,
+             )
+         )
+
+         self.features = nn.Sequential(*layers)
+         self.head_type = kwargs.get("head_type", False)
+         if self.head_type == "multihead_attention_pooling":
+             self.classifier = MultiHeadAttentionPooling(lastconv_output_channels, num_classes,
+                                                         num_heads=kwargs.get("multihead_attention_heads"))
+         elif self.head_type == "fully_convolutional":
+             self.classifier = nn.Sequential(
+                 nn.Conv2d(
+                     lastconv_output_channels,
+                     num_classes,
+                     kernel_size=(1, 1),
+                     stride=(1, 1),
+                     padding=(0, 0),
+                     bias=False),
+                 nn.BatchNorm2d(num_classes),
+                 nn.AdaptiveAvgPool2d((1, 1)),
+             )
+         elif self.head_type == "mlp":
+             self.classifier = nn.Sequential(
+                 nn.AdaptiveAvgPool2d(1),
+                 nn.Flatten(start_dim=1),
+                 nn.Linear(lastconv_output_channels, last_channel),
+                 nn.Hardswish(inplace=True),
+                 nn.Dropout(p=dropout, inplace=True),
+                 nn.Linear(last_channel, num_classes),
+             )
+         else:
+             raise NotImplementedError(f"Head '{self.head_type}' unknown. Must be one of: 'mlp', "
+                                       f"'fully_convolutional', 'multihead_attention_pooling'")
+
+         for m in self.modules():
+             if isinstance(m, nn.Conv2d):
+                 nn.init.kaiming_normal_(m.weight, mode="fan_out")
+                 if m.bias is not None:
+                     nn.init.zeros_(m.bias)
+             elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm, nn.LayerNorm)):
+                 nn.init.ones_(m.weight)
+                 nn.init.zeros_(m.bias)
+             elif isinstance(m, nn.Linear):
+                 nn.init.normal_(m.weight, 0, 0.01)
+                 if m.bias is not None:
+                     nn.init.zeros_(m.bias)
+
+     def _forward_impl(self, x: Tensor) -> Tuple[Tensor, Tensor]:
+         x = self.features(x)
+         features = F.adaptive_avg_pool2d(x, (1, 1)).squeeze()
+         x = self.classifier(x).squeeze()
+         if features.dim() == 1 and x.dim() == 1:
+             # squeezed batch dimension
+             features = features.unsqueeze(0)
+             x = x.unsqueeze(0)
+         return x, features
+
+     def forward(self, x: Tensor) -> Tuple[Tensor, Tensor]:
+         return self._forward_impl(x)
+
+
+ def _mobilenet_v3_conf(
+         width_mult: float = 1.0,
+         reduced_tail: bool = False,
+         dilated: bool = False,
+         c4_stride: int = 2,
+         **kwargs: Any
+ ):
+     reduce_divider = 2 if reduced_tail else 1
+     dilation = 2 if dilated else 1
+
+     bneck_conf = partial(InvertedResidualConfig, width_mult=width_mult)
+     adjust_channels = partial(InvertedResidualConfig.adjust_channels, width_mult=width_mult)
+
+     # InvertedResidualConfig:
+     # input_channels, kernel, expanded_channels, out_channels, use_se, activation, stride, dilation, width_mult
+     inverted_residual_setting = [
+         bneck_conf(16, 3, 16, 16, False, "RE", 1, 1),
+         bneck_conf(16, 3, 64, 24, False, "RE", 2, 1),  # C1
+         bneck_conf(24, 3, 72, 24, False, "RE", 1, 1),
+         bneck_conf(24, 5, 72, 40, True, "RE", 2, 1),  # C2
+         bneck_conf(40, 5, 120, 40, True, "RE", 1, 1),
+         bneck_conf(40, 5, 120, 40, True, "RE", 1, 1),
+         bneck_conf(40, 3, 240, 80, False, "HS", 2, 1),  # C3
+         bneck_conf(80, 3, 200, 80, False, "HS", 1, 1),
+         bneck_conf(80, 3, 184, 80, False, "HS", 1, 1),
+         bneck_conf(80, 3, 184, 80, False, "HS", 1, 1),
+         bneck_conf(80, 3, 480, 112, True, "HS", 1, 1),
+         bneck_conf(112, 3, 672, 112, True, "HS", 1, 1),
+         bneck_conf(112, 5, 672, 160 // reduce_divider, True, "HS", c4_stride, dilation),  # C4
+         bneck_conf(160 // reduce_divider, 5, 960 // reduce_divider, 160 // reduce_divider, True, "HS", 1, dilation),
+         bneck_conf(160 // reduce_divider, 5, 960 // reduce_divider, 160 // reduce_divider, True, "HS", 1, dilation),
+     ]
+     last_channel = adjust_channels(1280 // reduce_divider)
+
+     return inverted_residual_setting, last_channel
+
+
+ def _mobilenet_v3(
+         inverted_residual_setting: List[InvertedResidualConfig],
+         last_channel: int,
+         pretrained_name: str,
+         **kwargs: Any,
+ ):
+     model = MobileNetV3(inverted_residual_setting, last_channel, **kwargs)
+
+     if pretrained_name in pretrained_models:
+         model_url = pretrained_models.get(pretrained_name)
+         state_dict = load_state_dict_from_url(model_url, model_dir=model_dir, map_location="cpu")
+         if kwargs['num_classes'] != state_dict['classifier.5.bias'].size(0):
+             # if the number of logits is not matching the state dict,
+             # drop the corresponding pre-trained part
+             print(f"Number of classes defined: {kwargs['num_classes']}, "
+                   f"but try to load pre-trained layer with logits: {state_dict['classifier.5.bias'].size(0)}\n"
+                   "Dropping last layer.")
+             del state_dict['classifier.5.weight']
+             del state_dict['classifier.5.bias']
+         try:
+             model.load_state_dict(state_dict)
+         except RuntimeError as e:
+             print(str(e))
+             print("Loading pre-trained weights in a non-strict manner.")
+             model.load_state_dict(state_dict, strict=False)
+     elif pretrained_name:
+         raise NotImplementedError(f"Model name '{pretrained_name}' unknown.")
+     return model
+
+
+ def mobilenet_v3(pretrained_name: str = None, **kwargs: Any) -> MobileNetV3:
+     """
+     Constructs a MobileNetV3 architecture from
+     "Searching for MobileNetV3" <https://arxiv.org/abs/1905.02244>.
+     """
+     inverted_residual_setting, last_channel = _mobilenet_v3_conf(**kwargs)
+     return _mobilenet_v3(inverted_residual_setting, last_channel, pretrained_name, **kwargs)
+
+
+ def get_model(num_classes: int = 527, pretrained_name: str = None, width_mult: float = 1.0,
+               reduced_tail: bool = False, dilated: bool = False, c4_stride: int = 2, head_type: str = "mlp",
+               multihead_attention_heads: int = 4, input_dim_f: int = 128,
+               input_dim_t: int = 1000, se_dims: str = 'c', se_agg: str = "max", se_r: int = 4):
+     """
+     Arguments to modify the instantiation of a MobileNetV3
+
+     Args:
+         num_classes (int): Specifies number of classes to predict
+         pretrained_name (str): Specifies name of pre-trained model to load
+         width_mult (float): Scales width of network
+         reduced_tail (bool): Scales down network tail
+         dilated (bool): Applies dilated convolution to network tail
+         c4_stride (int): Set to '2' in original implementation;
+             might be changed to modify the size of receptive field
+         head_type (str): decides which classification head to use
+         multihead_attention_heads (int): number of heads in case 'multihead_attention_pooling' is used
+         input_dim_f (int): number of frequency bands
+         input_dim_t (int): number of time frames
+         se_dims (str): contains letters corresponding to dimensions 'c' - channel, 'f' - frequency, 't' - time;
+             if multiple dimensions are chosen, squeeze-excitation is applied concurrently and the se layer
+             outputs are fused by the se_agg operation
+         se_agg (str): operation to fuse output of concurrent se layers
+         se_r (int): squeeze excitation bottleneck size
+     """
+
+     dim_map = {'c': 1, 'f': 2, 't': 3}
+     assert len(se_dims) <= 3 and all([s in dim_map.keys() for s in se_dims]) or se_dims == 'none'
+     input_dims = (input_dim_f, input_dim_t)
+     if se_dims == 'none':
+         se_dims = None
+     else:
+         se_dims = [dim_map[s] for s in se_dims]
+     se_conf = dict(se_dims=se_dims, se_agg=se_agg, se_r=se_r)
+     m = mobilenet_v3(pretrained_name=pretrained_name, num_classes=num_classes,
+                      width_mult=width_mult, reduced_tail=reduced_tail, dilated=dilated, c4_stride=c4_stride,
+                      head_type=head_type, multihead_attention_heads=multihead_attention_heads,
+                      input_dims=input_dims, se_conf=se_conf
+                      )
+     print(m)
+     return m
+
+
+ class EnsemblerModel(nn.Module):
+     def __init__(self, model_names):
+         super(EnsemblerModel, self).__init__()
+         self.models = nn.ModuleList([get_model(width_mult=NAME_TO_WIDTH(model_name), pretrained_name=model_name)
+                                      for model_name in model_names])
+
+     def forward(self, x):
+         all_out = None
+         for m in self.models:
+             out, _ = m(x)
+             if all_out is None:
+                 all_out = out
+             else:
+                 all_out = out + all_out
+         all_out = all_out / len(self.models)
+         return all_out, all_out
+
+
+ def get_ensemble_model(model_names):
+     return EnsemblerModel(model_names)
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ from torch import Tensor
5
+
6
+ from efficientat.models.utils import collapse_dim
7
+
8
+
9
+ class MultiHeadAttentionPooling(nn.Module):
10
+ """Multi-Head Attention as used in PSLA paper (https://arxiv.org/pdf/2102.01243.pdf)
11
+ """
12
+ def __init__(self, in_dim, out_dim, att_activation: str = 'sigmoid',
13
+ clf_activation: str = 'ident', num_heads: int = 4, epsilon: float = 1e-7):
14
+ super(MultiHeadAttentionPooling, self).__init__()
15
+
16
+ self.in_dim = in_dim
17
+ self.out_dim = out_dim
18
+ self.num_heads = num_heads
19
+ self.epsilon = epsilon
20
+
21
+ self.att_activation = att_activation
22
+ self.clf_activation = clf_activation
23
+
24
+ # out size: out dim x 2 (att and clf paths) x num_heads
25
+ self.subspace_proj = nn.Linear(self.in_dim, self.out_dim * 2 * self.num_heads)
26
+ self.head_weight = nn.Parameter(torch.tensor([1.0 / self.num_heads] * self.num_heads).view(1, -1, 1))
27
+
28
+ def activate(self, x, activation):
29
+ if activation == 'linear':
30
+ return x
31
+ elif activation == 'relu':
32
+ return F.relu(x)
33
+ elif activation == 'sigmoid':
34
+ return torch.sigmoid(x)
35
+ elif activation == 'softmax':
36
+ return F.softmax(x, dim=1)
37
+ elif activation == 'ident':
38
+ return x
39
+
40
+ def forward(self, x) -> Tensor:
41
+ """x: Tensor of size (batch_size, channels, frequency bands, sequence length)
42
+ """
43
+ x = collapse_dim(x, dim=2) # results in tensor of size (batch_size, channels, sequence_length)
44
+ x = x.transpose(1, 2) # results in tensor of size (batch_size, sequence_length, channels)
45
+ b, n, c = x.shape
46
+
47
+ x = self.subspace_proj(x).reshape(b, n, 2, self.num_heads, self.out_dim).permute(2, 0, 3, 1, 4)
48
+ att, val = x[0], x[1]
49
+ val = self.activate(val, self.clf_activation)
50
+ att = self.activate(att, self.att_activation)
51
+ att = torch.clamp(att, self.epsilon, 1. - self.epsilon)
52
+ att = att / torch.sum(att, dim=2, keepdim=True)
53
+
54
+ out = torch.sum(att * val, dim=2) * self.head_weight
55
+ out = torch.sum(out, dim=1)
56
+ return out
efficientat/models/block_types.py ADDED
@@ -0,0 +1,182 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, Callable, List
2
+ import torch
3
+ import torch.nn as nn
4
+ from torch import Tensor
5
+ from torchvision.ops.misc import ConvNormActivation
6
+
7
+ from efficientat.models.utils import make_divisible, cnn_out_size
8
+
9
+
10
+
11
+ class ConcurrentSEBlock(torch.nn.Module):
12
+ def __init__(
13
+ self,
14
+ c_dim: int,
15
+ f_dim: int,
16
+ t_dim: int,
17
+ se_cnf: Dict
18
+ ) -> None:
19
+ super().__init__()
20
+ dims = [c_dim, f_dim, t_dim]
21
+ self.conc_se_layers = nn.ModuleList()
22
+ for d in se_cnf['se_dims']:
23
+ input_dim = dims[d-1]
24
+ squeeze_dim = make_divisible(input_dim // se_cnf['se_r'], 8)
25
+ self.conc_se_layers.append(SqueezeExcitation(input_dim, squeeze_dim, d))
26
+ if se_cnf['se_agg'] == "max":
27
+ self.agg_op = lambda x: torch.max(x, dim=0)[0]
28
+ elif se_cnf['se_agg'] == "avg":
29
+ self.agg_op = lambda x: torch.mean(x, dim=0)
30
+ elif se_cnf['se_agg'] == "add":
31
+ self.agg_op = lambda x: torch.sum(x, dim=0)
32
+ elif se_cnf['se_agg'] == "min":
33
+ self.agg_op = lambda x: torch.min(x, dim=0)[0]
34
+ else:
35
+ raise NotImplementedError(f"SE aggregation operation '{self.agg_op}' not implemented")
36
+
37
+ def forward(self, input: Tensor) -> Tensor:
38
+ # apply all concurrent se layers
39
+ se_outs = []
40
+ for se_layer in self.conc_se_layers:
41
+ se_outs.append(se_layer(input))
42
+ out = self.agg_op(torch.stack(se_outs, dim=0))
43
+ return out
44
+
45
+
46
+ class SqueezeExcitation(torch.nn.Module):
47
+ """
48
+ This block implements the Squeeze-and-Excitation block from https://arxiv.org/abs/1709.01507.
49
+ Args:
50
+ input_dim (int): Input dimension
51
+ squeeze_dim (int): Size of Bottleneck
52
+ activation (Callable): activation applied to bottleneck
53
+ scale_activation (Callable): activation applied to the output
54
+ """
55
+
56
+ def __init__(
57
+ self,
58
+ input_dim: int,
59
+ squeeze_dim: int,
60
+ se_dim: int,
61
+ activation: Callable[..., torch.nn.Module] = torch.nn.ReLU,
62
+ scale_activation: Callable[..., torch.nn.Module] = torch.nn.Sigmoid,
63
+ ) -> None:
64
+ super().__init__()
65
+ self.fc1 = torch.nn.Linear(input_dim, squeeze_dim)
66
+ self.fc2 = torch.nn.Linear(squeeze_dim, input_dim)
67
+ assert se_dim in [1, 2, 3]
68
+ self.se_dim = [1, 2, 3]
69
+ self.se_dim.remove(se_dim)
70
+ self.activation = activation()
71
+ self.scale_activation = scale_activation()
72
+
73
+ def _scale(self, input: Tensor) -> Tensor:
74
+ scale = torch.mean(input, self.se_dim, keepdim=True)
75
+ shape = scale.size()
76
+ scale = self.fc1(scale.squeeze(2).squeeze(2))
77
+ scale = self.activation(scale)
78
+ scale = self.fc2(scale)
79
+ scale = scale
80
+ return self.scale_activation(scale).view(shape)
81
+
82
+ def forward(self, input: Tensor) -> Tensor:
83
+ scale = self._scale(input)
84
+ return scale * input
85
+
86
+
87
+ class InvertedResidualConfig:
88
+ # Stores information listed at Tables 1 and 2 of the MobileNetV3 paper
89
+ def __init__(
90
+ self,
91
+ input_channels: int,
92
+ kernel: int,
93
+ expanded_channels: int,
94
+ out_channels: int,
95
+ use_se: bool,
96
+ activation: str,
97
+ stride: int,
98
+ dilation: int,
99
+ width_mult: float,
100
+ ):
101
+ self.input_channels = self.adjust_channels(input_channels, width_mult)
102
+ self.kernel = kernel
103
+ self.expanded_channels = self.adjust_channels(expanded_channels, width_mult)
104
+ self.out_channels = self.adjust_channels(out_channels, width_mult)
105
+ self.use_se = use_se
106
+ self.use_hs = activation == "HS"
107
+ self.stride = stride
108
+ self.dilation = dilation
109
+ self.f_dim = None
110
+ self.t_dim = None
111
+
112
+ @staticmethod
113
+ def adjust_channels(channels: int, width_mult: float):
114
+ return make_divisible(channels * width_mult, 8)
115
+
116
+ def out_size(self, in_size):
117
+ padding = (self.kernel - 1) // 2 * self.dilation
118
+ return cnn_out_size(in_size, padding, self.dilation, self.kernel, self.stride)
119
+
120
+
121
+ class InvertedResidual(nn.Module):
122
+ def __init__(
123
+ self,
124
+ cnf: InvertedResidualConfig,
125
+ se_cnf: Dict,
126
+ norm_layer: Callable[..., nn.Module],
127
+ depthwise_norm_layer: Callable[..., nn.Module]
128
+ ):
129
+ super().__init__()
130
+ if not (1 <= cnf.stride <= 2):
131
+ raise ValueError("illegal stride value")
132
+
133
+ self.use_res_connect = cnf.stride == 1 and cnf.input_channels == cnf.out_channels
134
+
135
+ layers: List[nn.Module] = []
136
+ activation_layer = nn.Hardswish if cnf.use_hs else nn.ReLU
137
+
138
+ # expand
139
+ if cnf.expanded_channels != cnf.input_channels:
140
+ layers.append(
141
+ ConvNormActivation(
142
+ cnf.input_channels,
143
+ cnf.expanded_channels,
144
+ kernel_size=1,
145
+ norm_layer=norm_layer,
146
+ activation_layer=activation_layer,
147
+ )
148
+ )
149
+
150
+ # depthwise
151
+ stride = 1 if cnf.dilation > 1 else cnf.stride
152
+ layers.append(
153
+ ConvNormActivation(
154
+ cnf.expanded_channels,
155
+ cnf.expanded_channels,
156
+ kernel_size=cnf.kernel,
157
+ stride=stride,
158
+ dilation=cnf.dilation,
159
+ groups=cnf.expanded_channels,
160
+ norm_layer=depthwise_norm_layer,
161
+ activation_layer=activation_layer,
162
+ )
163
+ )
164
+ if cnf.use_se and se_cnf['se_dims'] is not None:
165
+ layers.append(ConcurrentSEBlock(cnf.expanded_channels, cnf.f_dim, cnf.t_dim, se_cnf))
166
+
167
+ # project
168
+ layers.append(
169
+ ConvNormActivation(
170
+ cnf.expanded_channels, cnf.out_channels, kernel_size=1, norm_layer=norm_layer, activation_layer=None
171
+ )
172
+ )
173
+
174
+ self.block = nn.Sequential(*layers)
175
+ self.out_channels = cnf.out_channels
176
+ self._is_cn = cnf.stride > 1
177
+
178
+ def forward(self, inp: Tensor) -> Tensor:
179
+ result = self.block(inp)
180
+ if self.use_res_connect:
181
+ result += inp
182
+ return result
efficientat/models/preprocess.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch.nn as nn
2
+ import torchaudio
3
+ import torch
4
+
5
+
6
+ class AugmentMelSTFT(nn.Module):
7
+ def __init__(self, n_mels=128, sr=32000, win_length=800, hopsize=320, n_fft=1024, freqm=48, timem=192,
8
+ fmin=0.0, fmax=None, fmin_aug_range=10, fmax_aug_range=2000):
9
+ torch.nn.Module.__init__(self)
10
+ # adapted from: https://github.com/CPJKU/kagglebirds2020/commit/70f8308b39011b09d41eb0f4ace5aa7d2b0e806e
11
+
12
+ self.win_length = win_length
13
+ self.n_mels = n_mels
14
+ self.n_fft = n_fft
15
+ self.sr = sr
16
+ self.fmin = fmin
17
+ if fmax is None:
18
+ fmax = sr // 2 - fmax_aug_range // 2
19
+ print(f"Warning: FMAX is None setting to {fmax} ")
20
+ self.fmax = fmax
21
+ self.hopsize = hopsize
22
+ self.register_buffer('window',
23
+ torch.hann_window(win_length, periodic=False),
24
+ persistent=False)
25
+ assert fmin_aug_range >= 1, f"fmin_aug_range={fmin_aug_range} should be >=1; 1 means no augmentation"
26
+ assert fmax_aug_range >= 1, f"fmax_aug_range={fmax_aug_range} should be >=1; 1 means no augmentation"
27
+ self.fmin_aug_range = fmin_aug_range
28
+ self.fmax_aug_range = fmax_aug_range
29
+
30
+ self.register_buffer("preemphasis_coefficient", torch.as_tensor([[[-.97, 1]]]), persistent=False)
31
+ if freqm == 0:
32
+ self.freqm = torch.nn.Identity()
33
+ else:
34
+ self.freqm = torchaudio.transforms.FrequencyMasking(freqm, iid_masks=True)
35
+ if timem == 0:
36
+ self.timem = torch.nn.Identity()
37
+ else:
38
+ self.timem = torchaudio.transforms.TimeMasking(timem, iid_masks=True)
39
+
40
+ def forward(self, x):
41
+ x = nn.functional.conv1d(x.unsqueeze(1), self.preemphasis_coefficient).squeeze(1)
42
+ x = torch.stft(x, self.n_fft, hop_length=self.hopsize, win_length=self.win_length,
43
+ center=True, normalized=False, window=self.window, return_complex=False)
44
+ x = (x ** 2).sum(dim=-1) # power mag
45
+ fmin = self.fmin + torch.randint(self.fmin_aug_range, (1,)).item()
46
+ fmax = self.fmax + self.fmax_aug_range // 2 - torch.randint(self.fmax_aug_range, (1,)).item()
47
+ # don't augment eval data
48
+ if not self.training:
49
+ fmin = self.fmin
50
+ fmax = self.fmax
51
+
52
+ mel_basis, _ = torchaudio.compliance.kaldi.get_mel_banks(self.n_mels, self.n_fft, self.sr,
53
+ fmin, fmax, vtln_low=100.0, vtln_high=-500., vtln_warp_factor=1.0)
54
+ mel_basis = torch.as_tensor(torch.nn.functional.pad(mel_basis, (0, 1), mode='constant', value=0),
55
+ device=x.device)
56
+ with torch.cuda.amp.autocast(enabled=False):
57
+ melspec = torch.matmul(mel_basis, x)
58
+
59
+ melspec = (melspec + 0.00001).log()
60
+
61
+ if self.training:
62
+ melspec = self.freqm(melspec)
63
+ melspec = self.timem(melspec)
64
+
65
+ melspec = (melspec + 4.5) / 5. # fast normalization
66
+
67
+ return melspec
efficientat/models/utils.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ from typing import Optional, Callable
3
+ import torch
4
+ import torch.nn as nn
5
+ from torch import Tensor
6
+
7
+
8
+ def make_divisible(v: float, divisor: int, min_value: Optional[int] = None) -> int:
9
+ """
10
+ This function is taken from the original tf repo.
11
+ It ensures that all layers have a channel number that is divisible by 8
12
+ It can be seen here:
13
+ https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
14
+ """
15
+ if min_value is None:
16
+ min_value = divisor
17
+ new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
18
+ # Make sure that round down does not go down by more than 10%.
19
+ if new_v < 0.9 * v:
20
+ new_v += divisor
21
+ return new_v
22
+
23
+
24
+ def cnn_out_size(in_size, padding, dilation, kernel, stride):
25
+ s = in_size + 2 * padding - dilation * (kernel - 1) - 1
26
+ return math.floor(s / stride + 1)
27
+
28
+
29
+ def collapse_dim(x: Tensor, dim: int, mode: str = "pool", pool_fn: Callable[[Tensor, int], Tensor] = torch.mean,
30
+ combine_dim: int = None):
31
+ """
32
+ Collapses dimension of multi-dimensional tensor by pooling or combining dimensions
33
+ :param x: input Tensor
34
+ :param dim: dimension to collapse
35
+ :param mode: 'pool' or 'combine'
36
+ :param pool_fn: function to be applied in case of pooling
37
+ :param combine_dim: dimension to join 'dim' to
38
+ :return: collapsed tensor
39
+ """
40
+ if mode == "pool":
41
+ return pool_fn(x, dim)
42
+ elif mode == "combine":
43
+ s = list(x.size())
44
+ s[combine_dim] *= dim
45
+ s[dim] //= dim
46
+ return x.view(s)
47
+
48
+
49
+ class CollapseDim(nn.Module):
50
+ def __init__(self, dim: int, mode: str = "pool", pool_fn: Callable[[Tensor, int], Tensor] = torch.mean,
51
+ combine_dim: int = None):
52
+ super(CollapseDim, self).__init__()
53
+ self.dim = dim
54
+ self.mode = mode
55
+ self.pool_fn = pool_fn
56
+ self.combine_dim = combine_dim
57
+
58
+ def forward(self, x):
59
+ return collapse_dim(x, dim=self.dim, mode=self.mode, pool_fn=self.pool_fn, combine_dim=self.combine_dim)
efficientat/resources/README.md ADDED
@@ -0,0 +1 @@
 
 
1
+ Download the latest version from this repo's Github Releases and place them inside this folder.
efficientat/resources/metro_station-paris.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:75d28a33f45fd6eebd862bb25a3738dd83b7aa92ad64c26d5b1879ff2a715b3f
+ size 1323044
packages.txt ADDED
@@ -0,0 +1 @@
+ libsndfile1
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ av==10.0.0
+ h5py==3.7.0
+ librosa==0.9.2
+ torch