khalidsaifullaah committed on
Commit 462d118
1 Parent(s): c839698

Add application file

Files changed (6)
  1. .gitattributes +2 -0
  2. app.py +46 -0
  3. model.ckpt +3 -0
  4. model.py +167 -0
  5. requirements.txt +0 -0
  6. sample_audio.wav +3 -0
.gitattributes CHANGED
@@ -25,3 +25,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zstandard filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.wav filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,46 @@
+ import torchaudio
+ import torch
+ from model import M11
+ import gradio as gr
+
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+ model_PATH = "./model.ckpt"
+
+ classifier = M11.load_from_checkpoint(model_PATH)
+ classifier.eval()
+
+ def preprocess(signal, sr, device):
+     # resampling the audio signal with the training sample rate
+     if sr != 8_000:
+         resampler = torchaudio.transforms.Resample(sr, 8_000).to(device)
+         signal = resampler(signal)
+     # turning the stereo signals into mono
+     if signal.shape[0] > 1:
+         signal = torch.mean(signal, dim=0, keepdim=True)
+
+     return signal
+
+ def get_likely_index(tensor):
+     # find most likely label index for each element in the batch
+     return tensor.argmax(dim=-1)
+
+ def pipeline(input):
+     # print('gere')
+     # print(input)
+     sample_rate, audio = input
+     processed_audio = preprocess(torch.from_numpy(audio), sample_rate, DEVICE)
+
+     with torch.no_grad():
+         pred = get_likely_index(classifier(processed_audio.unsqueeze(0))).view(-1)
+     # out_prob, score, index, text_lab = classifier.classify_file(aud.name)
+     return pred[0]
+
+ inputs = gr.inputs.Audio(label="Input Audio", type="numpy")
+ outputs = "text"
+ title = "Threat Detection From Bengali Voice Calls"
+ description = "Gradio demo for Audio Classification, simply upload your audio, or click one of the examples to load them. Read more at the links below."
+ article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2005.07143' target='_blank'>ECAPA-TDNN: Emphasized Channel Attention, Propagation and Aggregation in TDNN Based Speaker Verification</a> | <a href='https://github.com/speechbrain/speechbrain' target='_blank'>Github Repo</a></p>"
+ examples = [
+     ['sample_audio.wav']
+ ]
+ gr.Interface(pipeline, inputs, outputs, title=title, description=description, article=article, examples=examples).launch()
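For reference, the sketch below runs the same inference path outside Gradio, assuming model.ckpt and sample_audio.wav sit in the repository root as in this commit and that M11 expects 8 kHz mono float waveforms; the label-to-text mapping is not part of this commit, so only the predicted class index is printed.

import torch
import torchaudio
from model import M11

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Load the checkpointed classifier, mirroring app.py.
classifier = M11.load_from_checkpoint("./model.ckpt").to(DEVICE)
classifier.eval()

# torchaudio.load returns a float32 tensor of shape (channels, samples).
signal, sr = torchaudio.load("./sample_audio.wav")
if sr != 8_000:
    signal = torchaudio.transforms.Resample(sr, 8_000)(signal)  # match the training sample rate
if signal.shape[0] > 1:
    signal = signal.mean(dim=0, keepdim=True)  # stereo -> mono

with torch.no_grad():
    log_probs = classifier(signal.unsqueeze(0).to(DEVICE))  # shape (1, 1, n_output)
    pred = log_probs.argmax(dim=-1).view(-1)

print(pred.item())  # integer class index; label names are not shipped with this commit

Note that gr.inputs.Audio(type="numpy") typically hands pipeline() int16 samples shaped (samples,) or (samples, channels), so in practice the demo's preprocess() may also need a float conversion and a channel transpose before resampling; the commit leaves that as-is.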
model.ckpt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:abbf473f2a6445c3d4964c0fa0deb67a70d246149013d1bd42c7a64e60b9f8fe
+ size 23819567
model.py ADDED
@@ -0,0 +1,167 @@
+ import torch
+ from torch import nn
+ from torch.nn import functional as F
+ import torchaudio
+ import pytorch_lightning as pl
+ from torchmetrics import Accuracy, F1, Precision, Recall
+
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ class M11(pl.LightningModule):
+     def __init__(self, hidden_units_1, hidden_units_2, dropout_1, dropout_2, n_input=1, n_output=3, stride=4, n_channel=64, lr=1e-3, l2=1e-5):
+         super().__init__()
+         self.save_hyperparameters()
+
+
+         self.conv1 = nn.Conv1d(n_input, n_channel, kernel_size=80, stride=stride)
+         self.bn1 = nn.BatchNorm1d(n_channel)
+         self.pool1 = nn.MaxPool1d(4)
+
+         self.conv2 = nn.Conv1d(n_channel, n_channel, kernel_size=3, padding=1)
+         self.bn2 = nn.BatchNorm1d(n_channel)
+         self.conv3 = nn.Conv1d(n_channel, n_channel, kernel_size=3, padding=1)
+         self.bn3 = nn.BatchNorm1d(n_channel)
+         self.pool2 = nn.MaxPool1d(4)
+
+         self.conv4 = nn.Conv1d(n_channel, 2 * n_channel, kernel_size=3, padding=1)
+         self.bn4 = nn.BatchNorm1d(2 * n_channel)
+         self.conv5 = nn.Conv1d(2 * n_channel, 2 * n_channel, kernel_size=3, padding=1)
+         self.bn5 = nn.BatchNorm1d(2 * n_channel)
+         self.pool3 = nn.MaxPool1d(4)
+
+         self.conv6 = nn.Conv1d(2 * n_channel, 4 * n_channel, kernel_size=3, padding=1)
+         self.bn6 = nn.BatchNorm1d(4 * n_channel)
+         self.conv7 = nn.Conv1d(4 * n_channel, 4 * n_channel, kernel_size=3, padding=1)
+         self.bn7 = nn.BatchNorm1d(4 * n_channel)
+         self.conv8 = nn.Conv1d(4 * n_channel, 4 * n_channel, kernel_size=3, padding=1)
+         self.bn8 = nn.BatchNorm1d(4 * n_channel)
+         self.pool4 = nn.MaxPool1d(4)
+
+         self.conv9 = nn.Conv1d(4 * n_channel, 8 * n_channel, kernel_size=3, padding=1)
+         self.bn9 = nn.BatchNorm1d(8 * n_channel)
+         self.conv10 = nn.Conv1d(8 * n_channel, 8 * n_channel, kernel_size=3, padding=1)
+         self.bn10 = nn.BatchNorm1d(8 * n_channel)
+
+         # self.fc1 = nn.Linear(8 * n_channel, n_output)
+         self.mlp = nn.Sequential(
+             nn.Linear(8 * n_channel, hidden_units_1),
+             nn.ReLU(),
+             nn.Dropout(dropout_1),
+             nn.Linear(hidden_units_1, hidden_units_2),
+             nn.ReLU(),
+             nn.Dropout(dropout_2),
+             nn.Linear(hidden_units_2, n_output)
+         )
+
+     def forward(self, x):
+         x = self.conv1(x)
+         x = F.relu(self.bn1(x))
+         x = self.pool1(x)
+
+         x = self.conv2(x)
+         x = F.relu(self.bn2(x))
+         x = self.conv3(x)
+         x = F.relu(self.bn3(x))
+         x = self.pool2(x)
+
+         x = self.conv4(x)
+         x = F.relu(self.bn4(x))
+         x = self.conv5(x)
+         x = F.relu(self.bn5(x))
+         x = self.pool3(x)
+
+         x = self.conv6(x)
+         x = F.relu(self.bn6(x))
+         x = self.conv7(x)
+         x = F.relu(self.bn7(x))
+         x = self.conv8(x)
+         x = F.relu(self.bn8(x))
+         x = self.pool4(x)
+
+         x = self.conv9(x)
+         x = F.relu(self.bn9(x))
+         x = self.conv10(x)
+         x = F.relu(self.bn10(x))
+
+         x = F.avg_pool1d(x, x.shape[-1])
+         x = x.permute(0, 2, 1)
+         # x = self.fc1(x)
+         x = self.mlp(x)
+         return F.log_softmax(x, dim=2)
+
+     def training_step(self, batch, batch_idx):
+         # Very simple training loop
+         data, target = batch
+         logits = self(data)  # this calls self.forward
+         preds = torch.argmax(logits, dim=-1).squeeze()
+         # loss = cost(logits.squeeze(), target)
+         loss = unweighted_cost(logits.squeeze(), target)
+
+         f1 = f1_metric(preds, target)
+
+         self.log('train_loss', loss, on_epoch=True, prog_bar=True)
+         self.log('train_f1', f1, on_epoch=True, prog_bar=True)
+         return loss
+
+     def validation_step(self, batch, batch_idx):
+         data, target = batch
+         logits = self(data)
+         preds = torch.argmax(logits, dim=-1).squeeze()
+         # loss = val_cost(logits.squeeze(), target)
+         loss = unweighted_cost(logits.squeeze(), target)
+
+         acc = accuracy(preds, target)
+         f1 = f1_metric(preds, target)
+         prec = precision(preds, target)
+         rec = recall(preds, target)
+
+         self.log('val_loss', loss, on_epoch=True, prog_bar=True)
+         self.log('val_acc', acc, on_epoch=True, prog_bar=True)
+         self.log('val_f1', f1, on_epoch=True, prog_bar=True)
+         self.log('val_precision', prec, on_epoch=True, prog_bar=True)
+         self.log('val_recall', rec, on_epoch=True, prog_bar=True)
+         return loss, acc, f1, prec, rec
+
+     def configure_optimizers(self):
+         optimizer = torch.optim.Adam(self.parameters(), lr=self.hparams.lr, weight_decay=self.hparams.l2)
+         return optimizer
+
+ # DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+ # model_PATH = "./model.ckpt"
+ # audio_PATH = "./sample_audio.wav"
+
+
+ # def _resample_if_necessary(signal, sr, device):
+ #     if sr != 8_000:
+ #         resampler = torchaudio.transforms.Resample(sr, 8_000).to(device)
+ #         signal = resampler(signal)
+
+ #     return signal
+
+ # def _mix_down_if_necessary(signal):
+ #     if signal.shape[0] > 1:
+ #         signal = torch.mean(signal, dim=0, keepdim=True)
+
+ #     return signal
+
+ # def get_likely_index(tensor):
+ #     # find most likely label index for each element in the batch
+ #     return tensor.argmax(dim=-1)
+
+ # model = M11.load_from_checkpoint(model_PATH).to(DEVICE)
+ # model.eval()
+
+ # audio, sr = torchaudio.load(audio_PATH)
+ # # resampler = torchaudio.transforms.Resample(sr, 8_000).to(DEVICE)
+ # processed_audio = _mix_down_if_necessary(_resample_if_necessary(audio, sr, DEVICE))
+
+ # print(processed_audio.shape)
+ # with torch.no_grad():
+ #     pred = get_likely_index(model(processed_audio.unsqueeze(0).to(DEVICE))).view(-1)
+
+ # # y_true = target.tolist()
+ # # y_pred = pred.tolist()
+ # # target_names = eval_dataset.label_list
+ # # print(classification_report(y_true, y_pred, target_names=target_names))
+ # print(pred)
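For orientation, a rough sketch of instantiating M11 and checking its output shape on dummy input. The hidden sizes and dropout rates below are placeholder values, not the ones baked into model.ckpt, and the loss/metric helpers referenced in training_step and validation_step (unweighted_cost, f1_metric, accuracy, precision, recall) are not defined in this file, so they presumably live in the training script.

import torch
from model import M11

# Placeholder hyperparameters; M11.load_from_checkpoint("model.ckpt") restores the real ones
# because __init__ calls self.save_hyperparameters().
model = M11(hidden_units_1=256, hidden_units_2=128, dropout_1=0.3, dropout_2=0.3)
model.eval()

# Batch of two one-second clips at 8 kHz, mono (n_input=1).
dummy = torch.randn(2, 1, 8_000)
with torch.no_grad():
    out = model(dummy)

print(out.shape)          # torch.Size([2, 1, 3]): log-probabilities over 3 classes
print(out.exp().sum(-1))  # each row sums to ~1, confirming the log_softmax head

The architecture appears to follow the M11 variant from the "very deep CNNs on raw waveforms" family: stacked Conv1d/BatchNorm/MaxPool blocks that progressively downsample the 8 kHz waveform, global average pooling over time, and a small MLP head in place of the original single linear layer.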
requirements.txt ADDED
Binary file (346 Bytes).
 
sample_audio.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:128ba5b5859c973da88c762ac33c9790b1e722e7ef84fbfc4af6fa22cd4ac4d9
+ size 10905126