import gradio as gr
import librosa
import numpy as np
import torch
import torch.nn as nn
|
class Conv1DNet(nn.Module):
    """1D CNN that classifies a 1-second, 8 kHz waveform (8000 samples) into ten keywords."""

    def __init__(self):
        super(Conv1DNet, self).__init__()

        # Block 1: 1 -> 8 channels
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=8, kernel_size=13, stride=1, padding=1)
        self.relu1 = nn.ReLU()
        self.pool1 = nn.MaxPool1d(kernel_size=3)
        self.dropout1 = nn.Dropout(p=0.3)

        # Block 2: 8 -> 16 channels
        self.conv2 = nn.Conv1d(in_channels=8, out_channels=16, kernel_size=11, stride=1, padding=1)
        self.relu2 = nn.ReLU()
        self.pool2 = nn.MaxPool1d(kernel_size=3)
        self.dropout2 = nn.Dropout(p=0.3)

        # Block 3: 16 -> 32 channels
        self.conv3 = nn.Conv1d(in_channels=16, out_channels=32, kernel_size=9, stride=1, padding=1)
        self.relu3 = nn.ReLU()
        self.pool3 = nn.MaxPool1d(kernel_size=3)
        self.dropout3 = nn.Dropout(p=0.3)

        # Block 4: 32 -> 64 channels
        self.conv4 = nn.Conv1d(in_channels=32, out_channels=64, kernel_size=7, stride=1, padding=1)
        self.relu4 = nn.ReLU()
        self.pool4 = nn.MaxPool1d(kernel_size=3)
        self.dropout4 = nn.Dropout(p=0.3)

        # After the four conv/pool blocks, an 8000-sample input is reduced to 64 channels x 96 steps = 6144 features
        self.fc1 = nn.Linear(in_features=6144, out_features=10)

    def forward(self, x):
        batch_size = x.size(0)

        x = self.conv1(x)
        x = self.relu1(x)
        x = self.pool1(x)
        x = self.dropout1(x)

        x = self.conv2(x)
        x = self.relu2(x)
        x = self.pool2(x)
        x = self.dropout2(x)

        x = self.conv3(x)
        x = self.relu3(x)
        x = self.pool3(x)
        x = self.dropout3(x)

        x = self.conv4(x)
        x = self.relu4(x)
        x = self.pool4(x)
        x = self.dropout4(x)

        # Flatten and classify
        x = x.view(batch_size, -1)
        x = self.fc1(x)
        return x
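# Optional shape check: the network expects input of shape (batch, 1, 8000) and
# returns logits of shape (batch, 10). Uncomment to verify locally:
#
# with torch.no_grad():
#     print(Conv1DNet()(torch.zeros(1, 1, 8000)).shape)  # torch.Size([1, 10])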
|
# Instantiate the network and load the trained weights for CPU inference
model = Conv1DNet()
model.load_state_dict(torch.load('model1.pt', map_location=torch.device('cpu')))
model.eval()
|
def asr(audio_path):
    # Load the clip at 8 kHz and trim or zero-pad it to exactly 8000 samples (1 second)
    samples, sample_rate = librosa.load(audio_path, sr=8000)
    if len(samples) > 8000:
        samples = samples[:8000]
    elif len(samples) < 8000:
        samples = np.pad(samples, (0, 8000 - len(samples)), mode='constant')

    # Shape the waveform as (batch=1, channels=1, length=8000) and run inference
    inputs = torch.from_numpy(samples).unsqueeze(0).unsqueeze(0)
    with torch.no_grad():
        y = model(inputs)
    predicted = torch.argmax(y, dim=1).item()

    labels = ['down', 'go', 'left', 'no', 'off', 'on', 'right', 'stop', 'up', 'yes']
    return labels[predicted]
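# Example of calling the prediction function directly (assumes a short WAV file,
# e.g. "sample.wav", is available locally):
#
# print(asr("sample.wav"))  # prints one of the ten keyword labels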
|
gr.Interface(
    fn=asr,
    inputs=gr.Audio(source="upload", type="filepath"),
    outputs="text",
    description="Speech command recognition: upload a short clip and the model predicts one of ten keywords (yes, no, up, down, left, right, on, off, stop, go).",
    theme="default",
).launch()