#!/usr/bin/env python
# coding: utf-8

# In[1]:


import gradio as gr
import numpy as np
import torch


# In[11]:


import torch.nn as nn


# In[12]:


class Conv1DNet(nn.Module):
    """Four-block 1D CNN that classifies one second of 8 kHz audio into 10 commands."""

    def __init__(self):
        super(Conv1DNet, self).__init__()
        # First Conv1D layer
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=8, kernel_size=13, stride=1, padding=1)
        self.relu1 = nn.ReLU()
        self.pool1 = nn.MaxPool1d(kernel_size=3)
        self.dropout1 = nn.Dropout(p=0.3)

        # Second Conv1D layer
        self.conv2 = nn.Conv1d(in_channels=8, out_channels=16, kernel_size=11, stride=1, padding=1)
        self.relu2 = nn.ReLU()
        self.pool2 = nn.MaxPool1d(kernel_size=3)
        self.dropout2 = nn.Dropout(p=0.3)

        # Third Conv1D layer
        self.conv3 = nn.Conv1d(in_channels=16, out_channels=32, kernel_size=9, stride=1, padding=1)
        self.relu3 = nn.ReLU()
        self.pool3 = nn.MaxPool1d(kernel_size=3)
        self.dropout3 = nn.Dropout(p=0.3)

        # Fourth Conv1D layer
        self.conv4 = nn.Conv1d(in_channels=32, out_channels=64, kernel_size=7, stride=1, padding=1)
        self.relu4 = nn.ReLU()
        self.pool4 = nn.MaxPool1d(kernel_size=3)
        self.dropout4 = nn.Dropout(p=0.3)

        # Fully connected layer: for an 8000-sample input the conv/pool stack
        # yields (64 channels x 96 time steps) = 6144 features
        self.fc1 = nn.Linear(in_features=6144, out_features=10)

    def forward(self, x):
        batch_size = x.size(0)

        x = self.conv1(x)
        x = self.relu1(x)
        x = self.pool1(x)
        x = self.dropout1(x)

        x = self.conv2(x)
        x = self.relu2(x)
        x = self.pool2(x)
        x = self.dropout2(x)

        x = self.conv3(x)
        x = self.relu3(x)
        x = self.pool3(x)
        x = self.dropout3(x)

        x = self.conv4(x)
        x = self.relu4(x)
        x = self.pool4(x)
        x = self.dropout4(x)

        # Flatten (batch, 64, 96) -> (batch, 6144) before the classifier
        x = x.view(batch_size, -1)
        x = self.fc1(x)
        return x


model = Conv1DNet()
model.load_state_dict(torch.load('model1.pt', map_location=torch.device('cpu')))


# In[15]:


model.eval()

import librosa


def asr(audio_path):
    """Predict the spoken command contained in the audio file at audio_path."""
    # Load at 8 kHz and force the clip to exactly one second (8000 samples)
    samples, sample_rate = librosa.load(audio_path, sr=8000)
    if len(samples) > 8000:
        samples = samples[:8000]
    elif len(samples) < 8000:
        samples = np.pad(samples, (0, 8000 - len(samples)), mode='constant')

    # Shape the waveform as (batch=1, channels=1, length=8000) for Conv1d
    inputs = torch.from_numpy(samples).unsqueeze(0).unsqueeze(0)
    with torch.no_grad():
        y = model(inputs)
    predicted = torch.argmax(y, dim=1).item()

    labels = ['down', 'go', 'left', 'no', 'off', 'on', 'right', 'stop', 'up', 'yes']
    return labels[predicted]


gr.Interface(
    fn=asr,
    inputs=gr.inputs.Audio(source="upload", type="filepath"),  # legacy gr.inputs namespace (Gradio 2.x / early 3.x)
    outputs="text",
    description="Speech command recognition (ASR)",
    theme="default",
).launch()
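

# In[ ]:


# A minimal sketch of a direct (non-UI) sanity check, assuming a hypothetical
# recording "example.wav" exists next to this script. Uncomment and run it in
# place of (or before) the blocking .launch() call above.
# print(asr("example.wav"))  # expected output: one of the ten labels, e.g. "yes"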