#!/usr/bin/env python
# coding: utf-8

import gradio as gr
import numpy as np
import torch
import torch.nn as nn
import librosa
class Conv1DNet(nn.Module):
    def __init__(self):
        super(Conv1DNet, self).__init__()
        # First Conv1D layer
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=8, kernel_size=13, stride=1, padding=1)
        self.relu1 = nn.ReLU()
        self.pool1 = nn.MaxPool1d(kernel_size=3)
        self.dropout1 = nn.Dropout(p=0.3)
        # Second Conv1D layer
        self.conv2 = nn.Conv1d(in_channels=8, out_channels=16, kernel_size=11, stride=1, padding=1)
        self.relu2 = nn.ReLU()
        self.pool2 = nn.MaxPool1d(kernel_size=3)
        self.dropout2 = nn.Dropout(p=0.3)
        # Third Conv1D layer
        self.conv3 = nn.Conv1d(in_channels=16, out_channels=32, kernel_size=9, stride=1, padding=1)
        self.relu3 = nn.ReLU()
        self.pool3 = nn.MaxPool1d(kernel_size=3)
        self.dropout3 = nn.Dropout(p=0.3)
        # Fourth Conv1D layer
        self.conv4 = nn.Conv1d(in_channels=32, out_channels=64, kernel_size=7, stride=1, padding=1)
        self.relu4 = nn.ReLU()
        self.pool4 = nn.MaxPool1d(kernel_size=3)
        self.dropout4 = nn.Dropout(p=0.3)
        # Fully connected layer
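        # in_features = 6144 = 64 output channels x 96 remaining time steps,
        # which is what the four conv/pool stages leave for a (1, 8000) input.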
        self.fc1 = nn.Linear(in_features=6144, out_features=10)
def forward(self, x):
# Pass the input through each layer
batch_size=x.size(0)
#print(x.size())
#print(x)
x = self.conv1(x)
x = self.relu1(x)
#print(x.size())
x = self.pool1(x)
x = self.dropout1(x)
x = self.conv2(x)
x = self.relu2(x)
x = self.pool2(x)
x = self.dropout2(x)
x = self.conv3(x)
x = self.relu3(x)
x = self.pool3(x)
x = self.dropout3(x)
x = self.conv4(x)
x = self.relu4(x)
x = self.pool4(x)
x = self.dropout4(x)
#print(x.size(0))
x = x.view(batch_size, -1)
x = self.fc1(x)
#x = self.fc2(x)
return x
model = Conv1DNet()
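# Load the trained weights; map_location keeps everything on the CPU, so the demo
# runs without a GPU. 'model1.pt' is assumed to be uploaded alongside app.py.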
model.load_state_dict(torch.load('model1.pt', map_location=torch.device('cpu')))
model.eval()  # inference mode: disables dropout
def asr(input):
    # Load the clip at 8 kHz and force it to exactly 8000 samples (1 second)
    samples, sample_rate = librosa.load(input, sr=8000)
    if len(samples) > 8000:
        samples = samples[:8000]
    if len(samples) < 8000:
        samples = np.pad(samples, (0, 8000 - len(samples)), mode='constant')
    # Shape the waveform as (batch=1, channels=1, length=8000) for Conv1d
    inputs = torch.from_numpy(samples).float().unsqueeze(0).unsqueeze(0)
    with torch.no_grad():
        y = model(inputs)
    predicted = torch.argmax(y, dim=1).item()
    labels = ['down', 'go', 'left', 'no', 'off', 'on', 'right', 'stop', 'up', 'yes']
    return labels[predicted]
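# Quick local sanity check (hypothetical file name, not shipped with the Space):
#   print(asr("examples/yes.wav"))  # expected to print one of the ten labels above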
gr.Interface(
    fn=asr,
    inputs=gr.Audio(type="filepath"),
    outputs="text",
    description="Speech-command recognition: upload a short clip and get one of ten keywords.",
    theme="default",
).launch()