Spaces:

alnalda
/

speech_command_classification_with_torchaudio

Runtime error

App Files Files Community

speech_command_classification_with_torchaudio / app.py

alnalda

Update app.py

9d31e5f over 2 years ago

raw

history blame contribute delete

No virus

4.22 kB

	import torch
	import torch.nn as nn
	import torch.nn.functional as F
	import torch.optim as optim
	import torchaudio
	import torchaudio.functional as Fu
	import sys

	import matplotlib.pyplot as plt
	import IPython.display as ipd

	from tqdm import tqdm
	import gradio as gr
	import ffmpeg
	import resampy

	# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
	device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
	# Nominal input rate in Hz. predict() binds a local of the same name from
	# torchaudio.load, so this module-level value is shadowed there.
	sample_rate = 48_000
	# Target rate in Hz handed to torchaudio's Resample transform as `new_freq`.
	new_sample_rate = 8_000

	import torch
	import torch.nn as nn
	import torch.nn.functional as F
	import torch.optim as optim
	import torchaudio
	import torchaudio.functional as Fu
	import sys

	import matplotlib.pyplot as plt
	import IPython.display as ipd

	from tqdm import tqdm
	import gradio as gr
	import ffmpeg
	import resampy

	# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
	device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
	# Nominal input rate in Hz. predict() binds a local of the same name from
	# torchaudio.load, so this module-level value is shadowed there.
	sample_rate = 48_000
	# Target rate in Hz handed to torchaudio's Resample transform as `new_freq`.
	new_sample_rate = 8_000

	class M5(nn.Module):
	    """1-D convolutional classifier operating directly on waveforms.

	    The network is four ``Conv1d -> BatchNorm1d -> ReLU -> MaxPool1d(4)``
	    stages, an average pool over whatever time steps remain, and a linear
	    layer followed by ``log_softmax``.

	    Args:
	        n_input: Number of input channels of the first convolution.
	        n_output: Number of classes produced by the final linear layer.
	        stride: Stride of the first convolution (kernel size 80).
	        n_channel: Channel width of the first two stages; the last two
	            stages use ``2 * n_channel``.

	    Attribute names (``conv1`` ... ``fc1``) are kept exactly as they were so
	    that previously pickled instances and state dicts still load.
	    """

	    def __init__(self, n_input=1, n_output=35, stride=16, n_channel=32):
	        super().__init__()
	        wide = 2 * n_channel

	        self.conv1 = nn.Conv1d(n_input, n_channel, kernel_size=80, stride=stride)
	        self.bn1 = nn.BatchNorm1d(n_channel)
	        self.pool1 = nn.MaxPool1d(4)

	        self.conv2 = nn.Conv1d(n_channel, n_channel, kernel_size=3)
	        self.bn2 = nn.BatchNorm1d(n_channel)
	        self.pool2 = nn.MaxPool1d(4)

	        self.conv3 = nn.Conv1d(n_channel, wide, kernel_size=3)
	        self.bn3 = nn.BatchNorm1d(wide)
	        self.pool3 = nn.MaxPool1d(4)

	        self.conv4 = nn.Conv1d(wide, wide, kernel_size=3)
	        self.bn4 = nn.BatchNorm1d(wide)
	        self.pool4 = nn.MaxPool1d(4)

	        self.fc1 = nn.Linear(wide, n_output)

	    def forward(self, x):
	        """Return per-class log-probabilities for a batch of waveforms.

	        Args:
	            x: Tensor of shape ``(batch, n_input, time)``.

	        Returns:
	            Tensor of shape ``(batch, 1, n_output)`` whose last dimension
	            holds log-probabilities.
	        """
	        x = self.pool1(F.relu(self.bn1(self.conv1(x))))
	        x = self.pool2(F.relu(self.bn2(self.conv2(x))))
	        x = self.pool3(F.relu(self.bn3(self.conv3(x))))
	        x = self.pool4(F.relu(self.bn4(self.conv4(x))))
	        # Collapse the remaining time axis to a single step.
	        x = F.avg_pool1d(x, x.shape[-1])
	        # (batch, channels, 1) -> (batch, 1, channels) so Linear sees channels last.
	        x = x.permute(0, 2, 1)
	        return F.log_softmax(self.fc1(x), dim=2)

	def get_likely_index(tensor):
	    """Return the index of the largest value along the last dimension.

	    Args:
	        tensor: Tensor of scores; the last dimension is searched.

	    Returns:
	        Integer tensor with the last dimension removed, holding the
	        position of the maximum for each remaining element.
	    """
	    return tensor.argmax(dim=-1)

	# Fallback class names, used only when the module does not define `labels`.
	# NOTE(review): this is the alphabetically sorted 35-word Speech Commands
	# vocabulary; it is assumed to match the order the model was trained with.
	# Confirm against the training script before relying on it.
	_SPEECH_COMMANDS_LABELS = (
	    "backward", "bed", "bird", "cat", "dog", "down", "eight", "five",
	    "follow", "forward", "four", "go", "happy", "house", "learn", "left",
	    "marvin", "nine", "no", "off", "on", "one", "right", "seven", "sheila",
	    "six", "stop", "three", "tree", "two", "up", "visual", "wow", "yes",
	    "zero",
	)


	def index_to_label(index):
	    """Return the word stored at position ``index`` of the label table.

	    A module-level ``labels`` sequence is used when one exists. If it is
	    missing, the function falls back to ``_SPEECH_COMMANDS_LABELS`` instead
	    of failing with ``NameError``.

	    Args:
	        index: Integer (or zero-dimensional integer tensor) class index.

	    Returns:
	        The label at that position.

	    Raises:
	        IndexError: If ``index`` is outside the label table.
	    """
	    label_table = globals().get("labels", _SPEECH_COMMANDS_LABELS)
	    return label_table[index]

	def predict(filepath):
	    """Run the module-level ``model`` on one audio file and return its raw output.

	    The file is loaded with ``torchaudio.load``, averaged down to a single
	    channel, resampled to ``new_sample_rate`` and passed to ``model`` as a
	    batch of one.

	    Args:
	        filepath: Path of the audio file to classify.

	    Returns:
	        Whatever ``model`` returns for the ``(1, 1, time)`` input; the
	        conversion to a label is intentionally not applied here.

	    Raises:
	        ValueError: If ``filepath`` is ``None`` (nothing was recorded).
	    """
	    if filepath is None:
	        raise ValueError("No audio was provided.")

	    waveform, source_rate = torchaudio.load(filepath)

	    # Down-mix by averaging channels. The previous MuLawEncoding with
	    # quantization_channels=1 quantised amplitudes instead of merging
	    # channels, and with a single level it wiped out the signal.
	    waveform = waveform.mean(dim=0, keepdim=True)

	    # NOTE(review): resampling was commented out here; it is restored on the
	    # assumption that the model expects `new_sample_rate` input - confirm.
	    if source_rate != new_sample_rate:
	        resampler = torchaudio.transforms.Resample(orig_freq=source_rate, new_freq=new_sample_rate)
	        waveform = resampler(waveform)

	    # Keep the waveform in floating point (an integer tensor cannot go
	    # through the convolution weights) and place it next to the model's own
	    # parameters so the two can never sit on different devices.
	    waveform = waveform.to(next(model.parameters()).device)

	    with torch.no_grad():
	        return model(waveform.unsqueeze(0))

	# Load the pickled model straight onto the device used for inference.
	# NOTE(security): torch.load unpickles arbitrary objects - only load trusted files.
	model = torch.load('export.pkl', map_location=device)
	model = model.to(device)
	# Inference only: switch BatchNorm layers to their stored running statistics.
	model.eval()

	# Microphone recording in, predicted text out. launch() is called at import
	# time, exactly as before.
	gr.Interface(fn=predict, inputs=gr.inputs.Audio(source='microphone',type='filepath'), outputs='text').launch(share=False)

	def get_likely_index(tensor):
	    """Return the index of the largest value along the last dimension.

	    Args:
	        tensor: Tensor of scores; the last dimension is searched.

	    Returns:
	        Integer tensor with the last dimension removed, holding the
	        position of the maximum for each remaining element.
	    """
	    return tensor.argmax(dim=-1)

	# Fallback class names, used only when the module does not define `labels`.
	# NOTE(review): this is the alphabetically sorted 35-word Speech Commands
	# vocabulary; it is assumed to match the order the model was trained with.
	# Confirm against the training script before relying on it.
	_SPEECH_COMMANDS_LABELS = (
	    "backward", "bed", "bird", "cat", "dog", "down", "eight", "five",
	    "follow", "forward", "four", "go", "happy", "house", "learn", "left",
	    "marvin", "nine", "no", "off", "on", "one", "right", "seven", "sheila",
	    "six", "stop", "three", "tree", "two", "up", "visual", "wow", "yes",
	    "zero",
	)


	def index_to_label(index):
	    """Return the word stored at position ``index`` of the label table.

	    A module-level ``labels`` sequence is used when one exists. If it is
	    missing, the function falls back to ``_SPEECH_COMMANDS_LABELS`` instead
	    of failing with ``NameError``.

	    Args:
	        index: Integer (or zero-dimensional integer tensor) class index.

	    Returns:
	        The label at that position.

	    Raises:
	        IndexError: If ``index`` is outside the label table.
	    """
	    label_table = globals().get("labels", _SPEECH_COMMANDS_LABELS)
	    return label_table[index]

	def predict(filepath):
	    """Classify one audio file and return the predicted label.

	    The file is loaded with ``torchaudio.load``, averaged down to a single
	    channel, resampled to ``new_sample_rate`` and passed to the module-level
	    ``model`` as a batch of one. The highest-scoring index is then mapped to
	    its label with ``get_likely_index`` and ``index_to_label``.

	    Args:
	        filepath: Path of the audio file to classify.

	    Returns:
	        The label returned by ``index_to_label`` for the winning class.

	    Raises:
	        ValueError: If ``filepath`` is ``None`` (nothing was recorded).
	    """
	    if filepath is None:
	        raise ValueError("No audio was provided.")

	    waveform, source_rate = torchaudio.load(filepath)

	    # Down-mix by averaging channels. The previous MuLawEncoding with
	    # quantization_channels=1 quantised amplitudes instead of merging
	    # channels, and with a single level it wiped out the signal.
	    waveform = waveform.mean(dim=0, keepdim=True)

	    # Resample while the data is still floating point; the old code cast to
	    # an integer tensor first, which neither Resample nor the convolution
	    # weights can work with.
	    if source_rate != new_sample_rate:
	        resampler = torchaudio.transforms.Resample(orig_freq=source_rate, new_freq=new_sample_rate)
	        waveform = resampler(waveform)

	    # Place the input next to the model's own parameters so the two can
	    # never sit on different devices.
	    waveform = waveform.to(next(model.parameters()).device)

	    with torch.no_grad():
	        scores = model(waveform.unsqueeze(0))

	    best = get_likely_index(scores)
	    return index_to_label(best.squeeze())

	# Load the pickled model straight onto the device used for inference.
	# NOTE(security): torch.load unpickles arbitrary objects - only load trusted files.
	model = torch.load('export.pkl', map_location=device)
	model = model.to(device)
	# Inference only: switch BatchNorm layers to their stored running statistics.
	model.eval()

	# Microphone recording in, predicted text out. launch() is called at import
	# time, exactly as before.
	gr.Interface(fn=predict, inputs=gr.inputs.Audio(source='microphone',type='filepath'), outputs='text').launch(share=False)