import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
import librosa
from panns_inference import SoundEventDetection, labels

def plot_sound_event_detection_result(framewise_output):
    """Visualize a sound event detection result.

    Args:
        framewise_output: (time_steps, classes_num)

    Returns:
        The saved figure read back as a NumPy image array.
    """
    out_fig_path = 'sed.png'

    # Rank classes by their peak framewise probability and keep the top 5.
    classwise_output = np.max(framewise_output, axis=0)  # (classes_num,)
    idxes = np.argsort(classwise_output)[::-1][0:5]

    ix_to_lb = {i: label for i, label in enumerate(labels)}

    lines = []
    for idx in idxes:
        line, = plt.plot(framewise_output[:, idx], label=ix_to_lb[idx])
        lines.append(line)

    plt.legend(handles=lines)
    plt.xlabel('Frames')
    plt.ylabel('Probability')
    plt.ylim(0, 1.)
    plt.savefig(out_fig_path)
    plt.close()
    print('Saved figure to {}'.format(out_fig_path))

    # Read the figure back from disk so it can be returned to Gradio as an image.
    return plt.imread(out_fig_path)
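
# A minimal sketch (not wired into the app) for checking the plot helper in
# isolation, assuming random scores stand in for real model output; the names
# below are illustrative only:
#
#   dummy_scores = np.random.rand(100, len(labels))  # (time_steps, classes_num)
#   img = plot_sound_event_detection_result(dummy_scores)
#   print(img.shape)  # image array read back from 'sed.png'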
def pred(audio):
    rate, y = audio
    device = 'cpu'  # 'cuda' | 'cpu'

    # Gradio passes (sample_rate, data) with integer PCM samples; convert to
    # float32 and fold any stereo channels down to mono.
    y = y.astype(np.float32)
    y = librosa.core.to_mono(y.T)

    # PANNs models expect 32 kHz input, so resample to match.
    y = librosa.core.resample(y, orig_sr=rate, target_sr=32000)

    # Peak-normalize into [-1, 1]. Using the absolute maximum (rather than
    # y.max()) also handles waveforms whose largest excursion is negative.
    y = y / np.max(np.abs(y))

    # Add a batch dimension: (batch_size, segment_samples).
    y = y[None, :]

    # panns_inference also provides AudioTagging (clipwise output + embedding);
    # only sound event detection is used here.
    print('------ Sound event detection ------')
    sed = SoundEventDetection(checkpoint_path=None, device=device)
    framewise_output = sed.inference(y)  # (batch_size, time_steps, classes_num)
    return plot_sound_event_detection_result(framewise_output[0])
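
# A minimal sketch of driving pred() outside Gradio (assumes the soundfile
# package is installed and one of the bundled example files is present; the
# variable names are illustrative):
#
#   import soundfile as sf
#   data, sr = sf.read('telephone_speech.wav', dtype='int16')
#   image_array = pred((sr, data))  # same (rate, samples) tuple gr.Audio supplies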
demo = gr.Interface(
    pred,
    gr.Audio(source="upload"),
    "image",
    examples=[
        "telephone_speech.wav",
        "ringtone.wav",
        "animals.wav",
    ],
    title="Sound Event Detection",
    description="This is a demo Hugging Face Space for PANNs: Large-Scale Pretrained Audio Neural Networks for Audio Pattern Recognition. Please see the README for more details.",
)

demo.launch()
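
# To run this Space locally (a suggested setup, not part of the original file):
#   pip install gradio librosa matplotlib panns_inference
#   python app.py
# With checkpoint_path=None, panns_inference downloads the pretrained PANNs
# checkpoint on first use.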