# Hugging Face Space: PANNs sound event detection demo (Gradio app).
import gradio as gr | |
#import os | |
import matplotlib.pyplot as plt | |
import numpy as np | |
import librosa | |
from panns_inference import SoundEventDetection, labels | |
def plot_sound_event_detection_result(framewise_output):
    """Plot the top-5 detected sound-event classes over time and return the plot as an image.

    Args:
        framewise_output: np.ndarray of shape (time_steps, classes_num)
            holding per-frame class probabilities.

    Returns:
        np.ndarray: image array of the saved figure, as read back by
        ``plt.imread`` (so Gradio's "image" output can display it).
    """
    out_fig_path = 'sed.png'

    # Rank classes by their peak probability across all frames; keep the top 5.
    classwise_output = np.max(framewise_output, axis=0)  # (classes_num,)
    top_idxes = np.argsort(classwise_output)[::-1][:5]

    lines = []
    for idx in top_idxes:
        line, = plt.plot(framewise_output[:, idx], label=labels[idx])
        lines.append(line)

    plt.legend(handles=lines)
    plt.xlabel('Frames')
    plt.ylabel('Probability')
    plt.ylim(0, 1.)
    plt.savefig(out_fig_path)
    # Close the implicit figure so pyplot global state does not leak
    # between successive requests to the app.
    plt.close()
    print('Save fig to {}'.format(out_fig_path))
    return plt.imread(out_fig_path)
def pred(audio):
    """Run PANNs sound event detection on an uploaded audio clip.

    Args:
        audio: ``(sample_rate, samples)`` tuple as produced by ``gr.Audio``;
            ``samples`` is an integer/float array of shape (n,) or
            (n, channels).

    Returns:
        np.ndarray: image of the sound-event-detection plot
        (see plot_sound_event_detection_result).
    """
    rate, y = audio
    device = 'cpu'  # 'cuda' | 'cpu'

    # Convert to mono float32 and resample to the 32 kHz rate PANNs expects.
    y = y.astype(np.float32)
    y = librosa.core.to_mono(y.T)
    y = librosa.core.resample(y, orig_sr=rate, target_sr=32000)

    # Peak-normalize by max(|y|), not max(y): the latter produces NaNs for
    # silent input (division by zero) and wrong scaling when the waveform's
    # largest excursion is negative.
    peak = np.max(np.abs(y))
    if peak > 0:
        y = y / peak

    y = y[None, :]  # (batch_size, segment_samples)

    print('------ Sound event detection ------')
    sed = SoundEventDetection(checkpoint_path=None, device=device)
    framewise_output = sed.inference(y)  # (batch_size, time_steps, classes_num)
    return plot_sound_event_detection_result(framewise_output[0])
# Gradio UI: upload an audio file, get back the sound-event-detection plot.
demo = gr.Interface(
    fn=pred,
    inputs=gr.Audio(source="upload"),
    outputs="image",
    examples=[
        "telephone_speech.wav",
        "ringtone.wav",
        "animals.wav",
    ],
    title="Sound Event Detection",
    description="This is a demo huggingface space app for PANNs: Large-Scale Pretrained Audio Neural Networks for Audio Pattern Recognition. Please view README for more details.",
)

# Guard the launch so the module can be imported (e.g. for testing)
# without starting the server; Spaces run this file as __main__.
if __name__ == "__main__":
    demo.launch()