# yamnet / app.py
# https://tfhub.dev/google/lite-model/yamnet/classification/tflite/1
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import csv
from scipy.io import wavfile
import scipy.signal
import os
import gradio as gr
from pydub import AudioSegment
# https://stackoverflow.com/questions/53633177/how-to-read-a-mp3-audio-file-into-a-numpy-array-save-a-numpy-array-to-mp3
# def read(f, normalized=False):
#     """MP3 to numpy array"""
#     a = pydub.AudioSegment.from_mp3(f)
#     y = np.array(a.get_array_of_samples())
#     if a.channels == 2:
#         y = y.reshape((-1, 2))
#     if normalized:
#         return a.frame_rate, np.float32(y) / 2**15
#     else:
#         return a.frame_rate, y
#
#
# def write(f, sr, x, normalized=False):
#     """numpy array to MP3"""
#     channels = 2 if (x.ndim == 2 and x.shape[1] == 2) else 1
#     if normalized:  # normalized array - each item should be a float in [-1, 1)
#         y = np.int16(x * 2 ** 15)
#     else:
#         y = np.int16(x)
#     song = pydub.AudioSegment(y.tobytes(), frame_rate=sr, sample_width=2, channels=channels)
#     song.export(f, format="mp3", bitrate="320k")
# Load the model.
model = hub.load('https://tfhub.dev/google/yamnet/1')
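# Per the YAMNet documentation, the model expects a mono float32 waveform
# sampled at 16 kHz with values in [-1.0, 1.0], and returns three tensors:
# per-frame class scores of shape (frames, 521), per-frame embeddings of
# shape (frames, 1024), and a log-mel spectrogram.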
debug = True
# Find the name of the class with the top score when mean-aggregated across frames.
def class_names_from_csv(class_map_csv_text):
    """Returns list of class names corresponding to score vector."""
    class_names = []
    with tf.io.gfile.GFile(class_map_csv_text) as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            class_names.append(row['display_name'])
    return class_names
class_map_path = model.class_map_path().numpy()
class_names = class_names_from_csv(class_map_path)
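# Optional sanity check: YAMNet's class map covers 521 AudioSet classes.
if debug: print(f'loaded {len(class_names)} class names, e.g. {class_names[:3]}')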
def ensure_sample_rate(original_sample_rate, waveform,
                       desired_sample_rate=16000):
    """Resample waveform if required."""
    if original_sample_rate != desired_sample_rate:
        desired_length = int(round(float(len(waveform)) /
                                   original_sample_rate * desired_sample_rate))
        waveform = scipy.signal.resample(waveform, desired_length)
    return desired_sample_rate, waveform
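# Worked example: a 1-second clip recorded at 44.1 kHz (44100 samples) is
# resampled to round(44100 / 44100 * 16000) = 16000 samples at 16 kHz.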
# Fetch a sample clip for the Gradio examples list.
os.system("wget https://storage.googleapis.com/audioset/miaow_16k.wav")
def inference(audio):
    wav_file_name = audio
    if debug: print(f'read, wav_file_name: {wav_file_name}')
    if wav_file_name.endswith('.mp3'):
        # Convert MP3 input to WAV before reading it as a waveform.
        new_wav = convMp3ToWav(wav_file_name)
        os.remove(wav_file_name)
        wav_file_name = new_wav
        if debug: print(f'convMp3ToWav, wav_file_name: {wav_file_name}')
    sample_rate, wav_data = wavfile.read(wav_file_name)
    if debug: print(f'read, sample_rate: {sample_rate}, wav_data: {wav_data.shape}')
    # Resample to the 16 kHz the model expects.
    sample_rate, wav_data = ensure_sample_rate(sample_rate, wav_data)
    if debug: print(f'ensure_sample_rate, sample_rate: {sample_rate}, wav_data: {wav_data.shape}')
    # Keep only the first channel of stereo input.
    if wav_data.ndim >= 2:
        wav_data = wav_data[:, 0]
    if debug: print(f'ensure_single_channel, wav_data: {wav_data.shape}')
    # Normalize int16 samples to floats in [-1.0, 1.0].
    waveform = wav_data / tf.int16.max
    # Run the model, check the output.
    scores, embeddings, spectrogram = model(waveform)
    scores_np = scores.numpy()
    # Mean-aggregate the per-frame scores, then sort ascending: the last
    # entries of the sorted arrays are the top-scoring classes.
    mean_scores = scores_np.mean(axis=0)
    scores_np_sorted = np.sort(mean_scores)
    scores_np_arg_sorted = np.argsort(mean_scores)
    top5_index = scores_np_arg_sorted[-1:-6:-1]
    inferred_class = class_names[top5_index[0]]
    second_class = class_names[top5_index[1]]
    class_names_str = ', '.join(f'[{class_names[i]}]' for i in top5_index)
    scores_str = ', '.join('[{:.4f}]'.format(scores_np_sorted[-(i + 1)]) for i in range(5))
    return (f'The main sound is: [{inferred_class}], \n\n'
            f'the second sound is: [{second_class}]. \n\n'
            f' classes: {class_names_str}, \n\n scores: {scores_str}')
def convMp3ToWav(mp3_file_name):
    """Convert an MP3 file to WAV and return the new file's path."""
    src = mp3_file_name
    dst = mp3_file_name + ".wav"
    # Convert mp3 to wav.
    sound = AudioSegment.from_file(src)
    sound.export(dst, format="wav")
    return dst
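# Note: pydub decodes MP3 through an external ffmpeg (or libav) binary, so one
# of those must be installed for AudioSegment.from_file to handle .mp3 input.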
examples = [['miaow_16k.wav']]
title = "yamnet"
description = "An audio event classifier trained on the AudioSet dataset to predict audio events from the AudioSet ontology."
gr.Interface(inference, gr.inputs.Audio(type="filepath"), "text", examples=examples, title=title,
             description=description).launch(enable_queue=True)