# yamnet / app.py
# https://tfhub.dev/google/lite-model/yamnet/classification/tflite/1
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import csv
from scipy.io import wavfile
import scipy.signal
import os
import gradio as gr
from pydub import AudioSegment
# https://stackoverflow.com/questions/53633177/how-to-read-a-mp3-audio-file-into-a-numpy-array-save-a-numpy-array-to-mp3
# def read(f, normalized=False):
#     """MP3 to numpy array"""
#     a = pydub.AudioSegment.from_mp3(f)
#     y = np.array(a.get_array_of_samples())
#     if a.channels == 2:
#         y = y.reshape((-1, 2))
#     if normalized:
#         return a.frame_rate, np.float32(y) / 2**15
#     else:
#         return a.frame_rate, y
#
#
# def write(f, sr, x, normalized=False):
#     """numpy array to MP3"""
#     channels = 2 if (x.ndim == 2 and x.shape[1] == 2) else 1
#     if normalized:  # normalized array - each item should be a float in [-1, 1)
#         y = np.int16(x * 2 ** 15)
#     else:
#         y = np.int16(x)
#     song = pydub.AudioSegment(y.tobytes(), frame_rate=sr, sample_width=2, channels=channels)
#     song.export(f, format="mp3", bitrate="320k")
# Load the model.
model = hub.load('https://tfhub.dev/google/yamnet/1')
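# Per the YAMNet documentation, the model expects a mono float32 waveform
# sampled at 16 kHz with values in [-1.0, 1.0], and returns three tensors:
# per-frame class scores of shape (frames, 521), per-frame embeddings of
# shape (frames, 1024), and a log-mel spectrogram.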
debug = True
# Find the name of the class with the top score when mean-aggregated across frames.
def class_names_from_csv(class_map_csv_text):
    """Returns list of class names corresponding to score vector."""
    class_names = []
    with tf.io.gfile.GFile(class_map_csv_text) as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            class_names.append(row['display_name'])
    return class_names
class_map_path = model.class_map_path().numpy()
class_names = class_names_from_csv(class_map_path)
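# Optional sanity check: YAMNet's class map covers 521 AudioSet classes.
if debug: print(f'loaded {len(class_names)} class names, e.g. {class_names[:3]}')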
def ensure_sample_rate(original_sample_rate, waveform,
                       desired_sample_rate=16000):
    """Resample waveform if required."""
    if original_sample_rate != desired_sample_rate:
        desired_length = int(round(float(len(waveform)) /
                                   original_sample_rate * desired_sample_rate))
        waveform = scipy.signal.resample(waveform, desired_length)
    return desired_sample_rate, waveform
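# Worked example: a 1-second clip recorded at 44.1 kHz (44100 samples) is
# resampled to round(44100 / 44100 * 16000) = 16000 samples at 16 kHz.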
# Fetch a sample clip for the Gradio examples list.
os.system("wget https://storage.googleapis.com/audioset/miaow_16k.wav")
def inference(audio):
    wav_file_name = audio
    if debug: print(f'read, wav_file_name: {wav_file_name}')
    if wav_file_name.endswith('.mp3'):
        # Convert MP3 input to WAV before reading it as a waveform.
        new_wav = convMp3ToWav(wav_file_name)
        os.remove(wav_file_name)
        wav_file_name = new_wav
        if debug: print(f'convMp3ToWav, wav_file_name: {wav_file_name}')
    sample_rate, wav_data = wavfile.read(wav_file_name)
    if debug: print(f'read, sample_rate: {sample_rate}, wav_data: {wav_data.shape}')
    # Resample to the 16 kHz the model expects.
    sample_rate, wav_data = ensure_sample_rate(sample_rate, wav_data)
    if debug: print(f'ensure_sample_rate, sample_rate: {sample_rate}, wav_data: {wav_data.shape}')
    # Keep only the first channel of stereo input.
    if wav_data.ndim >= 2:
        wav_data = wav_data[:, 0]
    if debug: print(f'ensure_single_channel, wav_data: {wav_data.shape}')
    # Normalize int16 samples to floats in [-1.0, 1.0].
    waveform = wav_data / tf.int16.max
    # Run the model, check the output.
    scores, embeddings, spectrogram = model(waveform)
    scores_np = scores.numpy()
    # Mean-aggregate the per-frame scores, then sort ascending: the last
    # entries of the sorted arrays are the top-scoring classes.
    mean_scores = scores_np.mean(axis=0)
    scores_np_sorted = np.sort(mean_scores)
    scores_np_arg_sorted = np.argsort(mean_scores)
    top5_index = scores_np_arg_sorted[-1:-6:-1]
    inferred_class = class_names[top5_index[0]]
    second_class = class_names[top5_index[1]]
    class_names_str = ', '.join(f'[{class_names[i]}]' for i in top5_index)
    scores_str = ', '.join('[{:.4f}]'.format(scores_np_sorted[-(i + 1)]) for i in range(5))
    return (f'The main sound is: [{inferred_class}], \n\n'
            f'the second sound is: [{second_class}]. \n\n'
            f' classes: {class_names_str}, \n\n scores: {scores_str}')
def convMp3ToWav(mp3_file_name):
    """Convert an MP3 file to WAV and return the new file's path."""
    src = mp3_file_name
    dst = mp3_file_name + ".wav"
    # Convert mp3 to wav.
    sound = AudioSegment.from_file(src)
    sound.export(dst, format="wav")
    return dst
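# Note: pydub decodes MP3 through an external ffmpeg (or libav) binary, so one
# of those must be installed for AudioSegment.from_file to handle .mp3 input.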
examples = [['miaow_16k.wav']]
title = "yamnet"
description = "An audio event classifier trained on the AudioSet dataset to predict audio events from the AudioSet ontology."
gr.Interface(inference, gr.inputs.Audio(type="filepath"), "text", examples=examples, title=title,
             description=description).launch(enable_queue=True)