File size: 5,537 Bytes
3259b0d 3b0eb78 3259b0d feeb2cc 3259b0d 9e91c7d 8726ca7 3259b0d a040674 8726ca7 61feacf 8726ca7 61feacf 8726ca7 61feacf a040674 3259b0d 3dffe1b f8245e0 3259b0d 34955e6 db62c12 655d8d0 38084b7 ab6e56e 655d8d0 db62c12 f845f9c f8245e0 3259b0d f8245e0 342a6ac cbd4d9a 342a6ac f845f9c 3259b0d 6781329 cffe116 3259b0d 7920743 5b99402 7884023 cffe116 4ae0adc 7884023 cffe116 32581d9 7884023 3259b0d 38084b7 3259b0d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 |
# https://tfhub.dev/google/lite-model/yamnet/classification/tflite/1
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import csv
# import matplotlib.pyplot as plt
# from IPython.display import Audio
from scipy.io import wavfile
import scipy
# import soundfile as sf
# import audio2numpy as a2n
import os
import gradio as gr
# import audio2numpy
# import numpy as np
from pydub import AudioSegment
from matplotlib import pyplot as plt
# https://stackoverflow.com/questions/53633177/how-to-read-a-mp3-audio-file-into-a-numpy-array-save-a-numpy-array-to-mp3
# def read(f, normalized=False):
# """MP3 to numpy array"""
# a = pydub.AudioSegment.from_mp3(f)
# y = np.array(a.get_array_of_samples())
# if a.channels == 2:
# y = y.reshape((-1, 2))
# if normalized:
# return a.frame_rate, np.float32(y) / 2**15
# else:
# return a.frame_rate, y
#
#
# def write(f, sr, x, normalized=False):
# """numpy array to MP3"""
# channels = 2 if (x.ndim == 2 and x.shape[1] == 2) else 1
# if normalized: # normalized array - each item should be a float in [-1, 1)
# y = np.int16(x * 2 ** 15)
# else:
# y = np.int16(x)
# song = pydub.AudioSegment(y.tobytes(), frame_rate=sr, sample_width=2, channels=channels)
# song.export(f, format="mp3", bitrate="320k")
# Load the YAMNet model from TensorFlow Hub; this runs once, when the module is imported.
model = hub.load('https://tfhub.dev/google/yamnet/1')
# When True, inference() prints the file names, shapes and sample data it handles.
debug = True
# Helper that reads the class display names from the model's class-map CSV; the
# top-scoring classes are picked later from scores mean-aggregated across frames.
def class_names_from_csv(class_map_csv_text):
    """Read the class display names listed in a class-map CSV file.

    Args:
        class_map_csv_text: Location of the CSV file. Despite the name, the
            value is opened as a file through ``tf.io.gfile.GFile`` rather
            than parsed as CSV text.

    Returns:
        The ``display_name`` column as a list, in file order (the order used
        to index the score vector).
    """
    with tf.io.gfile.GFile(class_map_csv_text) as handle:
        return [record['display_name'] for record in csv.DictReader(handle)]
# Ask the loaded model for the location of its class-map CSV and read the
# display names once at import time; inference() indexes class_names by class id.
class_map_path = model.class_map_path().numpy()
class_names = class_names_from_csv(class_map_path)
def ensure_sample_rate(original_sample_rate, waveform,
                       desired_sample_rate=16000):
    """Resample ``waveform`` to ``desired_sample_rate`` when the rates differ.

    Args:
        original_sample_rate: Sample rate (Hz) of ``waveform`` as read.
        waveform: Array of samples; its first axis is the time axis, so a
            multi-channel ``(samples, channels)`` array is resampled per
            channel.
        desired_sample_rate: Target sample rate in Hz.

    Returns:
        A ``(desired_sample_rate, waveform)`` tuple. The waveform is returned
        untouched when the rates already match; otherwise it is the output of
        ``scipy.signal.resample`` with a length scaled by the rate ratio.
    """
    if original_sample_rate == desired_sample_rate:
        return desired_sample_rate, waveform

    # Import the submodule explicitly: a bare ``import scipy`` does not
    # guarantee that ``scipy.signal`` is loaded on every SciPy release.
    from scipy import signal

    desired_length = int(round(float(len(waveform)) /
                               original_sample_rate * desired_sample_rate))
    return desired_sample_rate, signal.resample(waveform, desired_length)
# Fetch the example clip used by the demo only when it is not already present:
# an unconditional wget re-downloads on every start and, because the file
# already exists, piles up ``miaow_16k.wav.1``, ``.2``, ... copies.
if not os.path.exists("miaow_16k.wav"):
    os.system("wget https://storage.googleapis.com/audioset/miaow_16k.wav")
def _normalize_pcm(samples, source_dtype):
    """Scale samples that were decoded as ``source_dtype`` to float32 in [-1, 1]."""
    source_dtype = np.dtype(source_dtype)
    samples = np.asarray(samples, dtype=np.float64)
    if source_dtype.kind == 'u':
        # Unsigned PCM (8-bit WAV) is centred on the midpoint of its range.
        midpoint = (np.iinfo(source_dtype).max + 1) / 2.0
        samples = (samples - midpoint) / midpoint
    elif source_dtype.kind == 'i':
        # Signed PCM: divide by the full-scale value (32767 for int16).
        samples = samples / np.iinfo(source_dtype).max
    # Floating-point WAV data is conventionally already in [-1, 1]; keep it.
    return samples.astype(np.float32)


def inference(audio):
    """Classify an audio file and describe its five most likely sound classes.

    Args:
        audio: Path of a WAV or MP3 file. An MP3 is converted to WAV first and
            the MP3 itself is deleted, as before.

    Returns:
        A text summary naming the top two classes, followed by the top five
        class names and their mean scores across frames. A short message is
        returned instead when no file path is given.
    """
    if not audio:
        # Nothing was supplied; answer politely instead of crashing on None.
        return 'No audio file was provided.'

    wav_file_name = audio
    converted_wav = None
    if debug:
        print(f'read, wav_file_name: {wav_file_name}')
    if wav_file_name.lower().endswith('.mp3'):
        converted_wav = convMp3ToWav(wav_file_name)
        # NOTE: the source MP3 is removed once converted (original behaviour).
        os.remove(wav_file_name)
        wav_file_name = converted_wav
        if debug:
            print(f'covMp3ToWav, wav_file_name: {wav_file_name}')

    try:
        # Read fully into memory. The old extra 'rb' argument was taken as
        # ``mmap`` and silently memory-mapped the file.
        sample_rate, wav_data = wavfile.read(wav_file_name)
    finally:
        # The converted WAV is only a temporary; do not leave it behind.
        if converted_wav is not None and os.path.exists(converted_wav):
            os.remove(converted_wav)

    # Remember the on-disk sample type: resampling turns the data into floats,
    # but the normalisation scale depends on the original integer width.
    source_dtype = wav_data.dtype
    if debug:
        print(f'read, wav_data: {wav_data}')
        print(f'read, sample_rate: {sample_rate}, wav_data: {wav_data.shape}')
    sample_rate, wav_data = ensure_sample_rate(sample_rate, wav_data)
    if debug:
        print(f'ensure_sample_rate, sample_rate: {sample_rate}, wav_data: {wav_data.shape}')
        print(f'ensure_single_channel, wav_data.ndim: {wav_data.ndim}')
    if wav_data.ndim >= 2:
        # Keep only the first channel of multi-channel audio.
        wav_data = wav_data[:, 0]
    if debug:
        print(f'ensure_single_channel, wav_data: {wav_data.shape}')
        print(f'ensured, wav_data: {wav_data}')
    waveform = _normalize_pcm(wav_data, source_dtype)

    # Run the model; only the per-frame class scores are needed here.
    scores, _embeddings, _spectrogram = model(waveform)
    mean_scores = scores.numpy().mean(axis=0)
    # Indices of the five highest mean scores, best first.
    top_indices = np.argsort(mean_scores)[::-1][:5]
    top_names = [class_names[index] for index in top_indices]
    infered_class, second_class = top_names[0], top_names[1]

    # Only affects how later debug prints render float arrays; the result
    # string below is formatted explicitly.
    float_formatter = "{:.4f}".format
    np.set_printoptions(formatter={'float_kind': float_formatter})

    class_names_str = ', '.join(f'[{name}]' for name in top_names)
    scores_str = ', '.join(f'[{mean_scores[index]:.4f}]' for index in top_indices)
    return f'The main sound is: [{infered_class}], \n\nthe second sound is: [{second_class}]. \n\n classes: {class_names_str}, \n\n scores: {scores_str}'
def convMp3ToWav(wav_file_name):
    """Convert an audio file to WAV and return the path of the new file.

    The input (an MP3 when called from ``inference``, despite the parameter
    name) is decoded with pydub and exported as ``<wav_file_name>.wav``, i.e.
    the original path with ``.wav`` appended.
    """
    wav_path = wav_file_name + ".wav"
    decoded = AudioSegment.from_file(wav_file_name)
    decoded.export(wav_path, format="wav")
    return wav_path
# Gradio demo: one audio-file input, plain-text output, one bundled example clip.
examples = [['miaow_16k.wav']]
title = "yamnet"
description = "An audio event classifier trained on the AudioSet dataset to predict audio events from the AudioSet ontology."
demo = gr.Interface(
    fn=inference,
    inputs=gr.inputs.Audio(type="filepath"),
    outputs="text",
    examples=examples,
    title=title,
    description=description,
)
demo.launch(enable_queue=True)
|