Spaces:

thelou1s
/

yamnet

Sleeping

File size: 5,537 Bytes

3259b0d
 
 
 
 
 
 
 
3b0eb78
 
3259b0d
feeb2cc
3259b0d
9e91c7d
8726ca7
3259b0d
 
a040674
 
8726ca7
61feacf
 
8726ca7
 
61feacf
 
8726ca7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61feacf
a040674
3259b0d
 
 
3dffe1b
f8245e0
3259b0d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34955e6
db62c12
 
655d8d0
38084b7
 
 
ab6e56e
655d8d0
 
db62c12
f845f9c
f8245e0
3259b0d
f8245e0
342a6ac
cbd4d9a
342a6ac
f845f9c
3259b0d
 
 
 
 
 
 
 
6781329
 
 
cffe116
 
 
 
3259b0d
7920743
5b99402
7884023
cffe116
4ae0adc
7884023
cffe116
32581d9
7884023
3259b0d
 
38084b7
 
 
 
 
 
 
 
 
3259b0d


# https://tfhub.dev/google/lite-model/yamnet/classification/tflite/1

import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import csv

# import matplotlib.pyplot as plt
# from IPython.display import Audio
from scipy.io import wavfile
import scipy

# import soundfile as sf
# import audio2numpy as a2n
import os

import gradio as gr

# import audio2numpy
# import numpy as np

from pydub import AudioSegment
from matplotlib import pyplot as plt


# https://stackoverflow.com/questions/53633177/how-to-read-a-mp3-audio-file-into-a-numpy-array-save-a-numpy-array-to-mp3
# def read(f, normalized=False):
#     """MP3 to numpy array"""
#     a = pydub.AudioSegment.from_mp3(f)
#     y = np.array(a.get_array_of_samples())
#     if a.channels == 2:
#         y = y.reshape((-1, 2))
#     if normalized:
#         return a.frame_rate, np.float32(y) / 2**15
#     else:
#         return a.frame_rate, y
#
#
# def write(f, sr, x, normalized=False):
#     """numpy array to MP3"""
#     channels = 2 if (x.ndim == 2 and x.shape[1] == 2) else 1
#     if normalized:  # normalized array - each item should be a float in [-1, 1)
#         y = np.int16(x * 2 ** 15)
#     else:
#         y = np.int16(x)
#     song = pydub.AudioSegment(y.tobytes(), frame_rate=sr, sample_width=2, channels=channels)
#     song.export(f, format="mp3", bitrate="320k")


# Load the YAMNet model from TensorFlow Hub once, at import time. `inference`
# calls this module-level object for every request.
model = hub.load('https://tfhub.dev/google/yamnet/1')

# When true, `inference` prints intermediate values (file names, sample rate,
# array shapes) to stdout.
debug = True


# Read the class display names used to label the entries of the model's score vector.
def class_names_from_csv(class_map_csv_text):
    """Return the ``display_name`` column of the class-map CSV as a list.

    Args:
        class_map_csv_text: Path of the CSV file; it is opened through
            ``tf.io.gfile.GFile`` and parsed with ``csv.DictReader``, so the
            first row must be a header containing ``display_name``.

    Returns:
        The display names in the order the rows appear in the file.
    """
    with tf.io.gfile.GFile(class_map_csv_text) as class_map_file:
        return [record['display_name'] for record in csv.DictReader(class_map_file)]


# Location of the class-map CSV as reported by the loaded model, and the
# display names read from it. `inference` indexes `class_names` with the
# positions of the highest mean scores.
class_map_path = model.class_map_path().numpy()
class_names = class_names_from_csv(class_map_path)


def ensure_sample_rate(original_sample_rate, waveform,
                       desired_sample_rate=16000):
    """Resample ``waveform`` to ``desired_sample_rate`` when the rates differ.

    Args:
        original_sample_rate: Sample rate of ``waveform`` in Hz.
        waveform: Sequence or array of samples. Resampling runs along the
            first axis, so ``len(waveform)`` is taken as the number of frames.
        desired_sample_rate: Target sample rate in Hz.

    Returns:
        A ``(desired_sample_rate, waveform)`` tuple. The input is returned
        unchanged when it is already at the target rate or contains no
        samples; otherwise the waveform is the FFT-resampled array.
    """
    # Import the submodule explicitly: a bare ``import scipy`` does not
    # guarantee that ``scipy.signal`` has been loaded on every SciPy version.
    from scipy import signal

    if original_sample_rate == desired_sample_rate:
        return desired_sample_rate, waveform

    # Nothing to resample; avoids handing an empty array to the FFT routine.
    if len(waveform) == 0:
        return desired_sample_rate, waveform

    desired_length = int(round(float(len(waveform)) /
                               original_sample_rate * desired_sample_rate))
    return desired_sample_rate, signal.resample(waveform, desired_length)


def _fetch_example_audio(url, filename):
    """Download the example clip at ``url`` to ``filename`` if it is missing.

    The download is skipped when ``filename`` already exists, so restarting
    the app does not fetch the clip again. Data is written to a ``.part``
    file first and renamed on success, so an interrupted download never
    leaves a truncated file under the final name.

    Failures are reported on stdout and not raised: the app can still start
    without the example clip.
    """
    import urllib.request  # local import: only needed for this one-off download

    if os.path.exists(filename):
        return

    partial = filename + ".part"
    try:
        urllib.request.urlretrieve(url, partial)
        os.replace(partial, filename)
    except OSError as err:  # URLError/HTTPError are OSError subclasses
        print(f'could not download {url}: {err}')
        if os.path.exists(partial):
            os.remove(partial)


_fetch_example_audio("https://storage.googleapis.com/audioset/miaow_16k.wav", "miaow_16k.wav")


def _normalise_waveform(samples, source_dtype):
    """Scale ``samples`` to floats in about [-1.0, 1.0].

    ``source_dtype`` is the dtype the samples had when read from the WAV file;
    it selects the full-scale value. Unsigned 8-bit data is centred on 128,
    other integer data is divided by the dtype's maximum, and floating-point
    data is returned as-is.
    """
    source_dtype = np.dtype(source_dtype)
    if source_dtype == np.uint8:
        return (samples - 128.0) / 128.0
    if np.issubdtype(source_dtype, np.integer):
        return samples / np.iinfo(source_dtype).max
    return samples


def inference(audio):
    """Classify an audio file and describe its five highest-scoring classes.

    Args:
        audio: Path to a WAV file, or to an MP3 file (any letter case of the
            ``.mp3`` suffix), which is first converted to a temporary WAV.

    Returns:
        A text report naming the top two classes, followed by the five best
        class names and their mean scores formatted to four decimals.

    Raises:
        ValueError: If no path is given or the file contains no samples.
    """
    if not audio:
        raise ValueError('No audio file was provided.')

    wav_file_name = audio
    if debug: print(f'read, wav_file_name: {wav_file_name}')

    # Remember the converted file so it can be removed once it has been read;
    # the caller's own file is left untouched.
    converted_wav = None
    if wav_file_name.lower().endswith('.mp3'):
        converted_wav = convMp3ToWav(wav_file_name)
        wav_file_name = converted_wav
        if debug: print(f'covMp3ToWav, wav_file_name: {wav_file_name}')

    try:
        sample_rate, wav_data = wavfile.read(wav_file_name)
    finally:
        if converted_wav is not None and os.path.exists(converted_wav):
            os.remove(converted_wav)

    if debug: print(f'read, sample_rate: {sample_rate}, wav_data: {wav_data.shape}')

    # The dtype is lost once the data is resampled, so capture it now.
    source_dtype = wav_data.dtype

    # Keep only the first channel, and do it before resampling so the other
    # channels are not resampled for nothing.
    if debug: print(f'ensure_single_channel, wav_data.ndim: {wav_data.ndim}')
    if wav_data.ndim >= 2: wav_data = wav_data[:, 0]
    if debug: print(f'ensure_single_channel, wav_data: {wav_data.shape}')

    if wav_data.size == 0:
        raise ValueError('The audio file contains no samples.')

    sample_rate, wav_data = ensure_sample_rate(sample_rate, wav_data)
    if debug: print(f'ensure_sample_rate, sample_rate: {sample_rate}, wav_data: {wav_data.shape}')

    waveform = np.asarray(_normalise_waveform(wav_data, source_dtype), dtype=np.float32)

    # Run the model; only the per-frame class scores are used.
    scores, _embeddings, _spectrogram = model(waveform)

    # Average the scores over frames and take the five best, highest first.
    mean_scores = scores.numpy().mean(axis=0)
    top_indices = np.argsort(mean_scores)[::-1][:5]
    top_names = [class_names[index] for index in top_indices]
    top_scores = mean_scores[top_indices]

    infered_class = top_names[0]
    second_class = top_names[1]
    class_names_str = ', '.join(f'[{name}]' for name in top_names)
    scores_str = ', '.join(f'[{score:.4f}]' for score in top_scores)

    return f'The main sound is: [{infered_class}], \n\nthe second sound is: [{second_class}]. \n\n classes: {class_names_str}, \n\n scores: {scores_str}'


def convMp3ToWav(wav_file_name):
    """Decode an audio file and write it back out in WAV format.

    Args:
        wav_file_name: Path of the source file to decode (despite the name,
            this is the input, e.g. an MP3 file).

    Returns:
        Path of the WAV file that was written: the source path with
        ``.wav`` appended.
    """
    wav_path = wav_file_name + ".wav"
    # Decode the source file, then export the audio as WAV.
    AudioSegment.from_file(wav_file_name).export(wav_path, format="wav")
    return wav_path


# Gradio front end: one audio input passed to `inference` as a file path,
# one text output, and the example clip offered as a ready-made input.
examples = [['miaow_16k.wav']]
title = "yamnet"
description = "An audio event classifier trained on the AudioSet dataset to predict audio events from the AudioSet ontology."
gr.Interface(
    fn=inference,
    inputs=gr.inputs.Audio(type="filepath"),
    outputs="text",
    examples=examples,
    title=title,
    description=description,
).launch(enable_queue=True)