import os
import pickle
import shutil

import cv2
import gradio as gr
import keras.backend as K
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import scipy
import tensorflow as tf
from keras.applications import VGG16
from scipy.io.wavfile import write

# Load the tune recognition model and pull out the embedding sub-model
# (the part of the siamese network that maps a spectrogram image to a vector).
model = tf.keras.models.load_model('embdmodel_1.hdf5')
embedding_model = model.layers[2]

DURATION = 10
WAVE_OUTPUT_FILE = "my_audio.wav"


# Preprocess input audio: convert the clip to a mel-spectrogram image,
# since the siamese network works on images rather than raw sound.
def create_spectrogram(clip, sample_rate, save_path):
    plt.interactive(False)
    fig = plt.figure(figsize=[0.72, 0.72])
    S = librosa.feature.melspectrogram(y=clip, sr=sample_rate)
    librosa.display.specshow(librosa.power_to_db(S, ref=np.max))
    fig.savefig(save_path, dpi=400, bbox_inches='tight', pad_inches=0)
    plt.close()
    fig.clf()
    plt.close(fig)
    plt.close('all')
    del save_path, clip, sample_rate, fig, S


def load_img(path):
    # Read the spectrogram image, convert BGR -> RGB, and resize to the
    # 150x150 input expected by the embedding model.
    img = cv2.imread(path)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = cv2.resize(img, (150, 150))
    return img


# Load the precomputed embeddings of the reference songs.
with open('dict.pickle', 'rb') as handle:
    songspecdict = pickle.load(handle)


def list_file_sizes():
    path = "."
    # Keep only regular files in the given directory
    files_list = filter(lambda x: os.path.isfile(os.path.join(path, x)), os.listdir(path))
    # Pair each file with its size in bytes
    size_of_file = [(f, os.stat(os.path.join(path, f)).st_size) for f in files_list]
    # Print each file along with its size in MB
    for f, s in size_of_file:
        print("{} : {}MB".format(f, round(s / (1024 * 1024), 3)))


def main(audio):
    # Gradio passes the recorded clip as a filepath; copy it to
    # WAVE_OUTPUT_FILE so the rest of the pipeline works from a known name.
    shutil.copy(audio, WAVE_OUTPUT_FILE)

    list_file_sizes()

    # Load the song to match
    song, sr = librosa.load(WAVE_OUTPUT_FILE)
    to_match = np.copy(song[0:220500])
    print("Loaded data into librosa...")

    # Create spectrogram image of the song to match
    create_spectrogram(to_match, sr, 'test.png')
    print("Created spectrogram...")

    # Load the spectrogram image of the song to match
    to_match_img = load_img('test.png')
    to_match_img = np.expand_dims(to_match_img, axis=0)
    print("Loaded spectrum image...")

    # Get the embedding of the song to match
    to_match_emb = embedding_model.predict(to_match_img)
    print("Got song embedding...")

    # Calculate the distances between the song to match and the songs in the database
    songsdistdict = {}
    for key, values in songspecdict.items():
        dist_array = []
        for embd in values:
            dist_array.append(np.linalg.norm(to_match_emb - embd))
        songsdistdict[key] = min(dist_array)
    song_titles = list(songsdistdict.keys())
    distances = list(songsdistdict.values())

    # Get the title and artist of the recognized song
    recognized_song_artist, recognized_song_title = song_titles[distances.index(min(distances))].split('-')
    recognized_song_title = os.path.splitext(recognized_song_title)[0]
    print(f'Artist: {recognized_song_artist}')
    print(f'Title: {recognized_song_title}')

    from musixmatch import Musixmatch

    # Initialize Musixmatch API
    musixmatch = Musixmatch(apikey='2b0d0615efa782e95598a0e99bda4a60')

    # Search for the recognized song
    track_search_results = musixmatch.track_search(
        q_track=recognized_song_title,
        q_artist=recognized_song_artist,
        page_size=1,
        page=1,
        s_track_rating='desc',
    )

    if track_search_results['message']['header']['status_code'] == 200:
        # Get the track ID for the top result
        track_id = track_search_results['message']['body']['track_list'][0]['track']['track_id']

        # Get the lyrics for the recognized song
        lyrics_result = musixmatch.track_lyrics_get(track_id=track_id)
        if lyrics_result['message']['header']['status_code'] == 200:
            # Get the lyrics
            lyrics = lyrics_result['message']['body']['lyrics']['lyrics_body']
            # Remove the annotation tag from the lyrics
            lyrics = lyrics.replace('******* This Lyrics is NOT for Commercial use *******', '').strip()
            print("Lyrics:\n", lyrics)
        else:
            print("Couldn't find lyrics for the recognized song.")

    # Play the recognized song. The matched file is read from the local
    # seismese_net_songs directory that the Space repo ships with
    # (https://huggingface.co/spaces/prerna9811/Chord/tree/main/seismese_net_songs).
    recognized_song_file = f'seismese_net_songs/{song_titles[distances.index(min(distances))]}'
    # Return the file path; the Gradio audio output component plays it back.
    return recognized_song_file


css = """
footer {display:none !important}
.output-markdown{display:none !important}
button.primary {
    z-index: 14;
    left: 0px;
    top: 0px;
    cursor: pointer !important;
    background: none rgb(17, 20, 45) !important;
    border: none !important;
    color: rgb(255, 255, 255) !important;
    line-height: 1 !important;
    border-radius: 6px !important;
    transition: box-shadow 200ms ease 0s, background 200ms ease 0s !important;
    box-shadow: none !important;
}
button.primary:hover{
    z-index: 14;
    left: 0px;
    top: 0px;
    cursor: pointer !important;
    background: none rgb(37, 56, 133) !important;
    border: none !important;
    color: rgb(255, 255, 255) !important;
    line-height: 1 !important;
    border-radius: 6px !important;
    transition: box-shadow 200ms ease 0s, background 200ms ease 0s !important;
    box-shadow: rgb(0 0 0 / 23%) 0px 1px 7px 0px !important;
}
button.gallery-item:hover {
    border-color: rgb(37 56 133) !important;
    background-color: rgb(229,225,255) !important;
}
"""

import asyncio

loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)

demo = gr.Blocks()

mf_transcribe = gr.Interface(
    fn=main,
    inputs=gr.inputs.Audio(source="microphone", type="filepath"),
    outputs="audio",
    layout="horizontal",
    theme="huggingface",
    allow_flagging="never",
    css=css,
)

mf_transcribe.launch()
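
# Optional local smoke test (a sketch, not part of the deployed app): with the
# model file, dict.pickle, and a short recording available locally, main() can
# be exercised without the Gradio UI. "sample.wav" is an assumed filename.
# Because mf_transcribe.launch() above blocks while the app is serving, this
# is left commented out rather than guarded by __main__.
#
# print(main("sample.wav"))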