import gradio as gr
import os
import allin1
import time
import json
import torch
import librosa
import numpy as np
from pathlib import Path

HEADER = """

All-In-One Music Structure Analyzer 🔮

[Python Package] [Paper] [Visual Demo]

This Space demonstrates what the music structure analyzer predicts.

For more information, please visit the links above ✨🧸

""" CACHE_EXAMPLES = os.getenv('CACHE_EXAMPLES', '1') == '1' base_dir = "/tmp/gradio/" # Defining sample rate for voice activity detection (must use multiple of 8k) SAMPLING_RATE = 32000 torch.set_num_threads(1) # Import of models to do voice detection model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad', model='silero_vad', force_reload=True) (get_speech_timestamps, save_audio, read_audio, VADIterator, collect_chunks) = utils def analyze(path): #Measure time for inference start = time.time() string_path = path path = Path(path) result= allin1.analyze( path, out_dir='./struct', multiprocess=False, keep_byproducts=True, # TODO: remove this ) json_structure_output = None for root, dirs, files in os.walk(f"./struct"): for file_path in files: json_structure_output = os.path.join(root, file_path) print(json_structure_output) add_voice_labelv2(json_structure_output, string_path) fig = allin1.visualize( result, multiprocess=False, ) fig.set_dpi(300) #allin1.sonify( # result, # out_dir='./sonif', # multiprocess=False, #) #sonif_path = Path(f'./sonif/{path.stem}.sonif{path.suffix}').resolve().as_posix() #Measure time for inference end = time.time() elapsed_time = end-start # Get the base name of the file file_name = os.path.basename(path) # Remove the extension from the file name file_name_without_extension = os.path.splitext(file_name)[0] print(file_name_without_extension) bass_path, drums_path, other_path, vocals_path = None, None, None, None for root, dirs, files in os.walk(f"./demix/htdemucs/{file_name_without_extension}"): for file_path in files: file_path = os.path.join(root, file_path) print(file_path) if "bass.wav" in file_path: bass_path = file_path if "vocals.wav" in file_path: vocals_path = file_path if "other.wav" in file_path: other_path = file_path if "drums.wav" in file_path: drums_path = file_path #return result.bpm, fig, sonif_path, elapsed_time return result.bpm, fig, elapsed_time, json_structure_output, bass_path, drums_path, other_path, vocals_path def add_voice_label(json_file, audio_path): # Load the JSON file with open(json_file, 'r') as f: data = json.load(f) # Create VAD object vad_iterator = VADIterator(model) # Read input audio file wav, _ = librosa.load(audio_path, sr=SAMPLING_RATE, mono=True) # Access the segments segments = data['segments'] times = [] for segment in segments: start = segment['start'] end = segment['end'] start_sample = int(start*SAMPLING_RATE) end_sample = int(end*SAMPLING_RATE) speech_probs = [] window_size_samples = 1536 for i in range(start_sample, end_sample, window_size_samples): chunk = torch.from_numpy(wav[i: i+ window_size_samples]) if len(chunk) < window_size_samples: break speech_prob = model(chunk, SAMPLING_RATE).item() speech_probs.append(speech_prob) vad_iterator.reset_states() # reset model states after each audio mean_probability = np.mean(speech_probs) print(mean_probability) if mean_probability >= 0.7 : segment['voice'] = "Yes" else: segment['voice'] = "No" with open(json_file, 'w') as f: json.dump(data, f, indent=4) def add_voice_labelv2(json_file, audio_path): # Load the JSON file with open(json_file, 'r') as f: data = json.load(f) # Create VAD object vad_iterator = VADIterator(model) # Read input audio file wav, _ = librosa.load(audio_path, sr=SAMPLING_RATE, mono=True) speech_probs = [] # Size of the window we compute the probability on window_size_samples = int(SAMPLING_RATE/4) for i in range(0, len(wav), window_size_samples): chunk = torch.from_numpy(wav[i: i+ window_size_samples]) if len(chunk) < window_size_samples: 
            break
        speech_prob = model(chunk, SAMPLING_RATE).item()
        speech_probs.append(speech_prob)
    vad_iterator.reset_states()  # reset model states after each audio

    # Indices of the windows whose speech probability crosses the threshold.
    voice_idxs = np.where(np.array(speech_probs) >= 0.7)[0]
    print(len(voice_idxs))
    if len(voice_idxs) == 0:
        print("NO VOICE SEGMENTS DETECTED!")

    try:
        begin_seq = True
        start_idx = 0
        vocal_times = []
        # Group consecutive voiced windows into (start, end) vocal segments.
        for i in range(len(voice_idxs)):
            if begin_seq:
                start_idx = voice_idxs[i]
                begin_seq = False
            # Keep extending the current segment while the next window is contiguous.
            if i + 1 < len(voice_idxs) and voice_idxs[i + 1] == voice_idxs[i] + 1:
                continue
            start_time = float((start_idx * window_size_samples) / SAMPLING_RATE)
            end_time = float((voice_idxs[i] * window_size_samples) / SAMPLING_RATE)
            # Convert seconds to a "minutes.seconds" string.
            start_minutes = int(start_time // 60)
            end_minutes = int(end_time // 60)
            start_seconds = start_time % 60
            end_seconds = end_time % 60
            print("modifying json data... \n")
            vocal_times.append({
                "start_time": f"{start_minutes}.{start_seconds:.0f}",
                "end_time": f"{end_minutes}.{end_seconds:.0f}",
            })
            begin_seq = True
        data['vocal_times'] = vocal_times
    except Exception as e:
        print(f"An exception occurred: {e}")

    with open(json_file, 'w') as f:
        print("writing_to_json...")
        json.dump(data, f, indent=4)


with gr.Blocks() as demo:
    gr.HTML(HEADER)

    input_audio_path = gr.Audio(
        label='Input',
        type='filepath',
        format='mp3',
        show_download_button=False,
    )
    button = gr.Button('Analyze', variant='primary')
    output_viz = gr.Plot(label='Visualization')
    with gr.Row():
        output_bpm = gr.Textbox(label='BPM', scale=1)
        # output_sonif = gr.Audio(
        #     label='Sonification',
        #     type='filepath',
        #     format='mp3',
        #     show_download_button=False,
        #     scale=9,
        # )
        elapsed_time = gr.Textbox(label='Overall inference time', scale=1)
        json_structure_output = gr.File(label="Json structure")
    with gr.Column():
        bass = gr.Audio(label='bass', show_share_button=False)
        vocals = gr.Audio(label='vocals', show_share_button=False)
        other = gr.Audio(label='other', show_share_button=False)
        drums = gr.Audio(label='drums', show_share_button=False)
        # bass_path = gr.Textbox(label='bass_path', scale=1)
        # drums_path = gr.Textbox(label='drums_path', scale=1)
        # other_path = gr.Textbox(label='other_path', scale=1)
        # vocals_path = gr.Textbox(label='vocals_path', scale=1)

    # gr.Examples(
    #     examples=[
    #         './assets/NewJeans - Super Shy.mp3',
    #         './assets/Bruno Mars - 24k Magic.mp3'
    #     ],
    #     inputs=input_audio_path,
    #     outputs=[output_bpm, output_viz, output_sonif],
    #     fn=analyze,
    #     cache_examples=CACHE_EXAMPLES,
    # )

    button.click(
        fn=analyze,
        inputs=input_audio_path,
        # outputs=[output_bpm, output_viz, output_sonif, elapsed_time],
        outputs=[output_bpm, output_viz, elapsed_time, json_structure_output, bass, drums, other, vocals],
        api_name='analyze',
    )

if __name__ == '__main__':
    demo.launch()
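
# For reference, a sketch of how the structure JSON looks after add_voice_labelv2 has
# annotated it. The field names follow the code above; the segment values shown here are
# illustrative only, not real model output:
#
# {
#     "segments": [{"start": 0.0, "end": 12.8, ...}, ...],
#     "vocal_times": [
#         {"start_time": "0.14", "end_time": "1.2"},
#         ...
#     ]
# }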