dennisvdang
/

chorus-detection

Model card Files Files and versions Community

chorus-detection / app.py

dennisvdang

Upload 2 files

856e7cd verified 13 days ago

raw history blame contribute delete

No virus

8.16 kB

	import os
	os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' # Suppress TensorFlow logs (must be set before importing TensorFlow)
	import tensorflow as tf
	tf.get_logger().setLevel('ERROR') # Suppress TensorFlow ERROR logs
	import warnings
	warnings.filterwarnings("ignore") # Suppress all warnings

	import argparse
	from functools import reduce
	from typing import List, Tuple
	import shutil
	import librosa
	import numpy as np
	from matplotlib import pyplot as plt
	from pydub import AudioSegment
	from pydub.silence import detect_nonsilent
	from pytube import YouTube
	from sklearn.preprocessing import StandardScaler
	import shutil
	import streamlit as st


	# Constants
	SR = 12000  # default sample rate AudioFeature passes to librosa.load
	HOP_LENGTH = 128  # default hop_length AudioFeature passes to the librosa feature calls
	MAX_FRAMES = 300  # predict_chorus keeps at most this many columns of the feature matrix
	MAX_METERS = 201  # frame_length used when AudioFeature frames the detected beats
	N_FEATURES = 15  # not referenced in the visible code; presumably the model's feature count - TODO confirm
	MODEL_PATH = "models/CRNN/best_model_V3.h5"  # default path for load_model
	AUDIO_TEMP_PATH = "output/temp"  # default download directory for extract_audio; main() removes it

	def extract_audio(url, output_path=AUDIO_TEMP_PATH):
	    """Download the first audio-only stream of a YouTube video as an ``.mp3``-named file.

	    The stream is saved exactly as delivered; only the file extension is
	    changed, no transcoding is performed here.

	    Args:
	        url: URL of the YouTube video.
	        output_path: Directory to download into; created when missing.

	    Returns:
	        ``(audio_file, video_title)`` on success, or ``(None, None)`` when the
	        video has no audio-only stream or any step raises. Failures are
	        reported to the user through ``st.error`` rather than propagated.
	    """
	    try:
	        yt = YouTube(url)
	        video_title = yt.title
	        audio_stream = yt.streams.filter(only_audio=True).first()
	        if not audio_stream:
	            st.error("No audio stream found")
	            return None, None

	        os.makedirs(output_path, exist_ok=True)
	        out_file = audio_stream.download(output_path)
	        audio_file = os.path.splitext(out_file)[0] + '.mp3'
	        # Skip the move when the download already carries the target name;
	        # deleting the target first would destroy the file just downloaded.
	        if out_file != audio_file:
	            # os.replace overwrites a stale file left by an earlier run.
	            os.replace(out_file, audio_file)
	        return audio_file, video_title
	    except Exception as e:
	        # UI boundary: surface any download/filesystem failure to the user.
	        st.error(f"An error occurred: {e}")
	        return None, None

	def strip_silence(audio_path):
	    """Drop the silent stretches of an audio file and overwrite it as MP3.

	    Args:
	        audio_path: File to read; the stripped audio is exported back to the
	            same path in ``mp3`` format.
	    """
	    sound = AudioSegment.from_file(audio_path)
	    audible_ranges = detect_nonsilent(sound, min_silence_len=500, silence_thresh=-50)

	    # Concatenate every non-silent slice in order, starting from an empty segment.
	    stripped = AudioSegment.empty()
	    for span in audible_ranges:
	        stripped = stripped + sound[span[0]:span[1]]

	    stripped.export(audio_path, format='mp3')

	class AudioFeature:
	    """Compute the stacked feature matrix and musical metadata for one audio file.

	    Construction only records the parameters; call ``extract_features`` (or
	    ``get_features``) to load the audio and populate the attributes.
	    """

	    # Pitch-class names; index 0 is C.
	    _NOTE_NAMES = ('C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B')
	    # Key-profile weights for a tonic at index 0 (the Krumhansl-Kessler values).
	    _MAJOR_PROFILE = (6.35, 2.23, 3.48, 2.33, 4.38, 4.09, 2.52, 5.19, 2.39, 3.66, 2.29, 2.88)
	    _MINOR_PROFILE = (6.33, 2.68, 3.52, 5.38, 2.60, 3.53, 2.54, 4.75, 3.98, 2.69, 3.34, 3.17)

	    def __init__(self, audio_path, sr=SR, hop_length=HOP_LENGTH):
	        """Store the extraction parameters; every derived attribute starts as ``None``.

	        Args:
	            audio_path: Path of the audio file to analyse.
	            sr: Sample rate requested from ``librosa.load``.
	            hop_length: Hop length passed to the librosa feature calls.
	        """
	        self.audio_path = audio_path
	        self.sr = sr
	        self.hop_length = hop_length
	        # Waveform and its harmonic / percussive components.
	        self.y = None
	        self.y_harm, self.y_perc = None, None
	        # Raw features and their decomposition activations.
	        self.spectrogram = None
	        self.rms = None
	        self.melspectrogram = None
	        self.mel_acts = None
	        self.chromagram = None
	        self.chroma_acts = None
	        self.onset_env = None
	        self.tempogram = None
	        self.tempogram_acts = None
	        self.mfccs = None
	        self.mfcc_acts = None
	        self.combined_features = None
	        self.n_frames = None
	        # Rhythm and key metadata.
	        self.tempo = None
	        self.beats = None
	        self.meter_grid = None
	        self.key, self.mode = None, None

	    def detect_key(self, chroma_vals):
	        """Estimate key and mode by correlating pitch-class energy with key profiles.

	        Args:
	            chroma_vals: 12 values, one per pitch class, starting at C.

	        Returns:
	            ``(key, mode)`` where ``key`` is a note name and ``mode`` is
	            ``'major'`` or ``'minor'``. Both are also stored on the instance.
	        """
	        major_profile = np.array(self._MAJOR_PROFILE)
	        minor_profile = np.array(self._MINOR_PROFILE)
	        major_profile /= np.linalg.norm(major_profile)
	        minor_profile /= np.linalg.norm(minor_profile)

	        # Rolling a profile by i places its tonic on pitch class i.
	        major_correlations = [np.corrcoef(chroma_vals, np.roll(major_profile, i))[0, 1] for i in range(12)]
	        minor_correlations = [np.corrcoef(chroma_vals, np.roll(minor_profile, i))[0, 1] for i in range(12)]

	        max_major_idx = np.argmax(major_correlations)
	        max_minor_idx = np.argmax(minor_correlations)

	        # Ties go to minor, as the comparison is strict.
	        if major_correlations[max_major_idx] > minor_correlations[max_minor_idx]:
	            self.mode = 'major'
	            self.key = self._NOTE_NAMES[max_major_idx]
	        else:
	            self.mode = 'minor'
	            self.key = self._NOTE_NAMES[max_minor_idx]
	        return self.key, self.mode

	    def calculate_ki_chroma(self, waveform, sr, hop_length):
	        """Return a key-invariant chromagram of ``waveform``.

	        The chromagram is min-max scaled, the key is detected from its summed
	        pitch-class energy (which also sets ``self.key`` / ``self.mode``), and
	        the pitch-class axis is rotated so a major tonic, or the pitch class
	        three semitones above a minor tonic, lands on index 0.

	        Args:
	            waveform: Audio samples to analyse.
	            sr: Sample rate of ``waveform``.
	            hop_length: Hop length for the chroma computation.

	        Returns:
	            The rotated chromagram, normalised along axis 1 by
	            ``librosa.util.normalize``.
	        """
	        chromagram = librosa.feature.chroma_cqt(y=waveform, sr=sr, hop_length=hop_length, bins_per_octave=24)
	        low = chromagram.min()
	        span = chromagram.max() - low
	        if span > 0:
	            chromagram = (chromagram - low) / span
	        else:
	            # A constant chromagram (e.g. digital silence) would divide by zero.
	            chromagram = np.zeros_like(chromagram)

	        chroma_vals = np.sum(chromagram, axis=1)
	        key, mode = self.detect_key(chroma_vals)
	        key_idx = self._NOTE_NAMES.index(key)
	        tonic_offset = key_idx if mode == 'major' else key_idx + 3
	        # Rolling by -offset modulo 12 moves pitch class `offset` to index 0.
	        shift_amount = -tonic_offset % 12
	        return librosa.util.normalize(np.roll(chromagram, shift_amount, axis=0), axis=1)

	    def extract_features(self):
	        """Load the audio and populate every feature attribute on the instance.

	        Sets the waveform, its harmonic/percussive split, the individual
	        features with their decomposition activations, ``combined_features``
	        (the activations stacked row-wise under the RMS), ``n_frames``,
	        ``tempo``, ``beats`` and ``meter_grid``. ``key`` and ``mode`` are set
	        as a side effect of computing the key-invariant chromagram.
	        """
	        self.y, self.sr = librosa.load(self.audio_path, sr=self.sr)
	        self.y_harm, self.y_perc = librosa.effects.hpss(self.y)
	        self.spectrogram, _ = librosa.magphase(librosa.stft(self.y, hop_length=self.hop_length))
	        self.rms = librosa.feature.rms(S=self.spectrogram, hop_length=self.hop_length).astype(np.float32)

	        self.melspectrogram = librosa.feature.melspectrogram(
	            y=self.y, sr=self.sr, n_mels=128, hop_length=self.hop_length
	        ).astype(np.float32)
	        self.mel_acts = librosa.decompose.decompose(self.melspectrogram, n_components=3, sort=True)[1].astype(np.float32)

	        # Also records self.key / self.mode, detected on the un-shifted chroma.
	        self.chromagram = self.calculate_ki_chroma(self.y_harm, self.sr, self.hop_length).astype(np.float32)
	        self.chroma_acts = librosa.decompose.decompose(self.chromagram, n_components=4, sort=True)[1].astype(np.float32)

	        self.onset_env = librosa.onset.onset_strength(y=self.y_perc, sr=self.sr, hop_length=self.hop_length)
	        # Compute the tempogram first: its 99th percentile is the clip ceiling,
	        # so it cannot be read from self.tempogram before that is assigned.
	        raw_tempogram = librosa.feature.tempogram(
	            onset_envelope=self.onset_env, sr=self.sr, hop_length=self.hop_length
	        )
	        self.tempogram = np.clip(raw_tempogram, 0, np.percentile(raw_tempogram, 99)).astype(np.float32)
	        self.tempogram_acts = librosa.decompose.decompose(self.tempogram, n_components=3, sort=True)[1].astype(np.float32)

	        self.mfccs = librosa.feature.mfcc(y=self.y, sr=self.sr, n_mfcc=13, hop_length=self.hop_length).astype(np.float32)
	        # librosa's default decomposition is NMF, which rejects negative input;
	        # MFCCs are signed, so decompose a copy shifted to be non-negative.
	        mfcc_nonneg = self.mfccs - min(float(self.mfccs.min()), 0.0)
	        self.mfcc_acts = librosa.decompose.decompose(mfcc_nonneg, n_components=3, sort=True)[1].astype(np.float32)

	        # NOTE(review): assuming rms is a single row, this stacks 1+3+4+3+3 = 14
	        # rows while N_FEATURES is 15 - confirm against the model's input.
	        self.combined_features = np.vstack([self.rms, self.mel_acts, self.chroma_acts, self.tempogram_acts, self.mfcc_acts])
	        self.n_frames = self.combined_features.shape[1]

	        self.tempo, self.beats = librosa.beat.beat_track(y=self.y_perc, sr=self.sr, hop_length=self.hop_length)
	        # NOTE(review): librosa.util.frame needs at least MAX_METERS beats and
	        # fix_frames is documented for 1-D frame indices - confirm the intended
	        # meter-grid construction; this call is left unchanged.
	        self.meter_grid = librosa.util.fix_frames(librosa.util.frame(self.beats, frame_length=MAX_METERS, hop_length=1), x_min=0, x_max=self.n_frames)
	        # Key/mode are intentionally not re-detected here: self.chromagram is
	        # already key-shifted, so re-running detection on it would overwrite
	        # the real estimate with the post-shift key.

	    def get_features(self):
	        """Run ``extract_features`` and return its main results.

	        Returns:
	            ``(combined_features, n_frames, tempo, beats, meter_grid, key, mode)``.
	        """
	        self.extract_features()
	        return self.combined_features, self.n_frames, self.tempo, self.beats, self.meter_grid, self.key, self.mode

	def load_model(model_path=MODEL_PATH):
	    """Load and return the Keras model stored at ``model_path``."""
	    model = tf.keras.models.load_model(model_path)
	    return model

	def predict_chorus(audio_features, model):
	    """Run the model on the standardised features of one track.

	    Args:
	        audio_features: Object whose ``get_features()`` returns the feature
	            matrix as its first element.
	        model: Object exposing ``predict``.

	    Returns:
	        Whatever ``model.predict`` returns for the single-item batch.
	    """
	    # Only the feature matrix is needed; the remaining outputs are unused here.
	    feature_matrix = audio_features.get_features()[0]

	    # Keep at most MAX_FRAMES columns, then add a leading batch axis.
	    batch = np.expand_dims(feature_matrix[:, :MAX_FRAMES], axis=0)

	    # StandardScaler works on 2-D input: flatten, scale, restore the shape.
	    flattened = batch.reshape(-1, batch.shape[-1])
	    scaled = StandardScaler().fit_transform(flattened).reshape(batch.shape)

	    return model.predict(scaled)

	def plot_predictions(predictions, title):
	    """Render the first prediction series as a line chart in the Streamlit page.

	    Args:
	        predictions: Indexable whose first element is the series to plot.
	        title: Title shown above the chart.
	    """
	    # Use an explicit Figure instead of pyplot's global state so it can be
	    # handed to Streamlit and released afterwards.
	    fig, ax = plt.subplots(figsize=(10, 4))
	    try:
	        ax.plot(predictions[0], label='Chorus Probability')
	        ax.set_title(title)
	        ax.set_xlabel('Frame')
	        ax.set_ylabel('Probability')
	        ax.legend()
	        st.pyplot(fig)
	    finally:
	        # Without this, every call leaves another open figure in pyplot.
	        plt.close(fig)

	def main():
	    """Streamlit entry point: download a track, predict its chorus, plot the result.

	    The temporary download directory is removed once processing ends,
	    whether it succeeded or raised.
	    """
	    st.title("Chorus Finder")
	    st.write("Upload a YouTube URL to find the chorus in the song.")
	    url = st.text_input("YouTube URL")

	    if not st.button("Find Chorus"):
	        return
	    if not url:
	        st.error("Please enter a valid YouTube URL")
	        return

	    try:
	        audio_file, video_title = extract_audio(url)
	        if not audio_file:
	            # extract_audio has already reported the failure to the user.
	            return
	        strip_silence(audio_file)
	        audio_features = AudioFeature(audio_file)
	        model = load_model()
	        predictions = predict_chorus(audio_features, model)
	        plot_predictions(predictions, video_title)
	    finally:
	        # Clean up even when a step above raises; ignore_errors covers the
	        # case where the directory was never created.
	        shutil.rmtree(AUDIO_TEMP_PATH, ignore_errors=True)


	if __name__ == "__main__":
	    main()