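"""Chorus Finder Streamlit app.

Downloads the audio of a YouTube video, strips silence, extracts framewise
audio features, and plots chorus probabilities predicted by a pre-trained
CRNN model."""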
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # Suppress TensorFlow logs (must be set before importing TensorFlow)
import tensorflow as tf
tf.get_logger().setLevel('ERROR')  # Suppress TensorFlow ERROR logs
import warnings
warnings.filterwarnings("ignore")  # Suppress all warnings

import shutil
from functools import reduce

import librosa
import numpy as np
import streamlit as st
from matplotlib import pyplot as plt
from pydub import AudioSegment
from pydub.silence import detect_nonsilent
from pytube import YouTube
from sklearn.preprocessing import StandardScaler

# Constants
SR = 12000                 # Target sample rate (Hz) for librosa.load
HOP_LENGTH = 128           # Hop length (samples) shared by all framewise features
MAX_FRAMES = 300           # Feature matrices are truncated to this many frames
MAX_METERS = 201           # Frame length used when building the meter grid
N_FEATURES = 15            # Feature dimensionality (not referenced directly below)
MODEL_PATH = "models/CRNN/best_model_V3.h5"
AUDIO_TEMP_PATH = "output/temp"
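# With SR = 12000 and HOP_LENGTH = 128, framewise features are computed at
# 12000 / 128 = 93.75 frames per second.
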
def extract_audio(url, output_path=AUDIO_TEMP_PATH):
    """Download the audio stream of a YouTube video and return (file path, video title)."""
    try:
        yt = YouTube(url)
        video_title = yt.title
        audio_stream = yt.streams.filter(only_audio=True).first()
        if audio_stream:
            os.makedirs(output_path, exist_ok=True)
            out_file = audio_stream.download(output_path)
            # Rename the downloaded file to .mp3 (it is re-encoded as MP3 later in strip_silence)
            base, _ = os.path.splitext(out_file)
            audio_file = base + '.mp3'
            if os.path.exists(audio_file):
                os.remove(audio_file)
            os.rename(out_file, audio_file)
            return audio_file, video_title
        else:
            st.error("No audio stream found")
            return None, None
    except Exception as e:
        st.error(f"An error occurred: {e}")
        return None, None

def strip_silence(audio_path):
    """Remove silent segments from the audio file and overwrite it in place."""
    sound = AudioSegment.from_file(audio_path)
    nonsilent_ranges = detect_nonsilent(sound, min_silence_len=500, silence_thresh=-50)
    stripped = reduce(lambda acc, val: acc + sound[val[0]:val[1]], nonsilent_ranges, AudioSegment.empty())
    stripped.export(audio_path, format='mp3')

class AudioFeature:
    """Extracts and holds the audio features used as model input."""

    def __init__(self, audio_path, sr=SR, hop_length=HOP_LENGTH):
        self.audio_path = audio_path
        self.sr = sr
        self.hop_length = hop_length
        self.y = None
        self.y_harm, self.y_perc = None, None
        self.spectrogram = None
        self.rms = None
        self.melspectrogram = None
        self.mel_acts = None
        self.chromagram = None
        self.chroma_acts = None
        self.onset_env = None
        self.tempogram = None
        self.tempogram_acts = None
        self.mfccs = None
        self.mfcc_acts = None
        self.combined_features = None
        self.n_frames = None
        self.tempo = None
        self.beats = None
        self.meter_grid = None
        self.key, self.mode = None, None
    def detect_key(self, chroma_vals):
        """Estimate key and mode by correlating summed chroma against Krumhansl-style key profiles."""
        note_names = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B']
        major_profile = np.array([6.35, 2.23, 3.48, 2.33, 4.38, 4.09, 2.52, 5.19, 2.39, 3.66, 2.29, 2.88])
        minor_profile = np.array([6.33, 2.68, 3.52, 5.38, 2.60, 3.53, 2.54, 4.75, 3.98, 2.69, 3.34, 3.17])
        major_profile /= np.linalg.norm(major_profile)
        minor_profile /= np.linalg.norm(minor_profile)
        # Correlate the chroma vector with all 12 rotations of each profile
        major_correlations = [np.corrcoef(chroma_vals, np.roll(major_profile, i))[0, 1] for i in range(12)]
        minor_correlations = [np.corrcoef(chroma_vals, np.roll(minor_profile, i))[0, 1] for i in range(12)]
        max_major_idx = np.argmax(major_correlations)
        max_minor_idx = np.argmax(minor_correlations)
        self.mode = 'major' if major_correlations[max_major_idx] > minor_correlations[max_minor_idx] else 'minor'
        self.key = note_names[max_major_idx if self.mode == 'major' else max_minor_idx]
        return self.key, self.mode
    def calculate_ki_chroma(self, waveform, sr, hop_length):
        """Compute a key-invariant chromagram, transposed so every track aligns to C major / A minor."""
        chromagram = librosa.feature.chroma_cqt(y=waveform, sr=sr, hop_length=hop_length, bins_per_octave=24)
        chromagram = (chromagram - chromagram.min()) / (chromagram.max() - chromagram.min())
        chroma_vals = np.sum(chromagram, axis=1)
        key, mode = self.detect_key(chroma_vals)
        key_idx = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B'].index(key)
        shift_amount = -key_idx if mode == 'major' else -(key_idx + 3) % 12
        return librosa.util.normalize(np.roll(chromagram, shift_amount, axis=0), axis=1)
    def extract_features(self):
        """Load the audio and compute the stacked feature matrix used for prediction."""
        self.y, self.sr = librosa.load(self.audio_path, sr=self.sr)
        self.y_harm, self.y_perc = librosa.effects.hpss(self.y)
        self.spectrogram, _ = librosa.magphase(librosa.stft(self.y, hop_length=self.hop_length))
        self.rms = librosa.feature.rms(S=self.spectrogram, hop_length=self.hop_length).astype(np.float32)
        self.melspectrogram = librosa.feature.melspectrogram(y=self.y, sr=self.sr, n_mels=128, hop_length=self.hop_length).astype(np.float32)
        self.mel_acts = librosa.decompose.decompose(self.melspectrogram, n_components=3, sort=True)[1].astype(np.float32)
        self.chromagram = self.calculate_ki_chroma(self.y_harm, self.sr, self.hop_length).astype(np.float32)
        self.chroma_acts = librosa.decompose.decompose(self.chromagram, n_components=4, sort=True)[1].astype(np.float32)
        self.onset_env = librosa.onset.onset_strength(y=self.y_perc, sr=self.sr, hop_length=self.hop_length)
        # Compute the tempogram first, then clip outliers at its 99th percentile
        self.tempogram = librosa.feature.tempogram(onset_envelope=self.onset_env, sr=self.sr, hop_length=self.hop_length)
        self.tempogram = np.clip(self.tempogram, 0, np.percentile(self.tempogram, 99)).astype(np.float32)
        self.tempogram_acts = librosa.decompose.decompose(self.tempogram, n_components=3, sort=True)[1].astype(np.float32)
        self.mfccs = librosa.feature.mfcc(y=self.y, sr=self.sr, n_mfcc=13, hop_length=self.hop_length).astype(np.float32)
        self.mfcc_acts = librosa.decompose.decompose(self.mfccs, n_components=3, sort=True)[1].astype(np.float32)
        # Stack RMS energy with the decomposition activations of each representation (frames as columns)
        self.combined_features = np.vstack([self.rms, self.mel_acts, self.chroma_acts, self.tempogram_acts, self.mfcc_acts])
        self.n_frames = self.combined_features.shape[1]
        self.tempo, self.beats = librosa.beat.beat_track(y=self.y_perc, sr=self.sr, hop_length=self.hop_length)
        self.meter_grid = librosa.util.fix_frames(librosa.util.frame(self.beats, frame_length=MAX_METERS, hop_length=1), x_min=0, x_max=self.n_frames)
        self.key, self.mode = self.detect_key(np.sum(self.chromagram, axis=1))
    def get_features(self):
        self.extract_features()
        return self.combined_features, self.n_frames, self.tempo, self.beats, self.meter_grid, self.key, self.mode

def load_model(model_path=MODEL_PATH):
    return tf.keras.models.load_model(model_path)

def predict_chorus(audio_features, model):
    """Return framewise chorus probabilities for the first MAX_FRAMES frames."""
    features, n_frames, tempo, beats, meter_grid, key, mode = audio_features.get_features()
    features = features[:, :MAX_FRAMES]
    features = np.expand_dims(features, axis=0)
    # Standardize per track; the scaler is fit on this single example at inference time
    scaler = StandardScaler()
    features = scaler.fit_transform(features.reshape(-1, features.shape[-1])).reshape(features.shape)
    predictions = model.predict(features)
    return predictions

def plot_predictions(predictions, title):
    """Plot the framewise chorus-probability curve in the Streamlit app."""
    fig, ax = plt.subplots(figsize=(10, 4))
    ax.plot(predictions[0], label='Chorus Probability')
    ax.set_title(title)
    ax.set_xlabel('Frame')
    ax.set_ylabel('Probability')
    ax.legend()
    st.pyplot(fig)

def main():
    st.title("Chorus Finder")
    st.write("Paste a YouTube URL to find the chorus in the song.")
    url = st.text_input("YouTube URL")
    if st.button("Find Chorus"):
        if url:
            audio_file, video_title = extract_audio(url)
            if audio_file:
                strip_silence(audio_file)
                audio_features = AudioFeature(audio_file)
                model = load_model()
                predictions = predict_chorus(audio_features, model)
                plot_predictions(predictions, video_title)
                # Clean up the temporary download directory
                shutil.rmtree(AUDIO_TEMP_PATH)
        else:
            st.error("Please enter a valid YouTube URL")

if __name__ == "__main__":
    main()
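
# Typical usage: launch this script as a Streamlit app, e.g.
#   streamlit run <this file>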