|
import os
|
|
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
|
|
import tensorflow as tf
|
|
tf.get_logger().setLevel('ERROR')
|
|
import warnings
|
|
warnings.filterwarnings("ignore")
|
|
|
|
import argparse
|
|
from functools import reduce
|
|
from typing import List, Tuple
|
|
import shutil
|
|
import librosa
|
|
import numpy as np
|
|
from matplotlib import pyplot as plt
|
|
from pydub import AudioSegment
|
|
from pydub.silence import detect_nonsilent
|
|
from pytube import YouTube
|
|
from sklearn.preprocessing import StandardScaler
|
|
import shutil
|
|
import streamlit as st
|
|
|
|
|
|
|
|
# --- Audio analysis settings -------------------------------------------------
SR = 12_000       # default sampling rate handed to librosa.load
HOP_LENGTH = 128  # default hop length for the librosa feature extractors

# --- Model input sizing ------------------------------------------------------
MAX_FRAMES = 300  # predict_chorus keeps at most this many feature columns
MAX_METERS = 201  # presumably the maximum number of meters per track -- TODO confirm
N_FEATURES = 15   # presumably the model's feature dimension -- TODO confirm

# --- File locations (relative to the working directory) ----------------------
MODEL_PATH = "models/CRNN/best_model_V3.h5"
AUDIO_TEMP_PATH = "output/temp"
|
|
|
|
def extract_audio(url, output_path=AUDIO_TEMP_PATH):
    """Download the first audio-only stream of a YouTube video.

    The downloaded file is renamed so that it carries a ``.mp3`` extension
    (a plain rename, no transcoding); an existing file with that name is
    replaced.

    Args:
        url: Address of the YouTube video.
        output_path: Directory the audio is saved into; created if missing.

    Returns:
        ``(audio_file, video_title)`` on success, or ``(None, None)`` when no
        audio stream exists or any exception occurs. Failures are reported to
        the user through ``st.error`` instead of being raised.
    """
    try:
        video = YouTube(url)
        title = video.title
        stream = video.streams.filter(only_audio=True).first()

        if not stream:
            st.error("No audio stream found")
            return None, None

        os.makedirs(output_path, exist_ok=True)
        downloaded = stream.download(output_path)

        # Swap whatever extension the stream came with for ".mp3".
        mp3_path = os.path.splitext(downloaded)[0] + '.mp3'
        if os.path.exists(mp3_path):
            os.remove(mp3_path)
        os.rename(downloaded, mp3_path)

        return mp3_path, title
    except Exception as e:
        st.error(f"An error occurred: {e}")
        return None, None
|
|
|
|
def strip_silence(audio_path):
    """Remove silent stretches from an audio file, overwriting it in place.

    Non-silent ranges are detected with ``min_silence_len=500`` and
    ``silence_thresh=-50``, concatenated in order, and exported as MP3 back
    to ``audio_path``.

    Args:
        audio_path: Path of the audio file to rewrite.
    """
    recording = AudioSegment.from_file(audio_path)
    voiced_spans = detect_nonsilent(recording, min_silence_len=500, silence_thresh=-50)

    # Stitch the non-silent slices together, starting from an empty segment.
    trimmed = AudioSegment.empty()
    for span in voiced_spans:
        trimmed = trimmed + recording[span[0]:span[1]]

    trimmed.export(audio_path, format='mp3')
|
|
|
|
class AudioFeature:
    """Frame-level audio features, beat grid and key estimate for one file.

    Every analysis attribute starts as ``None`` and is filled in by
    :meth:`extract_features` (or :meth:`get_features`, which calls it).
    """

    # Pitch-class names indexed by chroma bin (C = 0).
    _NOTE_NAMES = ('C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B')

    def __init__(self, audio_path, sr=SR, hop_length=HOP_LENGTH):
        """Store the analysis settings; no audio is read until extraction.

        Args:
            audio_path: Path of the audio file to analyse.
            sr: Sampling rate passed to ``librosa.load``.
            hop_length: Hop length passed to the librosa feature extractors.
        """
        self.audio_path = audio_path
        self.sr = sr
        self.hop_length = hop_length

        # Waveform and its harmonic / percussive components.
        self.y = None
        self.y_harm, self.y_perc = None, None

        # Raw features and their NMF activations.
        self.spectrogram = None
        self.rms = None
        self.melspectrogram = None
        self.mel_acts = None
        self.chromagram = None
        self.chroma_acts = None
        self.onset_env = None
        self.tempogram = None
        self.tempogram_acts = None
        self.mfccs = None
        self.mfcc_acts = None

        # Stacked feature matrix and its number of columns.
        self.combined_features = None
        self.n_frames = None

        # Rhythm and tonality estimates.
        self.tempo = None
        self.beats = None
        self.meter_grid = None
        self.key, self.mode = None, None

    def detect_key(self, chroma_vals):
        """Estimate key and mode from a 12-element pitch-class energy vector.

        The vector is correlated with a major and a minor key profile (the
        Krumhansl-Kessler weights), each rotated to all 12 tonics. The
        best-correlated rotation wins; the mode is ``'major'`` only when the
        best major correlation is strictly greater than the best minor one.

        Args:
            chroma_vals: Sequence of 12 values, one per pitch class from C.

        Returns:
            ``(key, mode)``; both are also stored on ``self.key`` and
            ``self.mode``.
        """
        major_profile = np.array([6.35, 2.23, 3.48, 2.33, 4.38, 4.09, 2.52, 5.19, 2.39, 3.66, 2.29, 2.88])
        minor_profile = np.array([6.33, 2.68, 3.52, 5.38, 2.60, 3.53, 2.54, 4.75, 3.98, 2.69, 3.34, 3.17])
        major_profile /= np.linalg.norm(major_profile)
        minor_profile /= np.linalg.norm(minor_profile)

        # Rolling a profile by i moves its tonic weight to pitch class i.
        major_correlations = [np.corrcoef(chroma_vals, np.roll(major_profile, i))[0, 1] for i in range(12)]
        minor_correlations = [np.corrcoef(chroma_vals, np.roll(minor_profile, i))[0, 1] for i in range(12)]

        max_major_idx = np.argmax(major_correlations)
        max_minor_idx = np.argmax(minor_correlations)

        is_major = major_correlations[max_major_idx] > minor_correlations[max_minor_idx]
        self.mode = 'major' if is_major else 'minor'
        self.key = self._NOTE_NAMES[max_major_idx if is_major else max_minor_idx]
        return self.key, self.mode

    def calculate_ki_chroma(self, waveform, sr, hop_length):
        """Compute a chromagram transposed to a key-independent position.

        The CQT chromagram is min-max scaled, its key is detected (which also
        updates ``self.key`` / ``self.mode``), and the pitch-class axis is
        rolled so a major tonic lands on index 0 (C) and a minor tonic on
        index 9 (A). Each pitch-class row is then normalized along time.

        Args:
            waveform: Audio samples to analyse.
            sr: Sampling rate of ``waveform``.
            hop_length: Hop length for the chroma computation.

        Returns:
            The rolled, row-normalized chromagram.
        """
        chromagram = librosa.feature.chroma_cqt(y=waveform, sr=sr, hop_length=hop_length, bins_per_octave=24)

        # Guard the min-max scaling: a constant chromagram (e.g. silence)
        # would otherwise divide by zero and fill everything with NaN.
        low = chromagram.min()
        span = chromagram.max() - low
        if span > 0:
            chromagram = (chromagram - low) / span
        else:
            chromagram = np.zeros_like(chromagram)

        chroma_vals = np.sum(chromagram, axis=1)
        key, mode = self.detect_key(chroma_vals)
        key_idx = self._NOTE_NAMES.index(key)

        # Unary minus binds tighter than %, so the minor shift is
        # (-(key_idx + 3)) % 12, which moves the minor tonic to index 9.
        shift_amount = -key_idx if mode == 'major' else -(key_idx + 3) % 12
        return librosa.util.normalize(np.roll(chromagram, shift_amount, axis=0), axis=1)

    def extract_features(self):
        """Load the audio and populate every analysis attribute.

        ``combined_features`` stacks, row-wise, the RMS energy and the NMF
        activations of the mel spectrogram (3 components), chromagram (4),
        tempogram (3) and MFCCs (3).
        """
        self.y, self.sr = librosa.load(self.audio_path, sr=self.sr)
        # Harmonic part feeds the chroma; percussive part feeds onsets/beats.
        self.y_harm, self.y_perc = librosa.effects.hpss(self.y)

        self.spectrogram, _ = librosa.magphase(librosa.stft(self.y, hop_length=self.hop_length))
        self.rms = librosa.feature.rms(S=self.spectrogram, hop_length=self.hop_length).astype(np.float32)

        self.melspectrogram = librosa.feature.melspectrogram(
            y=self.y, sr=self.sr, n_mels=128, hop_length=self.hop_length
        ).astype(np.float32)
        self.mel_acts = librosa.decompose.decompose(self.melspectrogram, n_components=3, sort=True)[1].astype(np.float32)

        # calculate_ki_chroma also records the detected key and mode on self.
        self.chromagram = self.calculate_ki_chroma(self.y_harm, self.sr, self.hop_length).astype(np.float32)
        self.chroma_acts = librosa.decompose.decompose(self.chromagram, n_components=4, sort=True)[1].astype(np.float32)

        self.onset_env = librosa.onset.onset_strength(y=self.y_perc, sr=self.sr, hop_length=self.hop_length)

        # Compute the tempogram first, then clip it to its own 99th
        # percentile; the percentile cannot be taken from self.tempogram
        # because that attribute is still None at this point.
        raw_tempogram = librosa.feature.tempogram(
            onset_envelope=self.onset_env, sr=self.sr, hop_length=self.hop_length
        )
        self.tempogram = np.clip(raw_tempogram, 0, np.percentile(raw_tempogram, 99)).astype(np.float32)
        self.tempogram_acts = librosa.decompose.decompose(self.tempogram, n_components=3, sort=True)[1].astype(np.float32)

        self.mfccs = librosa.feature.mfcc(y=self.y, sr=self.sr, n_mfcc=13, hop_length=self.hop_length).astype(np.float32)
        # NMF only accepts non-negative input, while MFCCs are signed: shift
        # a copy up by the minimum and keep self.mfccs itself untouched.
        mfcc_floor = self.mfccs.min()
        mfcc_nonneg = self.mfccs - mfcc_floor if mfcc_floor < 0 else self.mfccs
        self.mfcc_acts = librosa.decompose.decompose(mfcc_nonneg, n_components=3, sort=True)[1].astype(np.float32)

        self.combined_features = np.vstack(
            [self.rms, self.mel_acts, self.chroma_acts, self.tempogram_acts, self.mfcc_acts]
        )
        self.n_frames = self.combined_features.shape[1]

        self.tempo, self.beats = librosa.beat.beat_track(y=self.y_perc, sr=self.sr, hop_length=self.hop_length)

        # fix_frames expects a 1-D array of frame indices; it clips them to
        # [0, n_frames] and adds both end points.
        # NOTE(review): this previously windowed the beats with
        # librosa.util.frame(frame_length=MAX_METERS) first, which yields a
        # 2-D array fix_frames cannot pad and fails on tracks with fewer
        # beats than the window -- confirm a beat-level grid is what is wanted.
        self.meter_grid = librosa.util.fix_frames(self.beats, x_min=0, x_max=self.n_frames)

        # self.key / self.mode keep the values detected on the untransposed
        # chromagram; re-detecting on the key-shifted, row-normalized
        # self.chromagram would overwrite them with the shifted key.

    def get_features(self):
        """Run the extraction and return its results.

        Returns:
            ``(combined_features, n_frames, tempo, beats, meter_grid, key,
            mode)``.
        """
        self.extract_features()
        return (
            self.combined_features,
            self.n_frames,
            self.tempo,
            self.beats,
            self.meter_grid,
            self.key,
            self.mode,
        )
|
|
|
|
def load_model(model_path=MODEL_PATH):
    """Load a saved Keras model from disk.

    Args:
        model_path: Location of the saved model file.

    Returns:
        The object returned by ``tf.keras.models.load_model``.
    """
    model = tf.keras.models.load_model(model_path)
    return model
|
|
|
|
def predict_chorus(audio_features, model):
    """Extract features from a track and run the model on them.

    Only the feature matrix from ``get_features()`` is used. It is cut to
    the first ``MAX_FRAMES`` columns, given a leading batch axis, and
    standardized with a freshly fitted ``StandardScaler`` applied to the
    matrix flattened to 2-D on its last axis.

    Args:
        audio_features: Object whose ``get_features()`` returns a 7-tuple
            starting with the feature matrix.
        model: Object exposing ``predict``.

    Returns:
        Whatever ``model.predict`` returns for the prepared batch.
    """
    # The remaining six values (frame count, tempo, beats, grid, key, mode)
    # are not needed for prediction.
    feature_matrix, _, _, _, _, _, _ = audio_features.get_features()

    batch = np.expand_dims(feature_matrix[:, :MAX_FRAMES], axis=0)

    # The scaler works on 2-D data: flatten, standardize, restore the shape.
    flat = batch.reshape(-1, batch.shape[-1])
    scaled = StandardScaler().fit_transform(flat).reshape(batch.shape)

    return model.predict(scaled)
|
|
|
|
def plot_predictions(predictions, title):
    """Render the chorus-probability curve in the Streamlit page.

    Args:
        predictions: Indexable whose first element is the sequence to plot.
        title: Title shown above the plot.
    """
    # Draw on an explicit Figure instead of pyplot's implicit global one, so
    # the exact figure is handed to Streamlit and can be released afterwards.
    fig, ax = plt.subplots(figsize=(10, 4))
    try:
        ax.plot(predictions[0], label='Chorus Probability')
        ax.set_title(title)
        ax.set_xlabel('Frame')
        ax.set_ylabel('Probability')
        ax.legend()
        st.pyplot(fig)
    finally:
        # Without this, each rerun of the script leaves another open figure.
        plt.close(fig)
|
|
|
|
def main():
    """Streamlit entry point: ask for a URL, run the pipeline, plot the result.

    The temporary audio directory is removed once processing ends, whether
    it succeeded or raised.
    """
    st.title("Chorus Finder")
    st.write("Upload a YouTube URL to find the chorus in the song.")
    url = st.text_input("YouTube URL")

    if not st.button("Find Chorus"):
        return

    if not url:
        st.error("Please enter a valid YouTube URL")
        return

    try:
        audio_file, video_title = extract_audio(url)
        if not audio_file:
            # extract_audio has already reported the problem to the user.
            return

        strip_silence(audio_file)
        audio_features = AudioFeature(audio_file)
        model = load_model()
        predictions = predict_chorus(audio_features, model)
        plot_predictions(predictions, video_title)
    finally:
        # Clean up even when a step above fails; ignore_errors covers the
        # case where the directory was never created.
        shutil.rmtree(AUDIO_TEMP_PATH, ignore_errors=True)


if __name__ == "__main__":
    main()
|
|
|
|
|