import os

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # Suppress TensorFlow logs (must be set before importing TensorFlow)

import tensorflow as tf

tf.get_logger().setLevel('ERROR')  # Suppress TensorFlow ERROR logs

import warnings

warnings.filterwarnings("ignore")  # Suppress all warnings

import shutil
from functools import reduce

import librosa
import numpy as np
import streamlit as st
from matplotlib import pyplot as plt
from pydub import AudioSegment
from pydub.silence import detect_nonsilent
from pytube import YouTube
from sklearn.preprocessing import StandardScaler

# Constants
SR = 12000
HOP_LENGTH = 128
MAX_FRAMES = 300
MAX_METERS = 201
N_FEATURES = 15
MODEL_PATH = "models/CRNN/best_model_V3.h5"
AUDIO_TEMP_PATH = "output/temp"


def extract_audio(url, output_path=AUDIO_TEMP_PATH):
    """Download the audio stream of a YouTube video and save it as an MP3 file."""
    try:
        yt = YouTube(url)
        video_title = yt.title
        audio_stream = yt.streams.filter(only_audio=True).first()
        if audio_stream:
            os.makedirs(output_path, exist_ok=True)
            out_file = audio_stream.download(output_path)
            base, _ = os.path.splitext(out_file)
            audio_file = base + '.mp3'
            if os.path.exists(audio_file):
                os.remove(audio_file)
            os.rename(out_file, audio_file)
            return audio_file, video_title
        else:
            st.error("No audio stream found")
            return None, None
    except Exception as e:
        st.error(f"An error occurred: {e}")
        return None, None


def strip_silence(audio_path):
    """Remove silent segments from the audio file and overwrite it in place."""
    sound = AudioSegment.from_file(audio_path)
    nonsilent_ranges = detect_nonsilent(sound, min_silence_len=500, silence_thresh=-50)
    stripped = reduce(lambda acc, val: acc + sound[val[0]:val[1]], nonsilent_ranges, AudioSegment.empty())
    stripped.export(audio_path, format='mp3')


class AudioFeature:
    """Extracts and holds the audio features used as model input."""

    def __init__(self, audio_path, sr=SR, hop_length=HOP_LENGTH):
        self.audio_path = audio_path
        self.sr = sr
        self.hop_length = hop_length
        self.y = None
        self.y_harm, self.y_perc = None, None
        self.spectrogram = None
        self.rms = None
        self.melspectrogram = None
        self.mel_acts = None
        self.chromagram = None
        self.chroma_acts = None
        self.onset_env = None
        self.tempogram = None
        self.tempogram_acts = None
        self.mfccs = None
        self.mfcc_acts = None
        self.combined_features = None
        self.n_frames = None
        self.tempo = None
        self.beats = None
        self.meter_grid = None
        self.key, self.mode = None, None

    def detect_key(self, chroma_vals):
        """Estimate key and mode by correlating chroma values with major/minor key profiles."""
        note_names = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B']
        major_profile = np.array([6.35, 2.23, 3.48, 2.33, 4.38, 4.09, 2.52, 5.19, 2.39, 3.66, 2.29, 2.88])
        minor_profile = np.array([6.33, 2.68, 3.52, 5.38, 2.60, 3.53, 2.54, 4.75, 3.98, 2.69, 3.34, 3.17])
        major_profile /= np.linalg.norm(major_profile)
        minor_profile /= np.linalg.norm(minor_profile)
        major_correlations = [np.corrcoef(chroma_vals, np.roll(major_profile, i))[0, 1] for i in range(12)]
        minor_correlations = [np.corrcoef(chroma_vals, np.roll(minor_profile, i))[0, 1] for i in range(12)]
        max_major_idx = np.argmax(major_correlations)
        max_minor_idx = np.argmax(minor_correlations)
        self.mode = 'major' if major_correlations[max_major_idx] > minor_correlations[max_minor_idx] else 'minor'
        self.key = note_names[max_major_idx if self.mode == 'major' else max_minor_idx]
        return self.key, self.mode

    def calculate_ki_chroma(self, waveform, sr, hop_length):
        """Compute a key-invariant chromagram by rolling the detected key to a common root."""
        chromagram = librosa.feature.chroma_cqt(y=waveform, sr=sr, hop_length=hop_length, bins_per_octave=24)
        chromagram = (chromagram - chromagram.min()) / (chromagram.max() - chromagram.min())
        chroma_vals = np.sum(chromagram, axis=1)
        key, mode = self.detect_key(chroma_vals)
        key_idx = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B'].index(key)
        shift_amount = -key_idx if mode == 'major' else -(key_idx + 3) % 12
        return librosa.util.normalize(np.roll(chromagram, shift_amount, axis=0), axis=1)

    def extract_features(self):
        """Load the audio and compute the combined feature matrix expected by the model."""
        self.y, self.sr = librosa.load(self.audio_path, sr=self.sr)
        self.y_harm, self.y_perc = librosa.effects.hpss(self.y)
        self.spectrogram, _ = librosa.magphase(librosa.stft(self.y, hop_length=self.hop_length))
        self.rms = librosa.feature.rms(S=self.spectrogram, hop_length=self.hop_length).astype(np.float32)
        self.melspectrogram = librosa.feature.melspectrogram(y=self.y, sr=self.sr, n_mels=128, hop_length=self.hop_length).astype(np.float32)
        self.mel_acts = librosa.decompose.decompose(self.melspectrogram, n_components=3, sort=True)[1].astype(np.float32)
        self.chromagram = self.calculate_ki_chroma(self.y_harm, self.sr, self.hop_length).astype(np.float32)
        self.chroma_acts = librosa.decompose.decompose(self.chromagram, n_components=4, sort=True)[1].astype(np.float32)
        self.onset_env = librosa.onset.onset_strength(y=self.y_perc, sr=self.sr, hop_length=self.hop_length)
        # Compute the tempogram first, then clip its outliers. The original clipped against
        # self.tempogram before it was assigned, which fails on the first call.
        self.tempogram = librosa.feature.tempogram(onset_envelope=self.onset_env, sr=self.sr, hop_length=self.hop_length)
        self.tempogram = np.clip(self.tempogram, 0, np.percentile(self.tempogram, 99)).astype(np.float32)
        self.tempogram_acts = librosa.decompose.decompose(self.tempogram, n_components=3, sort=True)[1].astype(np.float32)
        self.mfccs = librosa.feature.mfcc(y=self.y, sr=self.sr, n_mfcc=13, hop_length=self.hop_length).astype(np.float32)
        self.mfcc_acts = librosa.decompose.decompose(self.mfccs, n_components=3, sort=True)[1].astype(np.float32)
        self.combined_features = np.vstack([self.rms, self.mel_acts, self.chroma_acts, self.tempogram_acts, self.mfcc_acts])
        self.n_frames = self.combined_features.shape[1]
        self.tempo, self.beats = librosa.beat.beat_track(y=self.y_perc, sr=self.sr, hop_length=self.hop_length)
        self.meter_grid = librosa.util.fix_frames(librosa.util.frame(self.beats, frame_length=MAX_METERS, hop_length=1), x_min=0, x_max=self.n_frames)
        self.key, self.mode = self.detect_key(np.sum(self.chromagram, axis=1))

    def get_features(self):
        self.extract_features()
        return self.combined_features, self.n_frames, self.tempo, self.beats, self.meter_grid, self.key, self.mode


def load_model(model_path=MODEL_PATH):
    return tf.keras.models.load_model(model_path)


def predict_chorus(audio_features, model):
    """Standardize the features and return the model's per-frame chorus probabilities."""
    features, n_frames, tempo, beats, meter_grid, key, mode = audio_features.get_features()
    features = features[:, :MAX_FRAMES]
    features = np.expand_dims(features, axis=0)
    scaler = StandardScaler()
    features = scaler.fit_transform(features.reshape(-1, features.shape[-1])).reshape(features.shape)
    predictions = model.predict(features)
    return predictions


def plot_predictions(predictions, title):
    plt.figure(figsize=(10, 4))
    plt.plot(predictions[0], label='Chorus Probability')
    plt.title(title)
    plt.xlabel('Frame')
    plt.ylabel('Probability')
    plt.legend()
    st.pyplot(plt)


def main():
    st.title("Chorus Finder")
    st.write("Upload a YouTube URL to find the chorus in the song.")
    url = st.text_input("YouTube URL")
    if st.button("Find Chorus"):
        if url:
            audio_file, video_title = extract_audio(url)
            if audio_file:
                strip_silence(audio_file)
                audio_features = AudioFeature(audio_file)
                model = load_model()
                predictions = predict_chorus(audio_features, model)
                plot_predictions(predictions, video_title)
                shutil.rmtree(AUDIO_TEMP_PATH)
        else:
            st.error("Please enter a valid YouTube URL")


if __name__ == "__main__":
    main()
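# Usage sketch (assumption: this file is saved as app.py; the filename is not given in the source):
#   streamlit run app.py
# The app prompts for a YouTube URL, downloads and silence-strips the audio, and plots the
# per-frame chorus probabilities produced by the CRNN model loaded from MODEL_PATH.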