dennisvdang committed on
Commit
856e7cd
1 Parent(s): 9fd273a

Upload 2 files

Files changed (2)
  1. app.py +174 -0
  2. requirements.txt +13 -0
app.py ADDED
@@ -0,0 +1,174 @@
+ import os
+ os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # Suppress TensorFlow logs (must be set before importing TensorFlow)
+ import tensorflow as tf
+ tf.get_logger().setLevel('ERROR')  # Suppress TensorFlow ERROR logs
+ import warnings
+ warnings.filterwarnings("ignore")  # Suppress all warnings
+
+ from functools import reduce
+ import shutil
+ import librosa
+ import numpy as np
+ from matplotlib import pyplot as plt
+ from pydub import AudioSegment
+ from pydub.silence import detect_nonsilent
+ from pytube import YouTube
+ from sklearn.preprocessing import StandardScaler
+ import streamlit as st
+
+
+ # Constants
+ SR = 12000
+ HOP_LENGTH = 128
+ MAX_FRAMES = 300
+ MAX_METERS = 201
+ N_FEATURES = 15
+ MODEL_PATH = "models/CRNN/best_model_V3.h5"
+ AUDIO_TEMP_PATH = "output/temp"
+
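+ # At SR = 12000 and HOP_LENGTH = 128, one feature frame spans 128/12000 ≈ 10.7 ms,
+ # so MAX_FRAMES = 300 corresponds to roughly 3.2 seconds of audio.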
+ def extract_audio(url, output_path=AUDIO_TEMP_PATH):
+     try:
+         yt = YouTube(url)
+         video_title = yt.title
+         audio_stream = yt.streams.filter(only_audio=True).first()
+         if audio_stream:
+             os.makedirs(output_path, exist_ok=True)
+             out_file = audio_stream.download(output_path)
+             base, _ = os.path.splitext(out_file)
+             audio_file = base + '.mp3'
+             if os.path.exists(audio_file):
+                 os.remove(audio_file)
+             os.rename(out_file, audio_file)  # Gives the file an .mp3 extension; the stream is not re-encoded
+             return audio_file, video_title
+         else:
+             st.error("No audio stream found")
+             return None, None
+     except Exception as e:
+         st.error(f"An error occurred: {e}")
+         return None, None
+
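+ # strip_silence concatenates the non-silent ranges found by pydub, dropping any
+ # gap quieter than -50 dBFS that lasts at least 500 ms, and overwrites the file.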
+ def strip_silence(audio_path):
+     sound = AudioSegment.from_file(audio_path)
+     nonsilent_ranges = detect_nonsilent(sound, min_silence_len=500, silence_thresh=-50)
+     stripped = reduce(lambda acc, val: acc + sound[val[0]:val[1]], nonsilent_ranges, AudioSegment.empty())
+     stripped.export(audio_path, format='mp3')
+
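+ # AudioFeature bundles the per-song signal analysis: loading, harmonic/percussive
+ # separation, spectral and rhythmic features, key detection, and beat tracking.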
+ class AudioFeature:
+     def __init__(self, audio_path, sr=SR, hop_length=HOP_LENGTH):
+         self.audio_path = audio_path
+         self.sr = sr
+         self.hop_length = hop_length
+         self.y = None
+         self.y_harm, self.y_perc = None, None
+         self.spectrogram = None
+         self.rms = None
+         self.melspectrogram = None
+         self.mel_acts = None
+         self.chromagram = None
+         self.chroma_acts = None
+         self.onset_env = None
+         self.tempogram = None
+         self.tempogram_acts = None
+         self.mfccs = None
+         self.mfcc_acts = None
+         self.combined_features = None
+         self.n_frames = None
+         self.tempo = None
+         self.beats = None
+         self.meter_grid = None
+         self.key, self.mode = None, None
+
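+     # Krumhansl-Schmuckler style key finding: correlate the summed chroma vector
+     # with all 12 rotations of the major and minor key profiles and keep the
+     # best-matching tonic and mode.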
+     def detect_key(self, chroma_vals):
+         note_names = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B']
+         major_profile = np.array([6.35, 2.23, 3.48, 2.33, 4.38, 4.09, 2.52, 5.19, 2.39, 3.66, 2.29, 2.88])
+         minor_profile = np.array([6.33, 2.68, 3.52, 5.38, 2.60, 3.53, 2.54, 4.75, 3.98, 2.69, 3.34, 3.17])
+         major_profile /= np.linalg.norm(major_profile)
+         minor_profile /= np.linalg.norm(minor_profile)
+
+         major_correlations = [np.corrcoef(chroma_vals, np.roll(major_profile, i))[0, 1] for i in range(12)]
+         minor_correlations = [np.corrcoef(chroma_vals, np.roll(minor_profile, i))[0, 1] for i in range(12)]
+
+         max_major_idx = np.argmax(major_correlations)
+         max_minor_idx = np.argmax(minor_correlations)
+
+         self.mode = 'major' if major_correlations[max_major_idx] > minor_correlations[max_minor_idx] else 'minor'
+         self.key = note_names[max_major_idx if self.mode == 'major' else max_minor_idx]
+         return self.key, self.mode
+
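+     # Key-invariant chroma: roll the chromagram so the detected tonic lands on
+     # row 0 (minor keys are aligned to their relative major via the +3 offset),
+     # making harmonic features comparable across songs in different keys.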
+     def calculate_ki_chroma(self, waveform, sr, hop_length):
+         chromagram = librosa.feature.chroma_cqt(y=waveform, sr=sr, hop_length=hop_length, bins_per_octave=24)
+         chromagram = (chromagram - chromagram.min()) / (chromagram.max() - chromagram.min())
+         chroma_vals = np.sum(chromagram, axis=1)
+         key, mode = self.detect_key(chroma_vals)
+         key_idx = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B'].index(key)
+         shift_amount = -key_idx if mode == 'major' else -(key_idx + 3) % 12
+         return librosa.util.normalize(np.roll(chromagram, shift_amount, axis=0), axis=1)
+
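+     # Feature pipeline: HPSS separates harmonic (chroma) from percussive
+     # (onset/tempo) content; low-rank activations from librosa.decompose
+     # compress each feature matrix before stacking with RMS energy.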
+     def extract_features(self):
+         self.y, self.sr = librosa.load(self.audio_path, sr=self.sr)
+         self.y_harm, self.y_perc = librosa.effects.hpss(self.y)
+         self.spectrogram, _ = librosa.magphase(librosa.stft(self.y, hop_length=self.hop_length))
+         self.rms = librosa.feature.rms(S=self.spectrogram, hop_length=self.hop_length).astype(np.float32)
+         self.melspectrogram = librosa.feature.melspectrogram(y=self.y, sr=self.sr, n_mels=128, hop_length=self.hop_length).astype(np.float32)
+         self.mel_acts = librosa.decompose.decompose(self.melspectrogram, n_components=3, sort=True)[1].astype(np.float32)
+         self.chromagram = self.calculate_ki_chroma(self.y_harm, self.sr, self.hop_length).astype(np.float32)
+         self.chroma_acts = librosa.decompose.decompose(self.chromagram, n_components=4, sort=True)[1].astype(np.float32)
+         self.onset_env = librosa.onset.onset_strength(y=self.y_perc, sr=self.sr, hop_length=self.hop_length)
+         tempogram = librosa.feature.tempogram(onset_envelope=self.onset_env, sr=self.sr, hop_length=self.hop_length)
+         self.tempogram = np.clip(tempogram, 0, np.percentile(tempogram, 99)).astype(np.float32)  # Clip outliers at the 99th percentile
+         self.tempogram_acts = librosa.decompose.decompose(self.tempogram, n_components=3, sort=True)[1].astype(np.float32)
+         self.mfccs = librosa.feature.mfcc(y=self.y, sr=self.sr, n_mfcc=13, hop_length=self.hop_length).astype(np.float32)
+         self.mfcc_acts = librosa.decompose.decompose(self.mfccs, n_components=3, sort=True)[1].astype(np.float32)
+         self.combined_features = np.vstack([self.rms, self.mel_acts, self.chroma_acts, self.tempogram_acts, self.mfcc_acts])
+         self.n_frames = self.combined_features.shape[1]
+         self.tempo, self.beats = librosa.beat.beat_track(y=self.y_perc, sr=self.sr, hop_length=self.hop_length)
+         self.meter_grid = librosa.util.fix_frames(librosa.util.frame(self.beats, frame_length=MAX_METERS, hop_length=1), x_min=0, x_max=self.n_frames)
+         self.key, self.mode = self.detect_key(np.sum(self.chromagram, axis=1))
+
+     def get_features(self):
+         self.extract_features()
+         return self.combined_features, self.n_frames, self.tempo, self.beats, self.meter_grid, self.key, self.mode
+
+ def load_model(model_path=MODEL_PATH):
+     return tf.keras.models.load_model(model_path)
+
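+ # Inference: the stacked features are truncated to MAX_FRAMES, standardized, and
+ # passed through the CRNN, yielding a per-frame chorus probability.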
+ def predict_chorus(audio_features, model):
+     features, n_frames, tempo, beats, meter_grid, key, mode = audio_features.get_features()
+     features = features[:, :MAX_FRAMES]
+     features = np.expand_dims(features, axis=0)
+     scaler = StandardScaler()
+     features = scaler.fit_transform(features.reshape(-1, features.shape[-1])).reshape(features.shape)
+     predictions = model.predict(features)
+     return predictions
+
+ def plot_predictions(predictions, title):
+     plt.figure(figsize=(10, 4))
+     plt.plot(predictions[0], label='Chorus Probability')
+     plt.title(title)
+     plt.xlabel('Frame')
+     plt.ylabel('Probability')
+     plt.legend()
+     st.pyplot(plt)
+
+ def main():
+     st.title("Chorus Finder")
+     st.write("Enter a YouTube URL to find the chorus in the song.")
+     url = st.text_input("YouTube URL")
+     if st.button("Find Chorus"):
+         if url:
+             audio_file, video_title = extract_audio(url)
+             if audio_file:
+                 strip_silence(audio_file)
+                 audio_features = AudioFeature(audio_file)
+                 model = load_model()
+                 predictions = predict_chorus(audio_features, model)
+                 plot_predictions(predictions, video_title)
+                 shutil.rmtree(AUDIO_TEMP_PATH)  # Remove downloaded audio once done
+         else:
+             st.error("Please enter a valid YouTube URL")
+
+ if __name__ == "__main__":
+     main()
+
requirements.txt ADDED
@@ -0,0 +1,13 @@
+ # Python version: >=3.7
+ ffmpeg-python==0.2.0
+ keras==2.1.0
+ librosa==0.10.1
+ matplotlib==3.7.2
+ numpy==1.24.4
+ pandas==2.0.3
+ pydub==0.25.1
+ pytube==15.0.0
+ scikit-learn==1.3.0
+ scipy==1.9.1
+ streamlit
+ tensorflow==2.1.0
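
To try the app locally (assuming ffmpeg is installed for pydub and the CRNN weights exist at models/CRNN/best_model_V3.h5), install the pinned requirements and start it with "streamlit run app.py".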