Karlo Pintaric committed
Commit fdc1efd
1 Parent(s): 48fb9cc

Upload 25 files

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ src/api/test_files/test.wav filter=lfs diff=lfs merge=lfs -text
DockerFile.backend ADDED
@@ -0,0 +1,17 @@
+ # Use an official Python runtime as the base image
+ FROM python:3.9-slim
+
+ # Set the working directory in the container
+ WORKDIR /app
+
+ # Copy the setup.py file into the container
+ COPY ./setup.py .
+
+ # Copy the source and install the package with its dependencies
+ COPY ./src ./src
+
+ RUN pip install --no-cache-dir .[backend] torch==1.13.1+cpu torchvision==0.14.1+cpu torchaudio==0.13.1 --extra-index-url https://download.pytorch.org/whl/cpu
+
+ EXPOSE 7860
+
+ CMD ["uvicorn", "src.api.main:app", "--host", "0.0.0.0", "--port", "7860"]
setup.py ADDED
@@ -0,0 +1,39 @@
+ from setuptools import setup, find_packages
+
+ setup(
+     name="lumen-irmas",
+     version="0.1.0",
+     description="LUMEN Data Science nagradni zadatak",
+     author="Karlo Pintaric i Tatjana Cigula",
+     packages=find_packages(include=["src"]),
+     python_requires=">=3.9",
+     install_requires=[
+         "numpy==1.23.5",
+         "transformers==4.27.4",
+     ],
+     extras_require={
+         "backend": ["fastapi==0.95.1", "uvicorn==0.21.1", "pydantic==1.10.7", "python-multipart==0.0.6"],
+         "frontend": ["streamlit==1.21.0", "requests==2.28.2", "soundfile==0.12.1"],
+         "user": [
+             "lumen-irmas[backend]",
+             "lumen-irmas[frontend]",
+             "torch==1.13.1",
+             "torchaudio==0.13.1",
+             "torchvision==0.14.1",
+         ],
+         "dev": [
+             "lumen-irmas[user]",
+             "librosa==0.10.0.post2",
+             "pandas==1.5.3",
+             "scikit-learn==1.2.2",
+             "tqdm==4.65.0",
+             "wandb==0.14.2",
+             "pytest==7.3.1",
+             "joblib==1.2.0",
+             "PyYAML==6.0",
+             "flake8==6.0.0",
+             "isort==5.12.0",
+             "black==23.3.0"
+         ]
+     },
+ )
src/__init__.py ADDED
File without changes
src/api/ModelService.py ADDED
@@ -0,0 +1,93 @@
+ from pathlib import Path
+
+ import numpy as np
+ import torch
+ from torchvision import transforms
+
+ from src.modeling import ASTPretrained, FeatureExtractor, PreprocessPipeline, StudentAST
+
+ MODELS_FOLDER = Path(__file__).parent / "models"
+
+ CLASSES = ["tru", "sax", "vio", "gac", "org", "cla", "flu", "voi", "gel", "cel", "pia"]
+
+
+ def load_model(model_type: str):
+     """
+     Loads a pre-trained AST model of the specified type.
+
+     :param model_type: The type of model to load
+     :type model_type: str
+     :return: The loaded pre-trained AST model.
+     :rtype: ASTPretrained
+     """
+
+     if model_type == "accuracy":
+         model = ASTPretrained(n_classes=11, download_weights=False)
+         model.load_state_dict(torch.load(f"{MODELS_FOLDER}/acc_model_ast.pth", map_location=torch.device("cpu")))
+     else:
+         model = StudentAST(n_classes=11, hidden_size=192, num_heads=3)
+         model.load_state_dict(torch.load(f"{MODELS_FOLDER}/speed_model_ast.pth", map_location=torch.device("cpu")))
+     model.eval()
+     return model
+
+
+ def load_labels():
+     """
+     Loads a dictionary of class labels for the AST model.
+
+     :return: A dictionary where the keys are the class indices and the values are the class labels.
+     :rtype: Dict[int, str]
+     """
+
+     labels = {i: CLASSES[i] for i in range(len(CLASSES))}
+     return labels
+
+
+ def load_thresholds(model_type: str):
+     """
+     Loads the prediction thresholds for the AST model.
+
+     :return: The prediction thresholds for each class.
+     :rtype: np.ndarray
+     """
+     if model_type == "accuracy":
+         thresholds = np.load(f"{MODELS_FOLDER}/acc_model_thresh.npy", allow_pickle=True)
+     else:
+         thresholds = np.load(f"{MODELS_FOLDER}/speed_model_thresh.npy", allow_pickle=True)
+     return thresholds
+
+
+ class ModelServiceAST:
+     def __init__(self, model_type: str):
+         """
+         Initializes a ModelServiceAST instance with the specified model type.
+
+         :param model_type: The type of model to load
+         :type model_type: str
+         """
+
+         self.model = load_model(model_type)
+         self.labels = load_labels()
+         self.thresholds = load_thresholds(model_type)
+         self.transform = transforms.Compose([PreprocessPipeline(target_sr=16000), FeatureExtractor(sr=16000)])
+
+     def get_prediction(self, audio):
+         """
+         Gets the binary predictions for the given audio file.
+
+         :param audio: The file object for the input audio to make predictions for.
+         :type audio: file object
+         :return: A dictionary where the keys are the class labels and the values are binary predictions (0 or 1).
+         :rtype: Dict[str, int]
+         """
+         processed = self.transform(audio)
+         with torch.no_grad():
+             # Don't forget to transpose the output to seq_len x num_features!!!
+             output = torch.sigmoid(self.model(processed.mT))
+             output = output.squeeze().numpy().astype(float)
+
+         binary_predictions = {}
+         for i, label in enumerate(CLASSES):
+             binary_predictions[label] = int(output[i] >= self.thresholds[i])
+
+         return binary_predictions
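
For orientation, here is a minimal sketch (not part of the commit) of how `ModelServiceAST` might be used directly, assuming the repository root is on `PYTHONPATH` and the LFS weights under `src/api/models` have been pulled:

```python
# Illustrative usage sketch; the service is normally instantiated by src/api/main.py.
from src.api.ModelService import ModelServiceAST

service = ModelServiceAST(model_type="speed")  # "accuracy" loads the larger ASTPretrained checkpoint

# get_prediction expects a file-like object, as the API passes UploadFile.file
with open("src/api/test_files/test.wav", "rb") as audio_file:
    predictions = service.get_prediction(audio_file)

# Maps each instrument code to 0 or 1, e.g. {"tru": 0, "sax": 1, ...}
print(predictions)
```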
src/api/__init__.py ADDED
File without changes
src/api/main.py ADDED
@@ -0,0 +1,133 @@
+ import os
+ import logging
+ from logging.handlers import RotatingFileHandler
+ from pathlib import Path
+ from typing import Dict
+
+
+ from fastapi import Depends, FastAPI, File, UploadFile
+ from fastapi.exceptions import RequestValidationError
+ from fastapi.responses import JSONResponse
+ from src.api.ModelService import ModelServiceAST
+ from pydantic import BaseModel, validator
+
+ LOG_SAVE_DIR = Path(__file__).parent / "logs"
+ if not os.path.exists(LOG_SAVE_DIR):
+     os.makedirs(LOG_SAVE_DIR)
+
+ ml_models = {}
+ ml_models["Accuracy"] = ModelServiceAST(model_type="accuracy")
+ ml_models["Speed"] = ModelServiceAST(model_type="speed")
+
+ app = FastAPI()
+
+ # Define the allowed file formats
+ ALLOWED_FILE_FORMATS = ["wav"]
+
+ # Configure logging
+ logger = logging.getLogger(__name__)
+ logger.setLevel(logging.DEBUG)
+
+ # Create a rotating file handler to save logs to a file
+ handler = RotatingFileHandler(f"{LOG_SAVE_DIR}/app.log", maxBytes=100000, backupCount=5)
+ handler.setLevel(logging.DEBUG)
+
+ # Define the log format
+ formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
+ handler.setFormatter(formatter)
+
+ # Add the handler to the logger
+ logger.addHandler(handler)
+
+
+ class InvalidFileTypeError(Exception):
+     def __init__(self):
+         self.message = "Only wav files are supported"
+         super().__init__(self.message)
+
+
+ class InvalidModelError(Exception):
+     def __init__(self):
+         self.message = "Selected model doesn't exist"
+         super().__init__(self.message)
+
+
+ class MissingFileError(Exception):
+     def __init__(self):
+         self.message = "File cannot be None"
+         super().__init__(self.message)
+
+
+ class PredictionRequest(BaseModel):
+     model_name: str
+
+     @validator("model_name")
+     @classmethod
+     def valid_model(cls, v):
+         if v not in ml_models.keys():
+             raise InvalidModelError
+         return v
+
+
+ class PredictionResult(BaseModel):
+     prediction: Dict[str, Dict[str, int]]
+
+
+ @app.exception_handler(RequestValidationError)
+ def validation_exception_handler(request, ex):
+     logger.error(f"Request validation error: {ex}")
+     return JSONResponse(content={"error": "Bad Request", "detail": ex.errors()}, status_code=400)
+
+
+ @app.exception_handler(InvalidFileTypeError)
+ def filetype_exception_handler(request, ex):
+     logger.error(f"Invalid file type error: {ex}")
+     return JSONResponse(content={"error": "Bad Request", "detail": ex.message}, status_code=400)
+
+
+ @app.exception_handler(InvalidModelError)
+ def model_exception_handler(request, ex):
+     logger.error(f"Invalid model error: {ex}")
+     return JSONResponse(content={"error": "Bad Request", "detail": ex.message}, status_code=400)
+
+
+ @app.exception_handler(MissingFileError)
+ def handle_missing_file_error(request, ex):
+     logger.error(f"Missing file error: {ex}")
+     return JSONResponse(content={"error": "Bad Request", "detail": ex.message}, status_code=400)
+
+
+ @app.exception_handler(Exception)
+ def handle_exceptions(request, ex):
+     logger.exception(f"Internal server error: {ex}")
+     # If an exception occurs during processing, return a JSON response with an error message
+     return JSONResponse(content={"error": "Internal Server Error", "detail": str(ex)}, status_code=500)
+
+
+ @app.get("/")
+ def root():
+     logger.info("Received request to root endpoint")
+     return {"message": "Welcome to my API. Go to /docs to view the documentation."}
+
+
+ @app.get("/health-check")
+ def health_check():
+     """
+     Health check endpoint to verify if the API is running.
+     """
+     logger.info("Health check endpoint was hit")
+     return {"status": "API is running"}
+
+
+ @app.post("/predict")
+ def predict(request: PredictionRequest = Depends(), file: UploadFile = File(...)) -> PredictionResult:  # noqa
+     if not file:
+         raise MissingFileError
+     if file.filename.split(".")[-1].lower() not in ALLOWED_FILE_FORMATS:
+         raise InvalidFileTypeError
+     logger.info(f"Prediction request received: {request}")
+     output = ml_models[request.model_name].get_prediction(file.file)
+     logger.info(f"Prediction result: {output}")
+     prediction_result = PredictionResult(prediction={file.filename: output})
+
+     return prediction_result
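
A minimal sketch (not part of the commit) of calling the `/predict` endpoint from a client, assuming the backend is reachable at `http://localhost:7860` as exposed by DockerFile.backend; it mirrors the request that the frontend's `predict` helper sends:

```python
import requests

# model_name is validated against ml_models ("Accuracy" or "Speed") by PredictionRequest
with open("src/api/test_files/test.wav", "rb") as f:
    response = requests.post(
        "http://localhost:7860/predict",
        params={"model_name": "Accuracy"},
        files={"file": ("test.wav", f)},
        timeout=300,
    )

response.raise_for_status()
print(response.json())  # {"prediction": {"test.wav": {"tru": 0, "sax": 1, ...}}}
```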
src/api/main_test.py ADDED
@@ -0,0 +1,63 @@
+ import io
+ import sys
+ from pathlib import Path
+
+ import soundfile as sf
+ from fastapi.testclient import TestClient
+
+ sys.path.append(".")
+
+ from src.api.main import app  # noqa
+
+ TEST_FILES_DIR = Path(__file__).parent / "test_files"
+ TEST_WAV_FILE = TEST_FILES_DIR / "test.wav"
+
+ client = TestClient(app)
+
+
+ def test_health_check():
+     response = client.get("/health-check")
+     assert response.status_code == 200
+     assert response.json() == {"status": "API is running"}
+
+
+ def test_predict_valid_cut_file():
+     audio_data, sample_rate = sf.read(TEST_WAV_FILE)
+     audio_file = io.BytesIO()
+     sf.write(audio_file, audio_data, sample_rate, format="wav")
+     audio_file = ("test.wav", audio_file)
+
+     file = {"file": audio_file}
+     request_data = {"model_name": "Accuracy"}
+     # Make a request to the /predict endpoint
+     response = client.post("/predict", params=request_data, files=file)
+
+     # Check that the response is successful
+     assert response.status_code == 200
+     assert response.json()["prediction"]["test.wav"] is not None
+
+
+ def test_predict_valid_file():
+     with open(TEST_WAV_FILE, "rb") as file:
+         data = {"model_name": "Accuracy"}
+         response = client.post("/predict", params=data, files={"file": file})
+     assert response.status_code == 200
+     assert response.json()["prediction"]["test.wav"] is not None
+
+
+ def test_predict_invalid_file_type():
+     file_data = io.BytesIO(b"dummy txt data")
+     file = ("test.txt", file_data)
+     data = {"model_name": "Accuracy"}
+     response = client.post("/predict", params=data, files={"file": file})
+     assert response.status_code == 400
+     assert "Only wav files are supported" in response.json()["detail"]
+
+
+ def test_predict_invalid_model():
+     file_data = io.BytesIO(b"dummy wav data")
+     file = ("test.wav", file_data)
+     data = {"model_name": "InvalidModel"}
+     response = client.post("/predict", params=data, files={"file": file})
+     assert response.status_code == 400
+     assert "Selected model doesn't exist" in response.json()["detail"]
src/api/models/acc_model_ast.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2305b1d04ed918b6d6428f86dfde162d6912b5021741ff58785fa7b020094ec0
+ size 344860756
src/api/models/acc_model_thresh.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3034a1e953618280465b52b4104184b577e783afdf6231add9b96d119e12addf
+ size 216
src/api/models/speed_model_ast.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e529b7b85881d249f455b5386cdb5306915ad34cd5fc5fafeca35fc965573637
+ size 22573905
src/api/models/speed_model_thresh.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:56838178f12bccc05cf5ffc92a7ff570a70d3a42f3f87c977ad8c9ae0f4a3359
+ size 216
src/api/test_files/test.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:60f854cc407877512a3e68a286cfd26e95dc2f0a4e76ba313fbb3e21ddf2d2f9
+ size 3492764
src/frontend/.streamlit/config.toml ADDED
@@ -0,0 +1,10 @@
+ [theme]
+ base = "dark"
+ primaryColor = "#FFFFFF"
+ backgroundColor = "#212121"
+ secondaryBackgroundColor = "#757575"
+ textColor = "#FFFFFF"
+ font = "sans serif"
+
+ [browser]
+ gatherUsageStats = false
src/frontend/__init__.py ADDED
File without changes
src/frontend/ui.py ADDED
@@ -0,0 +1,97 @@
+ import json
+
+ import streamlit as st
+ from ui_backend import (
+     check_for_api,
+     cut_audio_file,
+     display_predictions,
+     load_audio,
+     predict_multiple,
+     predict_single,
+ )
+
+
+ def main():
+     # Page settings
+     st.set_page_config(
+         page_title="Music Instrument Recognition", page_icon="🎸", layout="wide", initial_sidebar_state="collapsed"
+     )
+
+     # Sidebar
+     with st.sidebar:
+         st.title("⚙️ Settings")
+         selected_model = st.selectbox(
+             "Select Model",
+             ("Accuracy", "Speed"),
+             index=0,
+             help="Select a slower but more accurate model or a faster but less accurate model",
+         )
+
+     # Main title
+     st.markdown(
+         "<h1 style='text-align: center; color: #FFFFFF; font-size: 3rem;'>Instrument Recognition 🎶</h1>",
+         unsafe_allow_html=True,
+     )
+
+     # Upload widget
+     audio_file = load_audio()
+
+     # Send a health check request to the API in a loop until it is running
+     api_running = check_for_api(10)
+
+     # Enable or disable a button based on API status
+     predict_valid = False
+     cut_valid = False
+
+     if api_running:
+         st.info("API is running", icon="🤖")
+
+     if audio_file:
+         num_files = len(audio_file)
+         st.write(f"Number of uploaded files: {num_files}")
+         predict_valid = True
+         if len(audio_file) > 1:
+             cut_valid = False
+         else:
+             audio_file = audio_file[0]
+             cut_valid = True
+             name = audio_file.name
+
+     if cut_valid:
+         cut_audio = st.checkbox(
+             "✂️ Cut duration",
+             disabled=not predict_valid,
+             help="Cut a long audio file. Model works best if audio is around 15 seconds",
+         )
+
+         if cut_audio:
+             audio_file = cut_audio_file(audio_file, name)
+
+     result = st.button("Predict", disabled=not predict_valid, help="Send the audio to API to get a prediction")
+
+     if result:
+         predictions = {}
+         if isinstance(audio_file, list):
+             predictions = predict_multiple(audio_file, selected_model)
+
+         else:
+             predictions = predict_single(audio_file, name, selected_model)
+
+         # Sort the dictionary alphabetically by key
+         sorted_predictions = dict(sorted(predictions.items()))
+
+         # Convert the sorted dictionary to a JSON string
+         json_string = json.dumps(sorted_predictions)
+         st.download_button(
+             label="Download JSON",
+             file_name="predictions.json",
+             mime="application/json",
+             data=json_string,
+             help="Download the predictions in JSON format",
+         )
+
+         display_predictions(sorted_predictions)
+
+
+ if __name__ == "__main__":
+     main()
src/frontend/ui_backend.py ADDED
@@ -0,0 +1,254 @@
+ import io
+ import os
+ import time
+ from json import JSONDecodeError
+ import math
+
+ import requests
+ import soundfile as sf
+ import streamlit as st
+
+ if os.environ.get("IS_DOCKER", False):
+     backend = "http://api:7860"
+ else:
+     backend = "http://0.0.0.0:7860"
+
+ INSTRUMENTS = {
+     "tru": "Trumpet",
+     "sax": "Saxophone",
+     "vio": "Violin",
+     "gac": "Acoustic Guitar",
+     "org": "Organ",
+     "cla": "Clarinet",
+     "flu": "Flute",
+     "voi": "Voice",
+     "gel": "Electric Guitar",
+     "cel": "Cello",
+     "pia": "Piano",
+ }
+
+
+ def load_audio():
+     """
+     Upload WAV audio files and display the first one in the Streamlit app.
+
+     :return: The list of uploaded audio files, or None if no file was uploaded.
+     :rtype: Optional[List[UploadedFile]]
+     """
+
+     audio_file = st.file_uploader(label="Upload audio file", type="wav", accept_multiple_files=True)
+     if len(audio_file) > 0:
+         st.audio(audio_file[0])
+         return audio_file
+     else:
+         return None
+
+
+ @st.cache_data(show_spinner=False)
+ def check_for_api(max_tries: int):
+     """
+     Check if the API is running by making a health check request.
+
+     :param max_tries: The maximum number of attempts to check the API's health.
+     :type max_tries: int
+     :return: True if the API is running, False otherwise.
+     :rtype: bool
+     """
+     trial_count = 0
+
+     with st.spinner("Waiting for API..."):
+         while trial_count <= max_tries:
+             try:
+                 response = health_check()
+                 if response:
+                     return True
+             except requests.exceptions.ConnectionError:
+                 trial_count += 1
+                 # Handle connection error, e.g. API not yet running
+                 time.sleep(5)  # Sleep for 5 seconds before retrying
+         st.error("API is not running. Please refresh the page to try again.", icon="🚨")
+         st.stop()
+
+
+ def cut_audio_file(audio_file, name):
+     """
+     Cut an audio file and return the cut audio data as a tuple.
+
+     :param audio_file: The uploaded audio file to be cut.
+     :type audio_file: UploadedFile
+     :param name: The name of the audio file to be cut.
+     :type name: str
+     :raises RuntimeError: If the audio file cannot be read.
+     :return: A tuple containing the name and the cut audio data as a BytesIO object.
+     :rtype: tuple
+     """
+     try:
+         audio_data, sample_rate = sf.read(audio_file)
+     except RuntimeError as e:
+         raise e
+
+     # Display audio duration
+     duration = round(len(audio_data) / sample_rate, 2)
+     st.info(f"Audio Duration: {duration} seconds")
+
+     # Get start and end time for cutting
+     start_time = st.number_input("Start Time (seconds)", min_value=0.0, max_value=duration - 1, step=0.1)
+     end_time = st.number_input("End Time (seconds)", min_value=start_time, value=duration, max_value=duration, step=0.1)
+
+     # Convert start and end time to sample indices
+     start_sample = int(start_time * sample_rate)
+     end_sample = int(end_time * sample_rate)
+
+     # Cut audio
+     cut_audio_data = audio_data[start_sample:end_sample]
+
+     # Create a temporary in-memory file for cut audio
+     audio_file = io.BytesIO()
+     sf.write(audio_file, cut_audio_data, sample_rate, format="wav")
+
+     # Display cut audio
+     st.audio(audio_file, format="audio/wav")
+     audio_file = (name, audio_file)
+
+     return audio_file
+
+
+ def display_predictions(predictions: dict):
+     """
+     Display the predictions using instrument names instead of codes.
+
+     :param predictions: A dictionary containing the filenames and instruments detected in them.
+     :type predictions: dict
+     """
+
+     # Display the results using instrument names instead of codes
+     for filename, instruments in predictions.items():
+         st.subheader(filename)
+
+         if isinstance(instruments, str):
+             st.write(instruments)
+
+         else:
+             with st.container():
+                 col1, col2 = st.columns([1, 3])
+                 present_instruments = [
+                     INSTRUMENTS[instrument_code] for instrument_code, presence in instruments.items() if presence
+                 ]
+                 if present_instruments:
+                     for instrument_name in present_instruments:
+                         with col1:
+                             st.write(instrument_name)
+                         with col2:
+                             st.write("✔️")
+                 else:
+                     st.write("No instruments found in this file.")
+
+
+ def health_check():
+     """
+     Sends a health check request to the API and checks if it's running.
+
+     :return: Returns True if the API is running, else False.
+     :rtype: bool
+     """
+
+     # Send a health check request to the API
+     response = requests.get(f"{backend}/health-check", timeout=100)
+
+     # Check if the API is running
+     if response.status_code == 200:
+         return True
+     else:
+         return False
+
+
+ def predict(data, model_name):
+     """
+     Sends a POST request to the API with the provided data and model name.
+
+     :param data: The audio data to be used for prediction.
+     :type data: bytes
+     :param model_name: The name of the model to be used for prediction.
+     :type model_name: str
+     :return: The response from the API.
+     :rtype: requests.Response
+     """
+
+     file = {"file": data}
+     request_data = {"model_name": model_name}
+
+     response = requests.post(
+         f"{backend}/predict", params=request_data, files=file, timeout=300
+     )  # Replace with your API endpoint URL
+
+     return response
+
+
+ @st.cache_data(show_spinner=False)
+ def predict_single(audio_file, name, selected_model):
+     """
+     Predicts the instruments in a single audio file using the selected model.
+
+     :param audio_file: The audio file to be used for prediction.
+     :type audio_file: bytes
+     :param name: The name of the audio file.
+     :type name: str
+     :param selected_model: The name of the selected model.
+     :type selected_model: str
+     :return: A dictionary containing the predicted instruments for the audio file.
+     :rtype: dict
+     """
+
+     predictions = {}
+
+     with st.spinner("Predicting instruments..."):
+         response = predict(audio_file, selected_model)
+
+         if response.status_code == 200:
+             prediction = response.json()["prediction"]
+             predictions[name] = prediction.get(name, "Error making prediction")
+         else:
+             st.write(response)
+             try:
+                 st.json(response.json())
+             except JSONDecodeError:
+                 st.error(response.text)
+             st.stop()
+     return predictions
+
+
+ @st.cache_data(show_spinner=False)
+ def predict_multiple(audio_files, selected_model):
+     """
+     Generates predictions for multiple audio files using the selected model.
+
+     :param audio_files: A list of audio files to make predictions on.
+     :type audio_files: List[UploadedFile]
+     :param selected_model: The model to use for making predictions.
+     :type selected_model: str
+     :return: A dictionary where the keys are the names of the audio files and the values are the predicted labels.
+     :rtype: Dict[str, str]
+     """
+
+     predictions = {}
+     progress_text = "Getting predictions for all files. Please wait."
+     progress_bar = st.empty()
+     progress_bar.progress(0, text=progress_text)
+
+     num_files = len(audio_files)
+
+     for i, file in enumerate(audio_files):
+         name = file.name
+         response = predict(file, selected_model)
+         if response.status_code == 200:
+             prediction = response.json()["prediction"]
+             predictions[name] = prediction[name]
+             progress_bar.progress((i + 1) / num_files, text=progress_text)
+         else:
+             predictions[name] = "Error making prediction."
+     progress_bar.empty()
+     return predictions
+
+
+ if __name__ == "__main__":
+     pass
src/modeling/__init__.py ADDED
@@ -0,0 +1,2 @@
+ from src.modeling.models import ASTPretrained, StudentAST
+ from src.modeling.transforms import FeatureExtractor, PreprocessPipeline
src/modeling/dataset.py ADDED
@@ -0,0 +1,162 @@
+ from pathlib import Path
+ from typing import List, Optional, Tuple, Type, Union
+
+ import numpy as np
+ import torch
+ from torch.nn.utils.rnn import pad_sequence
+ from torch.utils.data import DataLoader, Dataset
+ from torchvision.transforms import Compose
+
+ import modeling.transforms as transform_module
+ from modeling.transforms import (
+     LabelsFromTxt,
+     OneHotEncode,
+     ParentMultilabel,
+     Preprocess,
+     Transform,
+ )
+ from modeling.utils import CLASSES, get_wav_files, init_obj, init_transforms
+
+
+ class IRMASDataset(Dataset):
+     """Dataset class for IRMAS dataset.
+
+     :param audio_dir: Directory containing the audio files
+     :type audio_dir: Union[str, Path]
+     :param preprocess: Preprocessing method to apply to the audio files
+     :type preprocess: Type[Preprocess]
+     :param signal_augments: Signal augmentation method to apply to the audio files, defaults to None
+     :type signal_augments: Optional[Union[Type[Compose], Type[Transform]]], optional
+     :param transforms: Transform method to apply to the audio files, defaults to None
+     :type transforms: Optional[Union[Type[Compose], Type[Transform]]], optional
+     :param spec_augments: Spectrogram augmentation method to apply to the audio files, defaults to None
+     :type spec_augments: Optional[Union[Type[Compose], Type[Transform]]], optional
+     :param subset: Subset of the data to load (train, valid, or test), defaults to "train"
+     :type subset: str, optional
+     :raises AssertionError: Raises an assertion error if subset is not train, valid or test
+     :raises OSError: Raises an OS error if test_songs.txt is not found in the data folder
+     :return: A tuple of the preprocessed audio signal and the corresponding one-hot encoded label
+     :rtype: Tuple[Tensor, Tensor]
+     """
+
+     def __init__(
+         self,
+         audio_dir: Union[str, Path],
+         preprocess: Type[Preprocess],
+         signal_augments: Optional[Union[Type[Compose], Type[Transform]]] = None,
+         transforms: Optional[Union[Type[Compose], Type[Transform]]] = None,
+         spec_augments: Optional[Union[Type[Compose], Type[Transform]]] = None,
+         subset: str = "train",
+     ):
+         self.files = get_wav_files(audio_dir)
+         assert subset in ["train", "valid", "test"], "Subset can only be train, valid or test"
+         self.subset = subset
+
+         if self.subset != "train":
+             try:
+                 test_songs = np.genfromtxt("../data/test_songs.txt", dtype=str, ndmin=1, delimiter="\n")
+             except OSError as e:
+                 print(f"Error: {e}")
+                 print("test_songs.txt not found in data/. Please generate a split before training")
+                 raise e
+
+             if self.subset == "valid":
+                 self.files = [file for file in self.files if Path(file).stem not in test_songs]
+             if self.subset == "test":
+                 self.files = [file for file in self.files if Path(file).stem in test_songs]
+
+         self.preprocess = preprocess
+         self.transforms = transforms
+         self.signal_augments = signal_augments
+         self.spec_augments = spec_augments
+
+     def __len__(self):
+         """Return the length of the dataset.
+
+         :return: The length of the dataset
+         :rtype: int
+         """
+
+         return len(self.files)
+
+     def __getitem__(self, index):
+         """Get an item from the dataset.
+
+         :param index: The index of the item to get
+         :type index: int
+         :return: A tuple of the preprocessed audio signal and the corresponding one-hot encoded label
+         :rtype: Tuple[Tensor, Tensor]
+         """
+
+         sample_path = self.files[index]
+         signal = self.preprocess(sample_path)
+
+         if self.subset == "train":
+             target_transforms = Compose([ParentMultilabel(sep="-"), OneHotEncode(CLASSES)])
+         else:
+             target_transforms = Compose([LabelsFromTxt(), OneHotEncode(CLASSES)])
+
+         label = target_transforms(sample_path)
+
+         if self.signal_augments is not None and self.subset == "train":
+             signal = self.signal_augments(signal)
+
+         if self.transforms is not None:
+             signal = self.transforms(signal)
+
+         if self.spec_augments is not None and self.subset == "train":
+             signal = self.spec_augments(signal)
+
+         return signal, label.float()
+
+
+ def collate_fn(data: List[Tuple[torch.Tensor, torch.Tensor]]):
+     """
+     Function to collate a batch of audio signals and their corresponding labels.
+
+     :param data: A list of tuples containing the audio signals and their corresponding labels.
+     :type data: List[Tuple[torch.Tensor, torch.Tensor]]
+
+     :return: A tuple containing the batch of audio signals and their corresponding labels.
+     :rtype: Tuple[torch.Tensor, torch.Tensor]
+     """
+
+     features, labels = zip(*data)
+     features = [item.squeeze().T for item in features]
+     # Pads items to same length if they're not
+     features = pad_sequence(features, batch_first=True)
+     labels = torch.stack(labels)
+
+     return features, labels
+
+
+ def get_loader(config: dict, subset: str):
+     """
+     Function to create a PyTorch DataLoader for a given subset of the IRMAS dataset.
+
+     :param config: A configuration object.
+     :type config: Any
+     :param subset: The subset of the dataset to use. Can be "train" or "valid".
+     :type subset: str
+
+     :return: A PyTorch DataLoader for the specified subset of the dataset.
+     :rtype: torch.utils.data.DataLoader
+     """
+
+     dst = IRMASDataset(
+         config.train_dir if subset == "train" else config.valid_dir,
+         preprocess=init_obj(config.preprocess, transform_module),
+         transforms=init_obj(config.transforms, transform_module),
+         signal_augments=init_transforms(config.signal_augments, transform_module),
+         spec_augments=init_transforms(config.spec_augments, transform_module),
+         subset=subset,
+     )
+
+     return DataLoader(
+         dst,
+         batch_size=config.batch_size,
+         shuffle=True if subset == "train" else False,
+         pin_memory=True if torch.cuda.is_available() else False,
+         num_workers=torch.get_num_threads() - 1,
+         collate_fn=collate_fn,
+     )
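
A minimal sketch (not part of the commit) of what `collate_fn` does with variable-length spectrograms, using made-up shapes and only `torch`, so it can be run in isolation:

```python
import torch
from torch.nn.utils.rnn import pad_sequence

# Two (spectrogram, one-hot label) pairs with different numbers of time frames
batch = [
    (torch.randn(1, 128, 300), torch.zeros(11)),  # 300 frames
    (torch.randn(1, 128, 480), torch.ones(11)),   # 480 frames
]

features, labels = zip(*batch)
features = [item.squeeze().T for item in features]   # each becomes (time, n_mels)
features = pad_sequence(features, batch_first=True)  # zero-pad to the longest item
labels = torch.stack(labels)

print(features.shape)  # torch.Size([2, 480, 128])
print(labels.shape)    # torch.Size([2, 11])
```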
src/modeling/learner.py ADDED
@@ -0,0 +1,333 @@
+ from abc import ABC, abstractmethod
+ from typing import Tuple
+
+ import numpy as np
+ import torch
+ import torch.nn as nn
+ import torch.optim as optim
+ import wandb
+ from torch.utils.data import DataLoader
+ from tqdm.autonotebook import tqdm
+
+ import modeling.loss as loss_module
+ import modeling.metrics as metrics_module
+ from modeling.loss import HardDistillationLoss
+ from modeling.models import freeze, layerwise_lr_decay
+ from modeling.utils import init_obj
+
+
+ class BaseLearner(ABC):
+     """
+     Abstract base class for a learner.
+
+     :param train_dl: DataLoader for training data
+     :type train_dl: Type[DataLoader]
+     :param valid_dl: DataLoader for validation data
+     :type valid_dl: Type[DataLoader]
+     :param model: Model to be trained
+     :type model: Type[nn.Module]
+     :param config: Configuration object
+     :type config: Any
+     """
+
+     def __init__(self, train_dl: DataLoader, valid_dl: DataLoader, model: nn.Module, config):
+         self.train_dl = train_dl
+         self.valid_dl = valid_dl
+         self.device = "cuda" if torch.cuda.is_available() else "cpu"
+         self.model = model.to(self.device)
+         self.config = config
+
+     @abstractmethod
+     def fit(
+         self,
+     ):
+         """Abstract method for fitting the model."""
+
+         pass
+
+     @abstractmethod
+     def _train_epoch(
+         self,
+     ):
+         """Abstract method for training the model for one epoch."""
+         pass
+
+     @abstractmethod
+     def _test_epoch(
+         self,
+     ):
+         """Abstract method for testing the model for one epoch."""
+         pass
+
+
+ class Learner(BaseLearner):
+     def __init__(self, train_dl: DataLoader, valid_dl: DataLoader, model: nn.Module, config):
+         """
+         A class that inherits from the BaseLearner class and represents a learner object.
+
+         :param train_dl: DataLoader for training data
+         :type train_dl: DataLoader
+         :param valid_dl: DataLoader for validation data
+         :type valid_dl: DataLoader
+         :param model: Model to be trained
+         :type model: nn.Module
+         :param config: Configuration object
+         :type config: Any
+         """
+
+         super().__init__(train_dl, valid_dl, model, config)
+
+         self.model = torch.nn.DataParallel(module=self.model, device_ids=list(range(config.num_gpus)))
+         self.loss_fn = init_obj(self.config.loss, loss_module)
+         params = layerwise_lr_decay(self.config, self.model)
+         self.optimizer = init_obj(self.config.optimizer, optim, params)
+         self.scheduler = init_obj(
+             self.config.scheduler,
+             optim.lr_scheduler,
+             self.optimizer,
+             max_lr=[param["lr"] for param in params],
+             epochs=self.config.epochs,
+             steps_per_epoch=int(np.ceil(len(train_dl) / self.config.num_accum)),
+         )
+
+         self.verbose = self.config.verbose
+         self.metrics = MetricTracker(self.config.metrics, self.verbose)
+         self.scaler = torch.cuda.amp.GradScaler()
+
+         self.train_step = 0
+         self.test_step = 0
+
+     def fit(self, model_name: str = "model"):
+         """
+         Method to train the model.
+
+         :param model_name: Name of the model to be saved, defaults to "model"
+         :type model_name: str, optional
+         """
+
+         loop = tqdm(range(self.config.epochs), leave=False)
+
+         for epoch in loop:
+             train_loss = self._train_epoch()
+             val_loss = self._test_epoch()
+
+             wandb.log({"train_loss": train_loss, "val_loss": val_loss, "epoch": epoch + 1})
+
+             if self.verbose:
+                 print(f"| EPOCH: {epoch+1} | train_loss: {train_loss:.3f} | val_loss: {val_loss:.3f} |\n")
+                 self.metrics.display()
+
+             if self.config.save_last_checkpoint:
+                 torch.save(self.model.module.state_dict(), f"{model_name}.pth")
+
+     def _train_epoch(self, distill: bool = False):
+         """
+         Method to perform one epoch of training.
+
+         :param distill: Flag to indicate if knowledge distillation is used, defaults to False
+         :type distill: bool, optional
+         :return: Average training loss for the epoch
+         :rtype: float
+         """
+
+         if distill:
+             print("Distilling knowledge...", flush=True)
+
+         loop = tqdm(self.train_dl, leave=False)
+         self.model.train()
+
+         num_batches = len(self.train_dl)
+         train_loss = 0
+
+         for idx, (xb, yb) in enumerate(loop):
+             xb = xb.to(self.device)
+             yb = yb.to(self.device)
+
+             # forward
+             with torch.autocast(device_type=self.device, dtype=torch.float16, enabled=not distill):
+                 predictions = self.model(xb)
+
+                 if distill:
+                     loss = self.KDloss_fn(xb, predictions, yb)
+                 else:
+                     loss = self.loss_fn(predictions, yb)
+
+                 loss /= self.config.num_accum
+
+             # backward
+             self.scaler.scale(loss).backward()
+             wandb.log({f"lr_param_group_{i}": lr for i, lr in enumerate(self.scheduler.get_last_lr())})
+
+             if ((idx + 1) % self.config.num_accum == 0) or (idx + 1 == num_batches):
+                 self.scaler.step(self.optimizer)
+                 self.scaler.update()
+                 self.scheduler.step()
+                 self.optimizer.zero_grad()
+
+             # update loop
+             loop.set_postfix(loss=loss.item())
+             self.train_step += 1
+             wandb.log({"train_loss_per_batch": loss.item(), "train_step": self.train_step})
+             train_loss += loss.item()
+
+             if distill:
+                 if ((idx + 1) % 2500 == 0) and not (idx + 1 == num_batches):
+                     val_loss = self._test_epoch()
+                     wandb.log({"val_loss": val_loss})
+                     self.model.train()
+
+         train_loss /= num_batches
+
+         return train_loss
+
+     def _test_epoch(self):
+         """
+         Method to perform one epoch of validation/testing.
+
+         :return: Average validation/test loss for the epoch
+         :rtype: float
+         """
+
+         loop = tqdm(self.valid_dl, leave=False)
+         self.model.eval()
+
+         num_batches = len(self.valid_dl)
+         preds = []
+         targets = []
+         test_loss = 0
+
+         with torch.no_grad():
+             for xb, yb in loop:
+                 xb, yb = xb.to(self.device), yb.to(self.device)
+                 pred = self.model(xb)
+                 loss = self.loss_fn(pred, yb).item()
+                 self.test_step += 1
+                 wandb.log({"valid_loss_per_batch": loss, "test_step": self.test_step})
+                 test_loss += loss
+
+                 pred = torch.sigmoid(pred)
+                 preds.extend(pred.cpu().numpy())
+                 targets.extend(yb.cpu().numpy())
+
+         preds, targets = np.array(preds), np.array(targets)
+         self.metrics.update(preds, targets)
+         test_loss /= num_batches
+
+         return test_loss
+
+
+ class KDLearner(Learner):
+     """
+     Knowledge Distillation Learner class for training a student model with knowledge distillation.
+
+     :param train_dl: Train data loader
+     :type train_dl: DataLoader
+     :param valid_dl: Validation data loader
+     :type valid_dl: DataLoader
+     :param student_model: Student model to be trained
+     :type student_model: nn.Module
+     :param teacher: Teacher model for knowledge distillation
+     :type teacher: nn.Module
+     :param thresholds: Thresholds for HardDistillationLoss
+     :type thresholds: List[float]
+     :param config: Configuration object for training
+     :type config: Config
+     """
+
+     def __init__(self, train_dl, valid_dl, student_model, teacher, thresholds, config):
+         super().__init__(train_dl, valid_dl, student_model, config)
+
+         self.teacher = nn.DataParallel(freeze(teacher).to(self.device))
+         self.KDloss_fn = HardDistillationLoss(self.teacher, self.loss_fn, thresholds, self.device)
+         self.scaler = torch.cuda.amp.GradScaler(enabled=False)
+
+     def _train_epoch(self):
+         """
+         Method to perform one epoch of training with knowledge distillation.
+
+         :return: Average training loss for the epoch
+         :rtype: float
+         """
+
+         return super()._train_epoch(distill=True)
+
+
+ class MetricTracker:
+     """
+     Metric Tracker class for tracking evaluation metrics during model validation.
+     This class is used to track and display evaluation metrics during model validation.
+     It keeps track of the results of the provided metric functions for each validation batch,
+     and logs them to Weights & Biases using wandb.log(). The display() method can be used
+     to print the tracked metric results, if verbose is set to True during initialization.
+
+     :param metrics: List of metric functions to track
+     :type metrics: List[Callable]
+     :param verbose: Flag to indicate whether to print the results or not, defaults to True
+     :type verbose: bool, optional
+     """
+
+     def __init__(self, metrics, verbose: bool = True):
+         self.metrics_fn = [getattr(metrics_module, metric) for metric in metrics]
+         self.verbose = verbose
+         self.result = None
+
+     def update(self, preds, targets):
+         """
+         Update the metric tracker with the latest predictions and targets.
+
+         :param preds: Model predictions
+         :type preds: torch.Tensor
+         :param targets: Ground truth targets
+         :type targets: torch.Tensor
+         """
+
+         self.result = {metric.__name__: metric(preds, targets) for metric in self.metrics_fn}
+         wandb.log(self.result)
+
+     def display(self):
+         """Display the tracked metric results."""
+
+         for k, v in self.result.items():
+             print(f"{k}: {v:.2f}")
+
+
+ def get_preds(data: DataLoader, model: nn.Module, device: str = "cpu") -> Tuple[np.ndarray, np.ndarray]:
+     """
+     Get predictions and targets from a data loader and a PyTorch model.
+
+     :param data: A PyTorch DataLoader containing the data to predict on.
+     :type data: torch.utils.data.DataLoader
+     :param model: A PyTorch model to use for predictions.
+     :type model: torch.nn.Module
+     :param device: The device to use for predictions (default is "cpu").
+     :type device: str
+     :raises TypeError: If any of the input arguments is of an incorrect type.
+     :return: A tuple containing two NumPy arrays: the predictions and the targets.
+     :rtype: Tuple[numpy.ndarray, numpy.ndarray]
+     """
+
+     if not isinstance(data, DataLoader):
+         raise TypeError("The 'data' argument must be a PyTorch DataLoader.")
+     if not isinstance(model, nn.Module):
+         raise TypeError("The 'model' argument must be a PyTorch model.")
+     if not isinstance(device, str):
+         raise TypeError("The 'device' argument must be a string.")
+
+     loop = tqdm(data, leave=False)
+     model = model.to(device)
+     model.eval()
+
+     preds = []
+     targets = []
+
+     with torch.no_grad():
+         for xb, yb in loop:
+             xb, yb = xb.to(device), yb.to(device)
+             pred = model(xb)
+             pred = torch.sigmoid(pred)
+             preds.extend(pred.cpu().numpy())
+             targets.extend(yb.cpu().numpy())
+
+     preds, targets = np.array(preds), np.array(targets)
+
+     return preds, targets
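
A minimal, self-contained sketch (not part of the commit) of the gradient-accumulation plus mixed-precision pattern used in `Learner._train_epoch`, reduced to a toy linear model and dummy batches; the dtype choice and `num_accum` value here are assumptions for illustration only:

```python
import torch
import torch.nn as nn

device = "cuda" if torch.cuda.is_available() else "cpu"
model = nn.Linear(16, 11).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
loss_fn = nn.BCEWithLogitsLoss()
scaler = torch.cuda.amp.GradScaler(enabled=(device == "cuda"))
num_accum = 4  # step the optimizer once every num_accum batches
amp_dtype = torch.float16 if device == "cuda" else torch.bfloat16

batches = [(torch.randn(8, 16), torch.randint(0, 2, (8, 11)).float()) for _ in range(8)]

for idx, (xb, yb) in enumerate(batches):
    xb, yb = xb.to(device), yb.to(device)
    with torch.autocast(device_type=device, dtype=amp_dtype):
        loss = loss_fn(model(xb), yb) / num_accum  # scale so accumulated grads average out
    scaler.scale(loss).backward()
    if (idx + 1) % num_accum == 0 or (idx + 1) == len(batches):
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()
```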
src/modeling/loss.py ADDED
@@ -0,0 +1,96 @@
+ from functools import partial
+ from typing import Union
+
+ import numpy as np
+ import torch
+ import torch.nn as nn
+ from torchvision.ops import sigmoid_focal_loss
+
+
+ class FocalLoss(nn.Module):
+     """
+     Focal Loss implementation.
+
+     This class defines the Focal Loss, which is a variant of the Binary Cross Entropy (BCE) loss that is
+     designed to address the problem of class imbalance in binary classification tasks.
+     The Focal Loss introduces two hyperparameters, alpha and gamma, to control the balance between easy
+     and hard examples during training.
+
+     :param alpha: The balancing parameter between positive and negative examples. A float value between 0 and 1.
+         If set to -1, no balancing is applied. Default is 0.25.
+     :type alpha: float
+     :param gamma: The focusing parameter to control the emphasis on hard examples. A positive integer. Default is 2.
+     :type gamma: int
+     """
+
+     def __init__(self, alpha: float = 0.25, gamma: int = 2):
+         super().__init__()
+         self.loss_fn = partial(sigmoid_focal_loss, alpha=alpha, gamma=gamma, reduction="mean")
+
+     def forward(self, inputs, targets):
+         """
+         Compute the Focal Loss.
+
+         :param inputs: The predicted inputs from the model.
+         :type inputs: torch.Tensor
+         :param targets: The ground truth targets.
+         :type targets: torch.Tensor
+         :return: The computed Focal Loss.
+         :rtype: torch.Tensor
+         :raises ValueError: If the inputs and targets have different shapes.
+         """
+
+         return self.loss_fn(inputs=inputs, targets=targets)
+
+
+ class HardDistillationLoss(nn.Module):
+     """Hard Distillation Loss implementation.
+
+     This class defines the Hard Distillation Loss, which is used for model distillation,
+     a technique used to transfer knowledge from a large, complex teacher model to a smaller,
+     simpler student model. The Hard Distillation Loss computes the loss by comparing the outputs
+     of the student model and the teacher model using a provided loss function. It also introduces a
+     threshold parameter to convert the teacher model outputs to binary labels for the distillation process.
+
+     :param teacher: The teacher model used for distillation.
+     :type teacher: torch.nn.Module
+     :param loss_fn: The loss function used for computing the distillation loss.
+     :type loss_fn: torch.nn.Module
+     :param threshold: The threshold value used to convert teacher model outputs to binary labels.
+         Can be a list or numpy array of threshold values.
+     :type threshold: Union[list, np.array]
+     :param device: The device to be used for computation. Default is "cuda".
+     :type device: str
+     """
+
+     def __init__(self, teacher: nn.Module, loss_fn: nn.Module, threshold: Union[list, np.array], device: str = "cuda"):
+         super().__init__()
+         self.teacher = teacher
+         self.loss_fn = loss_fn
+         self.threshold = torch.tensor(threshold).to(device)
+
+     def forward(self, inputs, student_outputs, targets):
+         """
+         Compute the Hard Distillation Loss.
+
+         :param inputs: The input data fed to the student model.
+         :type inputs: torch.Tensor
+         :param student_outputs: The output predictions from the student model, which consists of
+             both classification and distillation outputs.
+         :type student_outputs: tuple
+         :param targets: The ground truth targets.
+         :type targets: torch.Tensor
+         :return: The computed Hard Distillation Loss.
+         :rtype: torch.Tensor
+         :raises ValueError: If the inputs and targets have different shapes.
+         """
+
+         outputs_cls, outputs_dist = student_outputs
+
+         teacher_outputs = torch.sigmoid(self.teacher(inputs))
+         teacher_labels = (teacher_outputs > self.threshold).float()
+
+         base_loss = self.loss_fn(outputs_cls, targets)
+         teacher_loss = self.loss_fn(outputs_dist, teacher_labels)
+
+         return (base_loss + teacher_loss) / 2
+
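
A minimal sketch (not part of the commit) of the hard-distillation computation with tiny dummy tensors on CPU; the stand-in teacher, thresholds, and shapes are assumptions made only so the arithmetic can be run in isolation:

```python
import torch
import torch.nn as nn

n_classes = 11
teacher = nn.Linear(8, n_classes)            # stand-in for the frozen teacher model
loss_fn = nn.BCEWithLogitsLoss()
thresholds = torch.full((n_classes,), 0.5)   # stand-in for the tuned per-class thresholds

inputs = torch.randn(4, 8)
targets = torch.randint(0, 2, (4, n_classes)).float()
outputs_cls = torch.randn(4, n_classes)      # student classification head
outputs_dist = torch.randn(4, n_classes)     # student distillation head

# Teacher probabilities are binarised with the per-class thresholds...
teacher_labels = (torch.sigmoid(teacher(inputs)) > thresholds).float()

# ...and the final loss averages the ground-truth term and the teacher term.
loss = (loss_fn(outputs_cls, targets) + loss_fn(outputs_dist, teacher_labels)) / 2
print(loss.item())
```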
src/modeling/metrics.py ADDED
@@ -0,0 +1,179 @@
+ import numpy as np
+ from sklearn.metrics import (
+     accuracy_score,
+     average_precision_score,
+     f1_score,
+     hamming_loss,
+     precision_recall_curve,
+     zero_one_loss,
+ )
+
+
+ def hamming_score(preds, targets, thresholds: np.array = None):
+     """Compute Hamming Score.
+
+     This function computes the Hamming Score, a performance metric used for multi-label classification tasks.
+     The Hamming Score measures the similarity between the predicted labels and the ground truth labels, where
+     a higher score indicates better prediction accuracy.
+
+     :param preds: The predicted labels.
+     :type preds: numpy array
+     :param targets: The ground truth labels.
+     :type targets: numpy array
+     :return: The computed Hamming Score.
+     :rtype: float
+     """
+     if thresholds is None:
+         thresholds = optimize_accuracy(preds, targets)
+
+     preds = (preds > thresholds).astype(int)
+     return 1 - hamming_loss(targets, preds)
+
+
+ def zero_one_score(preds, targets, thresholds: np.array = None):
+     """
+     Compute Zero-One Score.
+
+     This function computes the Zero-One Score, a performance metric used for
+     multi-label classification tasks. The Zero-One Score measures the similarity
+     between the predicted labels and the ground truth labels, where a higher score
+     indicates better prediction accuracy. The Zero-One Score ranges from 0 to 1, with 1 being a perfect match.
+
+     :param preds: The predicted labels.
+     :type preds: numpy array
+     :param targets: The ground truth labels.
+     :type targets: numpy array
+     :return: The computed Zero-One Score.
+     :rtype: float
+     """
+
+     if thresholds is None:
+         thresholds = optimize_accuracy(preds, targets)
+
+     preds = (preds > thresholds).astype(int)
+     return 1 - zero_one_loss(targets, preds, normalize=True)
+
+
+ def mean_f1_score(preds, targets, thresholds: np.array = None):
+     """Compute Mean F1 Score.
+
+     This function computes the Mean F1 Score, a performance metric used for multi-label
+     classification tasks. The Mean F1 Score measures the trade-off between precision and recall,
+     where a higher score indicates better prediction accuracy. The Mean F1 Score ranges from
+     0 to 1, with 1 being a perfect match.
+
+     :param preds: The predicted labels.
+     :type preds: numpy array
+     :param targets: The ground truth labels.
+     :type targets: numpy array
+     :return: The computed Mean F1 Score.
+     :rtype: float
+     """
+     if thresholds is None:
+         thresholds = optimize_f1_score(preds, targets)
+
+     preds = (preds > thresholds).astype(int)
+     return f1_score(targets, preds, average="samples", zero_division=0)
+
+
+ def per_instr_f1_score(preds, targets, thresholds: np.array = None):
+     """Compute Per-Instrument F1 Score.
+
+     This function computes the F1 Score for each instrument separately in a multi-label
+     classification task. The Per-Instrument F1 Score measures the prediction accuracy for
+     each instrument class independently. The F1 Score is the harmonic mean of precision and recall,
+     where a higher score indicates better prediction accuracy. The Per-Instrument F1 Score ranges
+     from 0 to 1, with 1 being a perfect match.
+
+     :param preds: The predicted labels.
+     :type preds: numpy array
+     :param targets: The ground truth labels.
+     :type targets: numpy array
+     :return: The computed Per-Instrument F1 Score.
+     :rtype: numpy array
+     """
+
+     if thresholds is None:
+         thresholds = optimize_f1_score(preds, targets)
+
+     preds = (preds > thresholds).astype(int)
+     return f1_score(targets, preds, average=None, zero_division=0)
+
+
+ def mean_average_precision(preds, targets):
+     """
+     Compute mean Average Precision (mAP).
+
+     This function computes the mean Average Precision (mAP), a performance metric used
+     for multi-label classification tasks. The mAP measures the average precision across
+     all classes, taking into account the precision-recall trade-off, where a higher score
+     indicates better prediction accuracy.
+
+     :param preds: The predicted probabilities or scores.
+     :type preds: numpy array
+     :param targets: The ground truth labels.
+     :type targets: numpy array
+     :return: The computed mAP score.
+     :rtype: float
+     """
+
+     return average_precision_score(targets, preds, average="samples")
+
+
+ def optimize_f1_score(preds, targets):
+     """
+     Optimize Threshold.
+
+     This function optimizes the threshold for binary classification based on the predicted probabilities
+     and ground truth labels. It computes the precision, recall, and F1 Score for each class separately
+     using the precision_recall_curve function from sklearn.metrics module. It then selects the threshold
+     that maximizes the F1 Score for each class.
+
+     :param preds: The predicted probabilities.
+     :type preds: numpy array
+     :param targets: The ground truth labels.
+     :type targets: numpy array
+     :return: The optimized thresholds for binary classification.
+     :rtype: numpy array
+     """
+
+     label_thresholds = np.empty(preds.shape[1])
+
+     for i in range(preds.shape[1]):
+         precision, recall, thresholds = precision_recall_curve(targets[:, i], preds[:, i])
+         fscore = (2 * precision * recall) / (precision + recall)
+         ix = np.argmax(fscore)
+         best_thresh = thresholds[ix]
+         label_thresholds[i] = best_thresh
+
+     return label_thresholds
+
+
+ def optimize_accuracy(preds, targets):
+     """
+     Determine the optimal threshold for each label, based on the predicted probabilities and the true targets,
+     in order to maximize the accuracy of the predictions.
+
+     :param preds: A 2D NumPy array containing the predicted probabilities for each label.
+     :type preds: numpy.ndarray
+     :param targets: A 2D NumPy array containing the true binary targets for each label.
+     :type targets: numpy.ndarray
+     :raises ValueError: If the input arrays are not 2D arrays or have incompatible shapes.
+     :return: A 1D NumPy array containing the optimal threshold for each label.
+     :rtype: numpy.ndarray
+     """
+
+     # Vary the threshold for each label and calculate accuracy for each threshold
+     thresholds = np.arange(0.0001, 1, 0.0001)
+     best_thresholds = np.empty(preds.shape[1])
+     for i in range(preds.shape[1]):
+         accuracies = []
+         for th in thresholds:
+             y_pred = (preds[:, i] >= th).astype(int)  # Convert probabilities to binary predictions using the threshold
+             acc = accuracy_score(targets[:, i], y_pred)
+             accuracies.append(acc)
+         # Find the threshold that gives the highest accuracy for this label
+         best_idx = np.argmax(accuracies)
+         best_thresholds[i] = thresholds[best_idx]
+
+     return best_thresholds
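
A minimal sketch (not part of the commit) of the per-class F1 threshold search that `optimize_f1_score` performs, shown for a single label on synthetic scores; the small epsilon and the index guard on the last precision/recall point are additions for numerical safety in this illustration:

```python
import numpy as np
from sklearn.metrics import precision_recall_curve

rng = np.random.default_rng(0)
targets = rng.integers(0, 2, size=200)                           # ground truth for one instrument
preds = np.clip(targets * 0.6 + rng.random(200) * 0.5, 0, 1)     # noisy scores

precision, recall, thresholds = precision_recall_curve(targets, preds)
fscore = (2 * precision * recall) / (precision + recall + 1e-12)  # epsilon avoids 0/0
best_thresh = thresholds[np.argmax(fscore[:-1])]                  # last P/R point has no threshold
print(f"best threshold: {best_thresh:.3f}, best F1: {fscore[:-1].max():.3f}")
```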
src/modeling/models.py ADDED
@@ -0,0 +1,313 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from warnings import warn
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F
6
+ from transformers import ASTConfig, ASTModel
7
+
8
+
9
+ class StudentAST(nn.Module):
10
+ """
11
+ A student model for audio classification using the AST architecture.
12
+
13
+ :param n_classes: The number of classes to classify.
14
+ :type n_classes: int
15
+ :param hidden_size: The number of units in the hidden layers, defaults to 384.
16
+ :type hidden_size: int, optional
17
+ :param num_heads: The number of attention heads to use, defaults to 6.
18
+ :type num_heads: int, optional
19
+ """
20
+
21
+ def __init__(self, n_classes: int, hidden_size: int = 384, num_heads: int = 6):
22
+ super().__init__()
23
+
24
+ config = ASTConfig(hidden_size=hidden_size, num_attention_heads=num_heads, intermediate_size=hidden_size * 4)
25
+ self.base_model = ASTModel(config=config)
26
+ self.classifier = StudentClassificationHead(hidden_size, n_classes)
27
+
28
+ def forward(self, x: torch.Tensor):
29
+ """
30
+ Forward pass of the student model.
31
+
32
+ :param x: The input tensor of shape [batch_size, sequence_length, input_dim].
33
+ :type x: torch.Tensor
34
+ :return: The output tensor of shape [batch_size, n_classes].
35
+ :rtype: torch.Tensor
36
+ """
37
+
38
+ x = self.base_model(x)[0]
39
+ x = self.classifier(x)
40
+ return x
41
+
42
+
43
+ class StudentClassificationHead(nn.Module):
44
+ """
45
+ A classification head for the student model.
46
+
47
+ :param emb_size: The size of the embedding.
48
+ :type emb_size: int
49
+ :param n_classes: The number of classes to classify.
50
+ :type n_classes: int
51
+ """
52
+
53
+ def __init__(self, emb_size: int, n_classes: int):
54
+ super().__init__()
55
+
56
+ self.cls_head = nn.Linear(emb_size, n_classes)
57
+ self.dist_head = nn.Linear(emb_size, n_classes)
58
+
59
+ def forward(self, x: torch.Tensor):
60
+ """
61
+ Forward pass of the classification head.
62
+
63
+ :param x: The input tensor of shape [batch_size, emb_size*2].
64
+ :type x: torch.Tensor
65
+ :return: The output tensor of shape [batch_size, n_classes].
66
+ :rtype: torch.Tensor
67
+ """
68
+
69
+ x_cls, x_dist = x[:, 0], x[:, 1]
70
+ x_cls_head = self.cls_head(x_cls)
71
+ x_dist_head = self.dist_head(x_dist)
72
+
73
+ if self.training:
74
+ x = x_cls_head, x_dist_head
75
+ else:
76
+ x = (x_cls_head + x_dist_head) / 2
77
+
78
+ return x
79
+
80
+
81
+ class ASTPretrained(nn.Module):
82
+ """
83
+ This class implements a PyTorch module wrapping a pre-trained Audio Spectrogram Transformer (AST),
84
+ using MIT's checkpoint fine-tuned on AudioSet, for audio event classification.
85
+
86
+ :param n_classes: The number of classes for audio event classification.
87
+ :type n_classes: int
88
+ :param dropout: The dropout probability for the fully connected layer, defaults to 0.5.
89
+ :type dropout: float, optional
90
+ :raises ValueError: If n_classes is not positive.
91
+ :raises TypeError: If dropout is not a float or is not between 0 and 1.
92
+ :return: The output tensor of shape [batch_size, n_classes] containing the logits for each class.
93
+ :rtype: torch.Tensor
94
+ """
95
+
96
+ def __init__(self, n_classes: int, download_weights: bool = True, freeze_body: bool = False, dropout: float = 0.5):
97
+ super().__init__()
98
+
99
+ if download_weights:
100
+ self.base_model = ASTModel.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")
101
+ else:
102
+ config = ASTConfig()
103
+ self.base_model = ASTModel(config=config)
104
+
105
+ if freeze_body:
106
+ self.base_model = freeze(self.base_model)
107
+
108
+ fc_in = self.base_model.config.hidden_size
109
+
110
+ self.classifier = nn.Sequential(
111
+ nn.LayerNorm((fc_in,), eps=1e-12), nn.Dropout(p=dropout), nn.Linear(fc_in, n_classes)
112
+ )
113
+
114
+ def forward(self, x):
115
+ """Passes the input tensor through the pre-trained Audio Set Transformer (AST) model
116
+ followed by a fully connected layer.
117
+
118
+ :param x: The input tensor of shape [batch_size, seq_len, num_features].
119
+ :type x: torch.Tensor
120
+ :return: The output tensor of shape [batch_size, n_classes] containing the logits for each class.
121
+ :rtype: torch.Tensor
122
+ :raises ValueError: If the shape of x is not [batch_size, seq_len, num_features].
123
+ """
124
+
125
+ x = self.base_model(x)[1]
126
+ x = self.classifier(x)
127
+ return x
128
+
129
+
130
+ def layerwise_lr_decay(config, model: ASTModel):
131
+ """
132
+ LLRD (Layer-wise Learning Rate Decay) function computes the learning rate for each layer in a deep neural network
133
+ using a specific decay rate and a base learning rate for the optimizer.
134
+
135
+ :param config: A configuration object that contains the parameters required for LLRD.
136
+ :type config: Any
137
+ :param model: A PyTorch neural network model.
138
+ :type model: ASTModel
139
+
140
+ :raises Warning: If the configuration object does not contain the LLRD parameters.
141
+
142
+ :return: A list of optimizer parameter groups (params, weight decay, and learning rate),
143
+ with decay and no-decay groups for each layer, or None if no LLRD config is found.
144
+ :rtype: list or None
145
+ """
146
+
147
+ try:
148
+ config = config.LLRD
149
+ except Exception:
150
+ warn("No LLRD found in config. Learner will use single lr for whole model.")
151
+ return None
152
+
153
+ lr = config["base_lr"]
154
+ weight_decay = config["weight_decay"]
155
+ no_decay = ["bias", "layernorm"]
156
+ body = ["embeddings", "encoder.layer"]
157
+ head_params = [(n, p) for n, p in model.named_parameters() if not any(body_param in n for body_param in body)]
158
+ optimizer_grouped_parameters = [
159
+ {
160
+ "params": [p for n, p in head_params if not any(nd in n for nd in no_decay)],
161
+ "weight_decay": weight_decay,
162
+ "lr": lr,
163
+ },
164
+ {
165
+ "params": [p for n, p in head_params if any(nd in n for nd in no_decay)],
166
+ "weight_decay": 0.0,
167
+ "lr": lr,
168
+ },
169
+ ]
170
+
171
+ # initialize lrs for every layer
172
+ layers = [getattr(model.module, config["body"]).embeddings] + list(
173
+ getattr(model.module, config["body"]).encoder.layer
174
+ )
175
+ layers.reverse()
176
+ for layer in layers:
177
+ lr *= config["lr_decay_rate"]
178
+ optimizer_grouped_parameters += [
179
+ {
180
+ "params": [p for n, p in layer.named_parameters() if not any(nd in n for nd in no_decay)],
181
+ "weight_decay": weight_decay,
182
+ "lr": lr,
183
+ },
184
+ {
185
+ "params": [p for n, p in layer.named_parameters() if any(nd in n for nd in no_decay)],
186
+ "weight_decay": 0.0,
187
+ "lr": lr,
188
+ },
189
+ ]
190
+
191
+ return optimizer_grouped_parameters
192
+
193
+
194
+ def freeze(model: nn.Module):
195
+ """
196
+ Freeze function sets the requires_grad attribute to False for all parameters
197
+ in the given PyTorch neural network model. This is used to freeze the weights of
198
+ the model during training or inference.
199
+
200
+ :param model: A PyTorch neural network model.
201
+ :type model: nn.Module
202
+
203
+ :return: The same model with requires_grad attribute set to False for all parameters.
204
+ :rtype: nn.Module
205
+ """
206
+
207
+ model.eval()
208
+ for param in model.parameters():
209
+ param.requires_grad = False
210
+
211
+ return model
212
+
213
+
214
+ def unfreeze(model: nn.Module):
215
+ """
216
+ Unfreeze the model by setting requires_grad to True for all parameters.
217
+
218
+ :param model: The model to unfreeze.
219
+ :type model: nn.Module
220
+ :return: The unfrozen model.
221
+ :rtype: nn.Module
222
+ """
223
+
224
+ model.train()
225
+ for param in model.parameters():
226
+ param.requires_grad = True
227
+
228
+ return model
229
+
230
+
231
+ def interpolate_params(student: nn.Module, teacher: nn.Module):
232
+ """
233
+ Interpolate parameters between two models. This function scales the parameters of the
234
+ teacher model to match the shape of the corresponding parameters in the student model
235
+ using bilinear interpolation. If the shapes of the parameters in the two models are already the same,
236
+ the parameters are unchanged.
237
+
238
+ :param student: The student model.
239
+ :type student: nn.Module
240
+ :param teacher: The teacher model.
241
+ :type teacher: nn.Module
242
+ :return: A dictionary of interpolated parameters for the student model.
243
+ :rtype: dict
244
+ """
245
+
246
+ new_params = {}
247
+
248
+ # Iterate over the parameters in the first model
249
+ for name, param in teacher.base_model.named_parameters():
250
+ # Scale the parameter using interpolate if its shape is different from that of the second model
251
+ target_param = student.base_model.state_dict()[name]
252
+ if param.shape != target_param.shape:
253
+ squeeze_count = 0
254
+ permuted = False
255
+ while param.ndim < 4:
256
+ param = param.unsqueeze(0)
257
+ squeeze_count += 1
258
+
259
+ if param.shape[0] > 1:
260
+ param = param.permute(1, 2, 3, 0)
261
+ target_param = target_param.permute(1, 2, 3, 0)
262
+ permuted = True
263
+
264
+ if target_param.ndim < 2:
265
+ target_param = target_param.unsqueeze(0)
266
+
267
+ scaled_param = F.interpolate(param, size=(target_param.shape[-2:]), mode="bilinear")
268
+
269
+ while squeeze_count > 0:
270
+ scaled_param = scaled_param.squeeze(0)
271
+ squeeze_count -= 1
272
+
273
+ if permuted:
274
+ scaled_param = scaled_param.permute(-1, 0, 1, 2)
275
+
276
+ else:
277
+ scaled_param = param
278
+ new_params[name] = scaled_param
279
+
280
+ return new_params
281
+
282
+
283
+ def average_model_weights(model_weights_list):
284
+ """
285
+ Compute the average weights of a list of PyTorch models.
286
+
287
+ :param model_weights_list: A list of file paths to PyTorch model weight files.
288
+ :type model_weights_list: List[str]
289
+ :raises ValueError: If the input list is empty.
290
+ :return: A dictionary containing the average weights of the models.
291
+ :rtype: Dict[str, torch.Tensor]
292
+ """
293
+
294
+ if not model_weights_list:
295
+ raise ValueError("The input list cannot be empty.")
296
+
297
+ num_models = len(model_weights_list)
298
+ averaged_weights = {}
299
+
300
+ # Load the first model weights
301
+ state_dict = torch.load(model_weights_list[0])
302
+
303
+ # Iterate through the remaining models and add their weights to the first model's weights
304
+ for i in range(1, num_models):
305
+ state_dict_i = torch.load(model_weights_list[i])
306
+ for key in state_dict.keys():
307
+ state_dict[key] += state_dict_i[key]
308
+
309
+ # Compute the average of the weights
310
+ for key in state_dict.keys():
311
+ averaged_weights[key] = state_dict[key] / num_models
312
+
313
+ return averaged_weights
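
A short, hedged sketch of how the models above can be exercised. The src.modeling.models import path, the 11-class setup and the (1024, 128) spectrogram shape are assumptions based on this repository's layout and the AST defaults, not values confirmed by this diff.

import torch

from src.modeling.models import ASTPretrained, StudentAST   # import path assumed

teacher = ASTPretrained(n_classes=11, download_weights=False)      # random weights, nothing downloaded
student = StudentAST(n_classes=11, hidden_size=192, num_heads=3)

dummy = torch.randn(2, 1024, 128)   # [batch, time frames, mel bins] as expected by ASTModel

teacher.eval()
with torch.no_grad():
    print(teacher(dummy).shape)      # torch.Size([2, 11]) logits

student.train()
cls_logits, dist_logits = student(dummy)    # two heads (classification + distillation) during training

student.eval()
with torch.no_grad():
    print(student(dummy).shape)      # heads averaged at inference: torch.Size([2, 11])
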
src/modeling/preprocess.py ADDED
@@ -0,0 +1,336 @@
1
+ import itertools
2
+ import os
3
+ from pathlib import Path
4
+ from typing import List, Optional, Tuple, Union
5
+
6
+ import librosa
7
+ import numpy as np
8
+ import pandas as pd
9
+ import soundfile as sf
10
+ from joblib import Parallel, delayed
11
+ from sklearn.model_selection import StratifiedGroupKFold
12
+ from tqdm.autonotebook import tqdm
13
+
14
+ from src.modeling.transforms import LabelsFromTxt, ParentMultilabel
15
+ from src.modeling.utils import get_file_info, sync_bpm, sync_onset, sync_pitch
16
+
17
+
18
+ def generate_metadata(
19
+ data_dir: Union[str, Path],
20
+ save_path: str = ".",
21
+ subset: str = "train",
22
+ extract_music_features: bool = False,
23
+ n_jobs: int = -2,
24
+ ):
25
+ """
26
+ Generate metadata CSV file containing information about audio files in a directory.
27
+
28
+ :param data_dir: Directory containing audio files.
29
+ :type data_dir: Union[str, Path]
30
+ :param save_path: Directory path to save metadata CSV file.
31
+ :type save_path: str
32
+ :param subset: Subset of the dataset (train or test), defaults to 'train'.
33
+ :type subset: str
34
+ :param extract_music_features: Flag to indicate whether to extract music features or not, defaults to False.
35
+ :type extract_music_features: bool
36
+ :param n_jobs: Number of parallel jobs to run, defaults to -2.
37
+ :type n_jobs: int
38
+ :raises FileNotFoundError: If the provided data directory does not exist.
39
+ :return: DataFrame containing the metadata information.
40
+ :rtype: pandas.DataFrame
41
+ """
42
+
43
+ data_dir = Path(data_dir) if isinstance(data_dir, str) else data_dir
44
+
45
+ if subset == "train":
46
+ pattern = r"(.*)__[\d]+$"
47
+ label_extractor = ParentMultilabel()
48
+ else:
49
+ pattern = r"(.*)-[\d]+$"
50
+ label_extractor = LabelsFromTxt()
51
+
52
+ sound_files = list(data_dir.glob("**/*.wav"))
53
+ output = Parallel(n_jobs=n_jobs)(delayed(get_file_info)(path, extract_music_features) for path in tqdm(sound_files))
54
+
55
+ df = pd.DataFrame(data=output)
56
+
57
+ df["fname"] = df.path.map(lambda x: Path(x).stem)
58
+ df["song_name"] = df.fname.str.extract(pattern)
59
+ df["inst"] = df.path.map(lambda x: "-".join(sorted(list(label_extractor(x)))))
60
+ df["label_count"] = df.inst.map(lambda x: len(x.split("-")))
61
+
62
+ df.to_csv(f"{save_path}/metadata_{subset}.csv", index=False)
63
+
64
+ return df
65
+
66
+
67
+ def create_test_split(metadata_path: str, txt_save_path: str, random_state: Optional[int] = None):
68
+ """Create test split by generating a list of test songs and saving them to a text file.
69
+
70
+ :param metadata_path: Path to the CSV file containing metadata of all songs
71
+ :type metadata_path: str
72
+ :param txt_save_path: Path to the directory where the text file containing test songs will be saved
73
+ :type txt_save_path: str
74
+ :param random_state: Seed value for the random number generator, defaults to None
75
+ :type random_state: int, optional
76
+ :raises TypeError: If metadata_path or txt_save_path is not a string or if random_state is not an integer or None
77
+ :raises FileNotFoundError: If metadata_path does not exist
78
+ :raises PermissionError: If the program does not have permission to write to txt_save_path
79
+ :return: None
80
+ :rtype: None
81
+ """
82
+
83
+ df = pd.read_csv(metadata_path)
84
+ kf = StratifiedGroupKFold(n_splits=2, shuffle=True, random_state=random_state)
85
+ splits = kf.split(df.fname, df.inst, groups=df.song_name)
86
+ _, test = list(splits)[0]
87
+
88
+ test_songs = df.iloc[test].fname.sort_values().to_numpy()
89
+
90
+ with open(f"{txt_save_path}/test_songs.txt", "w") as f:
91
+ # iterate over the list of names and write each one to a new line in the file
92
+ for song in test_songs:
93
+ f.write(song + "\n")
94
+
95
+
96
+ class IRMASPreprocessor:
97
+ """
98
+ A class to preprocess IRMAS dataset metadata and create a mapping between
99
+ file paths and their corresponding instrument labels.
100
+
101
+ :param metadata: A pandas DataFrame or path to csv file containing metadata, defaults to None
102
+ :type metadata: Union[pd.DataFrame, str], optional
103
+ :param data_dir: Path to the directory containing the IRMAS dataset, defaults to None
104
+ :type data_dir: Union[str, Path], optional
105
+ :param sample_rate: Sample rate of the audio files, defaults to 16000
106
+ :type sample_rate: int, optional
107
+
108
+ :raises AssertionError: Raised when metadata is None and data_dir is also None.
109
+
110
+ :return: An instance of IRMASPreprocessor
111
+ :rtype: IRMASPreprocessor
112
+ """
113
+
114
+ def __init__(
115
+ self, metadata: Union[pd.DataFrame, str] = None, data_dir: Union[str, Path] = None, sample_rate: int = 16000
116
+ ):
117
+ if metadata is not None:
118
+ self.metadata = pd.read_csv(metadata) if isinstance(metadata, str) else metadata
119
+ if data_dir is not None:
120
+ self.metadata["path"] = self.metadata.apply(lambda x: f"{data_dir}/{x.inst}/{x.fname}.wav", axis=1)
121
+ else:
122
+ assert data_dir is not None, "No metadata found. Need to provide data directory"
123
+ self.metadata = generate_metadata(data_dir=data_dir, subset="train", extract_music_features=True)
124
+
125
+ self.instruments = self.metadata.inst.unique()
126
+ self.sample_rate = sample_rate
127
+
128
+ def preprocess_and_mix(self, save_dir: str, sync: str, ordered: bool, num_track_to_mix: int, n_jobs: int = -2):
129
+ """
130
+ A method to preprocess and mix audio tracks from the IRMAS dataset.
131
+
132
+ :param save_dir: The directory to save the preprocessed and mixed tracks
133
+ :type save_dir: str
134
+ :param sync: The column name used to synchronize the audio tracks during mixing
135
+ :type sync: str
136
+ :param ordered: Whether to order the metadata by the sync column before mixing the tracks
137
+ :type ordered: bool
138
+ :param num_track_to_mix: The number of tracks to mix together
139
+ :type num_track_to_mix: int
140
+ :param n_jobs: The number of parallel jobs to run, defaults to -2
141
+ :type n_jobs: int, optional
142
+
143
+ :raises None
144
+
145
+ :return: None
146
+ :rtype: None
147
+ """
148
+
149
+ combs = itertools.combinations(self.instruments, r=num_track_to_mix)
150
+
151
+ if ordered:
152
+ self.metadata = self.metadata.sort_values(by=sync)
153
+ else:
154
+ self.metadata = self.metadata.sample(frac=1)
155
+
156
+ Parallel(n_jobs=n_jobs)(delayed(self._mix)(insts, save_dir, sync) for (insts) in tqdm(combs))
157
+ print("Parallel preprocessing done!")
158
+
159
+ def _mix(self, insts: Tuple[str], save_dir: str, sync: str):
160
+ """
161
+ A private method to mix audio tracks and save them to disk.
162
+
163
+ :param insts: A tuple of instrument labels to mix
164
+ :type insts: Tuple[str]
165
+ :param save_dir: The directory to save the mixed tracks
166
+ :type save_dir: str
167
+ :param sync: The column name used to synchronize the audio tracks during mixing
168
+ :type sync: str
169
+
170
+ :raises None
171
+
172
+ :return: None
173
+ :rtype: None
174
+ """
175
+
176
+ save_dir = self._create_save_dir(insts, save_dir)
177
+
178
+ insts_files_list = [self._get_filepaths(inst) for inst in insts]
179
+
180
+ max_length = max([inst_files.shape[0] for inst_files in insts_files_list])
181
+ for i, inst_files in enumerate(insts_files_list):
182
+ if inst_files.shape[0] < max_length:
183
+ diff = max_length - inst_files.shape[0]
184
+ inst_files = np.pad(inst_files, (0, diff), mode="symmetric")
185
+ insts_files_list[i] = [Path(x) for x in inst_files]
186
+
187
+ self._mix_files_and_save(insts_files_list, save_dir, sync)
188
+
189
+ def _get_filepaths(self, inst: str):
190
+ """
191
+ A private method to retrieve file paths of audio tracks for a given instrument label.
192
+
193
+ :param inst: The label of the instrument for which to retrieve the file paths
194
+ :type inst: str
195
+
196
+ :raises KeyError: Raised when the instrument label is not found in the metadata.
197
+
198
+ :return: A numpy array of file paths corresponding to the instrument label.
199
+ :rtype: numpy.ndarray
200
+ """
201
+
202
+ metadata = self.metadata.loc[self.metadata.inst == inst]
203
+
204
+ if metadata.empty:
205
+ raise KeyError("Instrument not found. Please regenerate metadata!")
206
+
207
+ files = metadata.path.to_numpy()
208
+
209
+ return files
210
+
211
+ def _mix_files_and_save(self, insts_files_list: List[List[Path]], save_dir: str, sync: str):
212
+ """
213
+ A private method to mix audio files, synchronize them using a given column name in the metadata,
214
+ and save the mixed file to disk.
215
+
216
+ :param insts_files_list: A list of lists of file paths corresponding to each instrument label
217
+ :type insts_files_list: List[List[Path]]
218
+ :param save_dir: The directory to save the mixed tracks
219
+ :type save_dir: str
220
+ :param sync: The column name used to synchronize the audio tracks during mixing
221
+ :type sync: str
222
+
223
+ :raises None
224
+
225
+ :return: None
226
+ :rtype: None
227
+ """
228
+
229
+ for i in range(len(insts_files_list[0])):
230
+ files_to_sync = [inst_files[i] for inst_files in insts_files_list]
231
+ new_name = f"{'-'.join([file.stem for file in files_to_sync])}.wav"
232
+
233
+ synced_file = self._sync_and_mix(files_to_sync, sync)
234
+ sf.write(os.path.join(save_dir, new_name), synced_file, samplerate=self.sample_rate)
235
+
236
+ def _sync_and_mix(self, files_to_sync: List[Path], sync: str):
237
+ """
238
+ Synchronize and mix audio files.
239
+
240
+ :param files_to_sync: A list of file paths to synchronize and mix.
241
+ :type files_to_sync: List[Path]
242
+ :param sync: The type of synchronization to use. One of ['bpm', 'pitch', None].
243
+ :type sync: str, optional
244
+ :raises KeyError: If any file in files_to_sync is not found in metadata.
245
+ :return: The synchronized and mixed audio signal.
246
+ :rtype: numpy.ndarray
247
+ """
248
+
249
+ cols = ["pitch", "bpm", "onset"]
250
+ files_metadata_df = self.metadata.loc[
251
+ self.metadata.path.isin([str(file_path) for file_path in files_to_sync])
252
+ ].set_index("path")
253
+
254
+ num_files = files_metadata_df.shape[0]
255
+ if num_files != len(files_to_sync):
256
+ raise KeyError("File not found in metadata. Please regenerate")
257
+
258
+ if sync is not None:
259
+ mean_features = files_metadata_df[cols].mean().to_dict()
260
+
261
+ metadata_dict = files_metadata_df.to_dict("index")
262
+
263
+ for i, (file_to_sync_path, features) in enumerate(metadata_dict.items()):
264
+ file_to_sync, sr_sync = librosa.load(file_to_sync_path, sr=None)
265
+
266
+ if sr_sync != 44100:
267
+ file_to_sync = librosa.resample(y=file_to_sync, orig_sr=sr_sync, target_sr=44100)  # bring odd rates to the common 44.1 kHz before mixing
268
+
269
+ if sync == "bpm":
270
+ file_to_sync = sync_bpm(file_to_sync, sr_sync, bpm_base=mean_features["bpm"], bpm=features["bpm"])
271
+
272
+ if sync == "pitch":
273
+ file_to_sync = sync_pitch(
274
+ file_to_sync, sr_sync, pitch_base=mean_features["pitch"], pitch=features["pitch"]
275
+ )
276
+
277
+ if sync is not None:
278
+ file_to_sync = sync_onset(
279
+ file_to_sync, sr_sync, onset_base=mean_features["onset"], onset=features["onset"]
280
+ )
281
+
282
+ file_to_sync = librosa.util.normalize(file_to_sync)
283
+
284
+ if i == 0:
285
+ mixed_sound = np.zeros_like(file_to_sync)
286
+
287
+ if mixed_sound.shape[0] > file_to_sync.shape[0]:
288
+ file_to_sync = np.resize(file_to_sync, mixed_sound.shape)
289
+ else:
290
+ mixed_sound = np.resize(mixed_sound, file_to_sync.shape)
291
+
292
+ mixed_sound += file_to_sync
293
+
294
+ mixed_sound /= num_files
295
+
296
+ return librosa.resample(y=mixed_sound, orig_sr=44100, target_sr=self.sample_rate)
297
+
298
+ def _create_save_dir(self, insts: Union[Tuple[str], List[str]], save_dir: str):
299
+ """
300
+ Create and return a directory to save instrument-specific files.
301
+
302
+ :param insts: A tuple or list of instrument names.
303
+ :type insts: Union[Tuple[str], List[str]]
304
+ :param save_dir: The path to the directory where the new directory will be created.
305
+ :type save_dir: str
306
+ :return: The path to the newly created directory.
307
+ :rtype: str
308
+ """
309
+
310
+ new_dir_name = "-".join(insts)
311
+ new_dir_path = os.path.join(save_dir, new_dir_name)
312
+ os.makedirs(new_dir_path, exist_ok=True)
313
+ return new_dir_path
314
+
315
+ @classmethod
316
+ def from_metadata(cls, metadata_path: str, **kwargs):
317
+ """
318
+ Create a new instance of the class from a metadata file.
319
+
320
+ :param metadata_path: The path to the metadata file.
321
+ :type metadata_path: str
322
+ :param **kwargs: Additional keyword arguments to pass to the class constructor.
323
+ :return: A new instance of the class.
324
+ :rtype: cls
325
+ """
326
+
327
+ metadata = pd.read_csv(metadata_path)
328
+ return cls(metadata, **kwargs)
329
+
330
+
331
+ if __name__ == "__main__":
332
+ data_dir = "/home/kpintaric/lumen-irmas/data/raw/IRMAS_Training_Data"
333
+ metadata_path = "/home/kpintaric/lumen-irmas/data/metadata_train.csv"
334
+ preprocess = IRMASPreprocessor(metadata=metadata_path, data_dir=data_dir)
335
+ preprocess.preprocess_and_mix(save_dir="data", sync="pitch", ordered=False, num_track_to_mix=3)
336
+ a = 1
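
Beyond the __main__ block above, a hedged end-to-end sketch of the preprocessing utilities; the directory paths are placeholders and the src.modeling.preprocess import path is an assumption.

from src.modeling.preprocess import IRMASPreprocessor, generate_metadata   # import path assumed

# 1) Scan the raw training set and write data/metadata_train.csv, including pitch/bpm/onset features
generate_metadata(
    data_dir="data/raw/IRMAS_Training_Data",   # placeholder location of the IRMAS training data
    save_path="data",
    subset="train",
    extract_music_features=True,               # required later for sync="pitch" or sync="bpm"
)

# 2) Build pitch-synced two-instrument mixes from that metadata
preprocessor = IRMASPreprocessor.from_metadata(
    "data/metadata_train.csv", data_dir="data/raw/IRMAS_Training_Data"
)
preprocessor.preprocess_and_mix(save_dir="data/processed", sync="pitch", ordered=False, num_track_to_mix=2)
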
src/modeling/transforms.py ADDED
@@ -0,0 +1,398 @@
1
+ import os
2
+ from abc import ABC, abstractmethod
3
+ from functools import partial
4
+
5
+ import numpy as np
6
+ import torch
7
+ import torch.nn.functional as F
8
+ import torchaudio
9
+ from torchaudio.transforms import FrequencyMasking, TimeMasking
10
+ from torchvision.transforms import Compose
11
+ from transformers import ASTFeatureExtractor
12
+
13
+
14
+ class Transform(ABC):
15
+ """Abstract base class for audio transformations."""
16
+
17
+ @abstractmethod
18
+ def __call__(self):
19
+ """
20
+ Abstract method to apply the transformation.
21
+
22
+ :raises NotImplementedError: If the subclass does not implement this method.
23
+
24
+ """
25
+ pass
26
+
27
+
28
+ class Preprocess(ABC):
29
+ """Abstract base class for preprocessing data.
30
+
31
+ This class defines the interface for preprocessing data. Subclasses must implement the call method.
32
+
33
+ """
34
+
35
+ @abstractmethod
36
+ def __call__(self):
37
+ """Process the data.
38
+
39
+ This method must be implemented by subclasses.
40
+
41
+ :raises NotImplementedError: Subclasses must implement this method.
42
+
43
+ """
44
+ pass
45
+
46
+
47
+ class OneHotEncode(Transform):
48
+ """Transform labels to one-hot encoded tensor.
49
+
50
+ This class is a transform that takes a list of labels and returns a one-hot encoded tensor.
51
+ The labels are converted to a tensor with one-hot encoding using the specified classes.
52
+
53
+ :param c: A list of classes to be used for one-hot encoding.
54
+ :type c: list
55
+ :return: A one-hot encoded tensor.
56
+ :rtype: torch.Tensor
57
+
58
+ """
59
+
60
+ def __init__(self, c: list):
61
+ self.c = c
62
+
63
+ def __call__(self, labels):
64
+ """
65
+ Transform labels to one-hot encoded tensor.
66
+
67
+ :param labels: A list of labels to be encoded.
68
+ :type labels: list
69
+ :return: A one-hot encoded tensor.
70
+ :rtype: torch.Tensor
71
+
72
+ """
73
+
74
+ target = torch.zeros(len(self.c), dtype=torch.float)
75
+ for label in labels:
76
+ idx = self.c.index(label)
77
+ target[idx] = 1
78
+ return target
79
+
80
+
81
+ class ParentMultilabel(Transform):
82
+ """
83
+ A transform that extracts a list of labels from the parent directory name of a file path.
84
+
85
+ :param sep: The separator used to split the parent directory name into labels. Defaults to " ".
86
+ :type sep: str
87
+ """
88
+
89
+ def __init__(self, sep=" "):
90
+ self.sep = sep
91
+
92
+ def __call__(self, path):
93
+ """
94
+ Extract a list of labels from the parent directory name of a file path.
95
+
96
+ :param path: The file path from which to extract labels.
97
+ :type path: str
98
+ :return: A list of labels extracted from the parent directory name of the input file path.
99
+ :rtype: List[str]
100
+ """
101
+
102
+ label = path.split(os.path.sep)[-2].split(self.sep)
103
+ return label
104
+
105
+
106
+ class LabelsFromTxt(Transform):
107
+ """
108
+ Load instrument labels from the .txt annotation file that accompanies an audio file.
109
+
110
+ This class is a transform that reads the ground-truth labels from the .txt file stored
111
+ next to the .wav file (same name, .txt extension), one label per line.
112
+
113
+ :param delimiter: The delimiter passed to numpy.loadtxt when parsing the label file, defaults to None.
114
+ :type delimiter: str, optional
115
+
116
+ """
117
+
118
+ def __init__(self, delimiter=None):
119
+ self.delimiter = delimiter
120
+
121
+ def __call__(self, path):
122
+ """
123
+ Load the labels listed in the .txt file that corresponds to the given .wav path.
124
+
125
+ :param path: The path of the .wav file whose labels should be loaded.
126
+ :type path: str
127
+ :return: An array of label strings read from the annotation file.
128
+ :rtype: numpy.ndarray
129
+
130
+ """
131
+
132
+ path = path.replace("wav", "txt")
133
+ label = np.loadtxt(path, dtype=str, ndmin=1, delimiter=self.delimiter)
134
+ return label
135
+
136
+
137
+ class PreprocessPipeline(Preprocess):
138
+ """A preprocessing pipeline for audio data.
139
+
140
+ This class is a preprocessing pipeline for audio data.
141
+ The pipeline includes resampling to a target sampling rate, mixing down stereo to mono,
142
+ and loading audio from a file.
143
+
144
+ :param target_sr: The target sampling rate to resample to.
145
+ :type target_sr: int
146
+ """
147
+
148
+ def __init__(self, target_sr):
149
+ self.target_sr = target_sr
150
+
151
+ def __call__(self, path):
152
+ """
153
+ Preprocess audio data using a pipeline.
154
+
155
+ :param path: The path to the audio file to load.
156
+ :type path: str
157
+ :return: A NumPy array of preprocessed audio data.
158
+ :rtype: numpy.ndarray
159
+
160
+ """
161
+
162
+ signal, sr = torchaudio.load(path)
163
+ signal = self._resample(signal, sr)
164
+ signal = self._mix_down(signal)
165
+ return signal.numpy()
166
+
167
+ def _mix_down(self, signal):
168
+ """
169
+ Mix down stereo to mono.
170
+
171
+ :param signal: The audio signal to mix down.
172
+ :type signal: torch.Tensor
173
+ :return: The mixed down audio signal.
174
+ :rtype: torch.Tensor
175
+
176
+ """
177
+
178
+ if signal.shape[0] > 1:
179
+ signal = torch.mean(signal, dim=0, keepdim=True)
180
+ return signal
181
+
182
+ def _resample(self, signal, input_sr):
183
+ """
184
+ Resample audio signal to a target sampling rate.
185
+
186
+ :param signal: The audio signal to resample.
187
+ :type signal: torch.Tensor
188
+ :param input_sr: The current sampling rate of the audio signal.
189
+ :type input_sr: int
190
+ :return: The resampled audio signal.
191
+ :rtype: torch.Tensor
192
+
193
+ """
194
+
195
+ if input_sr != self.target_sr:
196
+ resampler = torchaudio.transforms.Resample(input_sr, self.target_sr)
197
+ signal = resampler(signal)
198
+ return signal
199
+
200
+
201
+ class SpecToImage(Transform):
202
+ def __init__(self, mean=None, std=None, eps=1e-6):
203
+ self.mean = mean
204
+ self.std = std
205
+ self.eps = eps
206
+
207
+ def __call__(self, spec):
208
+ spec = torch.stack([spec, spec, spec], dim=-1)
209
+
210
+ mean = torch.mean(spec) if self.mean is None else self.mean
211
+ std = torch.std(spec) if self.std is None else self.std
212
+ spec_norm = (spec - mean) / std
213
+
214
+ spec_min, spec_max = torch.min(spec_norm), torch.max(spec_norm)
215
+ spec_scaled = 255 * (spec_norm - spec_min) / (spec_max - spec_min)
216
+
217
+ return spec_scaled.type(torch.uint8)
218
+
219
+
220
+ class MinMaxScale(Transform):
221
+ def __call__(self, spec):
222
+ spec_min, spec_max = torch.min(spec), torch.max(spec)
223
+
224
+ return (spec - spec_min) / (spec_max - spec_min)
225
+
226
+
227
+ class Normalize(Transform):
228
+ def __init__(self, mean, std):
229
+ self.mean = mean
230
+ self.std = std
231
+
232
+ def __call__(self, spec):
233
+ return (spec - self.mean) / self.std
234
+
235
+
236
+ class FeatureExtractor(Transform):
237
+ """Extract features from audio signal using an AST feature extractor.
238
+
239
+ This class is a transform that extracts features from an audio signal using an AST feature extractor.
240
+ The features are returned as a PyTorch tensor.
241
+
242
+ :param sr: The sampling rate of the audio signal.
243
+ :type sr: int
244
+ """
245
+
246
+ def __init__(self, sr):
247
+ self.transform = partial(ASTFeatureExtractor(), sampling_rate=sr, return_tensors="pt")
248
+
249
+ def __call__(self, signal):
250
+ """
251
+ Extract features from audio signal using an AST feature extractor.
252
+
253
+ :param signal: The audio signal to extract features from.
254
+ :type signal: numpy.ndarray
255
+ :return: A tensor of extracted audio features.
256
+ :rtype: torch.Tensor
257
+
258
+ """
259
+
260
+ return self.transform(signal.squeeze()).input_values.mT
261
+
262
+
263
+ class Preemphasis(Transform):
264
+ """perform preemphasis on the input signal.
265
+ :param signal: The signal to filter.
266
+ :param coeff: The preemphasis coefficient. 0 is none, default 0.97.
267
+ :returns: the filtered signal.
268
+ """
269
+
270
+ def __init__(self, coeff: float = 0.97):
271
+ self.coeff = coeff
272
+
273
+ def __call__(self, signal):
274
+ return torch.cat([signal[:, :1], signal[:, 1:] - self.coeff * signal[:, :-1]], dim=1)
275
+
276
+
277
+ class Spectrogram(Transform):
278
+ def __init__(self, sample_rate, n_mels, hop_length, n_fft):
279
+ self.transform = torchaudio.transforms.MelSpectrogram(
280
+ sample_rate=sample_rate, n_mels=n_mels, hop_length=hop_length, n_fft=n_fft, f_min=20, center=False
281
+ )
282
+
283
+ def __call__(self, signal):
284
+ return self.transform(signal)
285
+
286
+
287
+ class LogTransform(Transform):
288
+ def __call__(self, signal):
289
+ return torch.log(signal + 1e-8)
290
+
291
+
292
+ class PadCutToLength(Transform):
293
+ def __init__(self, max_length):
294
+ self.max_length = max_length
295
+
296
+ def __call__(self, spec):
297
+ seq_len = spec.shape[-1]
298
+
299
+ if seq_len > self.max_length:
300
+ return spec[..., : self.max_length]
301
+ if seq_len < self.max_length:
302
+ return F.pad(spec, (0, self.max_length - seq_len), mode="constant", value=0)
303
+ return spec  # unchanged when seq_len == max_length
304
+
305
+
306
+ class CustomFeatureExtractor(Transform):
307
+ def __init__(self, sample_rate, n_mels, hop_length, n_fft, max_length, mean, std):
308
+ self.extract = Compose(
309
+ [
310
+ Preemphasis(),
311
+ Spectrogram(sample_rate=sample_rate, n_mels=n_mels, hop_length=hop_length, n_fft=n_fft),
312
+ LogTransform(),
313
+ PadCutToLength(max_length=max_length),
314
+ Normalize(mean=mean, std=std),
315
+ ]
316
+ )
317
+
318
+ def __call__(self, x):
319
+ return self.extract(x)
320
+
321
+
322
+ class RepeatAudio(Transform):
323
+ """A transform to repeat audio data.
324
+
325
+ This class is a transform that repeats audio data a random number of times up to a maximum specified value.
326
+
327
+ :param max_repeats: The maximum number of times to repeat the audio data.
328
+ :type max_repeats: int
329
+ """
330
+
331
+ def __init__(self, max_repeats: int = 2):
332
+ self.max_repeats = max_repeats
333
+
334
+ def __call__(self, signal):
335
+ """
336
+ Repeat audio data a random number of times up to a maximum specified value.
337
+
338
+ :param signal: The audio data to repeat.
339
+ :type signal: numpy.ndarray
340
+ :return: The repeated audio data.
341
+ :rtype: numpy.ndarray
342
+
343
+ """
344
+
345
+ num_repeats = torch.randint(1, self.max_repeats + 1, (1,)).item()  # upper bound is exclusive, so +1 allows max_repeats
346
+ return np.tile(signal, reps=num_repeats)
347
+
348
+
349
+ class MaskFrequency(Transform):
350
+ """A transform to mask frequency of a spectrogram.
351
+
352
+ This class is a transform that masks out a random number of consecutive frequencies from a spectrogram.
353
+
354
+ :param max_mask_length: The maximum number of consecutive frequencies to mask out from the spectrogram.
355
+ :type max_mask_length: int
356
+ """
357
+
358
+ def __init__(self, max_mask_length: int = 0):
359
+ self.aug = FrequencyMasking(max_mask_length)
360
+
361
+ def __call__(self, spec):
362
+ """
363
+ Mask out a random number of consecutive frequencies from a spectrogram.
364
+
365
+ :param spec: The input spectrogram.
366
+ :type spec: torch.Tensor
367
+ :return: The spectrogram with masked frequencies.
368
+ :rtype: torch.Tensor
369
+
370
+ """
371
+
372
+ return self.aug(spec)
373
+
374
+
375
+ class MaskTime(Transform):
376
+ """A transform to mask time of a spectrogram.
377
+
378
+ This class is a transform that masks out a random number of consecutive time steps from a spectrogram.
379
+
380
+ :param max_mask_length: The maximum number of consecutive time steps to mask out from the spectrogram.
381
+ :type max_mask_length: int
382
+ """
383
+
384
+ def __init__(self, max_mask_length: int = 0):
385
+ self.aug = TimeMasking(max_mask_length)
386
+
387
+ def __call__(self, spec):
388
+ """
389
+ Mask out a random number of consecutive time steps from a spectrogram.
390
+
391
+ :param spec: The input spectrogram.
392
+ :type spec: torch.Tensor
393
+ :return: The spectrogram with masked time steps.
394
+ :rtype: torch.Tensor
395
+
396
+ """
397
+
398
+ return self.aug(spec)
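
A self-contained sketch chaining the transforms above on a synthetic clip; the 16 kHz target rate, the augmentation sizes and the import path are illustrative assumptions rather than settings taken from this diff.

import tempfile
from pathlib import Path

import torch
import torchaudio

from src.modeling.transforms import (   # import path assumed
    FeatureExtractor, MaskFrequency, MaskTime, OneHotEncode, PreprocessPipeline, RepeatAudio,
)

CLASSES = ["tru", "sax", "vio", "gac", "org", "cla", "flu", "voi", "gel", "cel", "pia"]

# Write a 3-second 440 Hz tone to a temporary wav file that stands in for an IRMAS clip
sr = 44100
t = torch.arange(0, 3 * sr) / sr
wav_path = Path(tempfile.mkdtemp()) / "demo.wav"
torchaudio.save(str(wav_path), torch.sin(2 * torch.pi * 440 * t).unsqueeze(0), sr)

signal = PreprocessPipeline(target_sr=16000)(str(wav_path))   # numpy array, mono, 16 kHz
signal = RepeatAudio(max_repeats=2)(signal)                   # waveform-level augmentation
features = FeatureExtractor(sr=16000)(signal)                 # AST input features as a torch.Tensor

features = MaskTime(max_mask_length=100)(features)            # SpecAugment-style masking
features = MaskFrequency(max_mask_length=20)(features)

target = OneHotEncode(c=CLASSES)(["cla", "pia"])              # multi-label target for a clarinet + piano clip
print(features.shape, target)
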
src/modeling/utils.py ADDED
@@ -0,0 +1,336 @@
1
+ from glob import glob
2
+ from pathlib import Path
3
+ from types import SimpleNamespace
4
+ from typing import Union
5
+
6
+ import librosa
7
+ import numpy as np
8
+ import yaml
9
+
10
+ CLASSES = ["tru", "sax", "vio", "gac", "org", "cla", "flu", "voi", "gel", "cel", "pia"]
11
+
12
+
13
+ def get_wav_files(base_path):
14
+ """
15
+ Function to recursively get all the .wav files in a directory.
16
+
17
+ :param base_path: The base path of the directory to search.
18
+ :type base_path: str or pathlib.Path
19
+
20
+ :return: A list of paths to .wav files found in the directory.
21
+ :rtype: List[str]
22
+ """
23
+
24
+ return glob(f"{base_path}/**/*.wav", recursive=True)
25
+
26
+
27
+ def parse_config(config_path):
28
+ """
29
+ Parse a YAML configuration file and return the configuration as a SimpleNamespace object.
30
+
31
+ :param config_path: The path to the YAML configuration file.
32
+ :type config_path: str or pathlib.Path
33
+
34
+ :return: A SimpleNamespace object representing the configuration.
35
+ :rtype: types.SimpleNamespace
36
+ """
37
+ with open(config_path) as file:
38
+ return SimpleNamespace(**yaml.safe_load(file))
39
+
40
+
41
+ def init_transforms(fn_dict, module):
42
+ """
43
+ Initialize a list of transforms from a dictionary of function names and their parameters.
44
+
45
+ :param fn_dict: A dictionary where keys are the names of transform functions
46
+ and values are dictionaries of parameters.
47
+ :type fn_dict: Dict[str, Dict[str, Any]]
48
+
49
+ :param module: The module where the transform functions are defined.
50
+ :type module: module
51
+
52
+ :return: A list of transform functions.
53
+ :rtype: List[Callable]
54
+ """
55
+ transforms = init_objs(fn_dict, module)
56
+ if transforms is not None:
57
+ transforms = ComposeTransforms(transforms)
58
+ return transforms
59
+
60
+
61
+ def init_objs(fn_dict, module):
62
+ """
63
+ Initialize a list of objects from a dictionary of object names and their parameters.
64
+
65
+ :param fn_dict: A dictionary where keys are the names of object classes and values are dictionaries of parameters.
66
+ :type fn_dict: Dict[str, Dict[str, Any]]
67
+
68
+ :param module: The module where the object classes are defined.
69
+ :type module: module
70
+
71
+ :return: A list of objects.
72
+ :rtype: List[Any]
73
+ """
74
+
75
+ if fn_dict is None:
76
+ return None
77
+
78
+ transforms = []
79
+ for transform in fn_dict.keys():
80
+ fn = getattr(module, transform, None)
81
+ if fn is None:
82
+ raise NotImplementedError(
83
+ "The attribute '{}' is not implemented in the module '{}'.".format(transform, module.__name__)
84
+ )
85
+
86
+ fn_args = fn_dict[transform]
87
+
88
+ if fn_args is None:
89
+ transforms.append(fn())
90
+ else:
91
+ transforms.append(fn(**fn_args))
92
+
93
+ return transforms
94
+
95
+
96
+ def init_obj(fn_dict, module, *args, **kwargs):
97
+ """
98
+ Initialize an object by calling a function with the provided arguments.
99
+
100
+ :param fn_dict: A dictionary that maps the function name to its arguments.
101
+ :type fn_dict: dict or None
102
+ :param module: The module containing the function.
103
+ :type module: module
104
+ :param args: The positional arguments for the function.
105
+ :type args: tuple
106
+ :param kwargs: The keyword arguments for the function.
107
+ :type kwargs: dict
108
+ :raises AssertionError: If a keyword argument is already specified in fn_dict.
109
+ :return: The result of calling the function with the provided arguments.
110
+ :rtype: Any
111
+ """
112
+
113
+ if fn_dict is None:
114
+ return None
115
+
116
+ name = list(fn_dict.keys())[0]
117
+
118
+ fn = getattr(module, name, None)
119
+ if fn is None:
120
+ raise NotImplementedError(
121
+ "The attribute '{}' is not implemented in the module '{}'.".format(name, module.__name__)
122
+ )
123
+
124
+ fn_args = fn_dict[name]
125
+
126
+ if fn_args is not None:
127
+ assert all(k not in fn_args for k in kwargs)
128
+ fn_args.update(kwargs)
129
+
130
+ return fn(*args, **fn_args)
131
+ else:
132
+ return fn(*args, **kwargs)
133
+
134
+
135
+ class ComposeTransforms:
136
+ """
137
+ Composes a list of transforms to be applied in sequence to input data.
138
+
139
+ :param transforms: A list of transforms to be applied.
140
+ :type transforms: List[callable]
141
+ """
142
+
143
+ def __init__(self, transforms: list):
144
+ self.transforms = transforms
145
+
146
+ def __call__(self, data, *args):
147
+ for t in self.transforms:
148
+ data = t(data, *args)
149
+ return data
150
+
151
+
152
+ def load_raw_file(path: Union[str, Path]):
153
+ """
154
+ Loads an audio file from disk and returns its raw waveform and sample rate.
155
+
156
+ :param path: The path to the audio file to load.
157
+ :type path: Union[str, Path]
158
+ :return: A tuple containing the raw waveform and sample rate.
159
+ :rtype: tuple
160
+ """
161
+ return librosa.load(path, sr=None, mono=False)
162
+
163
+
164
+ def get_onset(signal, sr):
165
+ """
166
+ Computes the onset of an audio signal.
167
+
168
+ :param signal: The audio signal.
169
+ :type signal: np.ndarray
170
+ :param sr: The sample rate of the audio signal.
171
+ :type sr: int
172
+ :return: The onset of the audio signal in seconds.
173
+ :rtype: float
174
+ """
175
+ onset = librosa.onset.onset_detect(y=signal, sr=sr, units="time")[0]
176
+ return onset
177
+
178
+
179
+ def get_bpm(signal, sr):
180
+ """
181
+ Computes the estimated beats per minute (BPM) of an audio signal.
182
+
183
+ :param signal: The audio signal.
184
+ :type signal: np.ndarray
185
+ :param sr: The sample rate of the audio signal.
186
+ :type sr: int
187
+ :return: The estimated BPM of the audio signal, or None if the BPM cannot be computed.
188
+ :rtype: Union[float, None]
189
+ """
190
+
191
+ bpm, _ = librosa.beat.beat_track(y=signal, sr=sr)
192
+ return bpm if bpm != 0 else None
193
+
194
+
195
+ def get_pitch(signal, sr):
196
+ """
197
+ Computes the estimated pitch of an audio signal.
198
+
199
+ :param signal: The audio signal.
200
+ :type signal: np.ndarray
201
+ :param sr: The sample rate of the audio signal.
202
+ :type sr: int
203
+ :return: The estimated pitch of the audio signal in logarithmic scale, or None if the pitch cannot be computed.
204
+ :rtype: Union[float, None]
205
+ """
206
+
207
+ eps = 1e-8
208
+ fmin = librosa.note_to_hz("C2")
209
+ fmax = librosa.note_to_hz("C7")
210
+
211
+ pitch, _, _ = librosa.pyin(y=signal, sr=sr, fmin=fmin, fmax=fmax)
212
+
213
+ if not np.isnan(pitch).all():
214
+ mean_log_pitch = np.nanmean(np.log(pitch + eps))
215
+ else:
216
+ mean_log_pitch = None
217
+
218
+ return mean_log_pitch
219
+
220
+
221
+ def get_file_info(path: Union[str, Path], extract_music_features: bool):
222
+ """
223
+ Loads an audio file and computes some basic information about it,
224
+ such as pitch, BPM, onset time, duration, sample rate, and number of channels.
225
+
226
+ :param path: The path to the audio file.
227
+ :type path: Union[str, Path]
228
+ :param extract_music_features: Whether to extract music features such as pitch, BPM, and onset time.
229
+ :type extract_music_features: bool
230
+ :return: A dictionary containing information about the audio file.
231
+ :rtype: dict
232
+ """
233
+
234
+ path = str(path) if isinstance(path, Path) else path
235
+
236
+ signal, sr = load_raw_file(path)
237
+ channels = signal.shape[0]
238
+
239
+ signal = librosa.to_mono(signal)
240
+ duration = len(signal) / sr
241
+
242
+ pitch, bpm, onset = None, None, None
243
+ if extract_music_features:
244
+ pitch = get_pitch(signal, sr)
245
+ bpm = get_bpm(signal, sr)
246
+ onset = get_onset(signal, sr)
247
+
248
+ return {
249
+ "path": path,
250
+ "pitch": pitch,
251
+ "bpm": bpm,
252
+ "onset": onset,
253
+ "sample_rate": sr,
254
+ "duration": duration,
255
+ "channels": channels,
256
+ }
257
+
258
+
259
+ def sync_pitch(file_to_sync: np.ndarray, sr: int, pitch_base: float, pitch: float):
260
+ """
261
+ Shift the pitch of an audio file to match a new pitch value.
262
+
263
+ :param file_to_sync: The input audio file as a NumPy array.
264
+ :type file_to_sync: np.ndarray
265
+ :param sr: The sample rate of the input file.
266
+ :type sr: int
267
+ :param pitch_base: The pitch value of the original file.
268
+ :type pitch_base: float
269
+ :param pitch: The pitch value to synchronize the input file to.
270
+ :type pitch: float
271
+ :return: The synchronized audio file as a NumPy array.
272
+ :rtype: np.ndarray
273
+ """
274
+
275
+ assert np.ndim(file_to_sync) == 1, "Input array has more than one dimension"
276
+
277
+ if any(np.isnan(x) for x in [pitch_base, pitch]):
278
+ return file_to_sync
279
+
280
+ steps = np.round(12 * np.log2(np.exp(pitch_base) / np.exp(pitch)), 0)
281
+
282
+ return librosa.effects.pitch_shift(y=file_to_sync, sr=sr, n_steps=steps)
283
+
284
+
285
+ def sync_bpm(file_to_sync: np.ndarray, sr: int, bpm_base: float, bpm: float):
286
+ """
287
+ Stretch or compress the duration of an audio file to match a new tempo.
288
+
289
+ :param file_to_sync: The input audio file as a NumPy array.
290
+ :type file_to_sync: np.ndarray
291
+ :param sr: The sample rate of the input file.
292
+ :type sr: int
293
+ :param bpm_base: The tempo of the original file.
294
+ :type bpm_base: float
295
+ :param bpm: The tempo to synchronize the input file to.
296
+ :type bpm: float
297
+ :return: The synchronized audio file as a NumPy array.
298
+ :rtype: np.ndarray
299
+ """
300
+
301
+ assert np.ndim(file_to_sync) == 1, "Input array has more than one dimension"
302
+
303
+ if any(np.isnan(x) for x in [bpm_base, bpm]):
304
+ return file_to_sync
305
+
306
+ return librosa.effects.time_stretch(y=file_to_sync, rate=bpm_base / bpm)
307
+
308
+
309
+ def sync_onset(file_to_sync: np.ndarray, sr: int, onset_base: float, onset: float):
310
+ """
311
+ Sync the onset of an audio signal by adding or removing silence at the beginning.
312
+
313
+ :param file_to_sync: The audio signal to synchronize.
314
+ :type file_to_sync: np.ndarray
315
+ :param sr: The sample rate of the audio signal.
316
+ :type sr: int
317
+ :param onset_base: The onset of the reference signal in seconds.
318
+ :type onset_base: float
319
+ :param onset: The onset of the signal to synchronize in seconds.
320
+ :type onset: float
321
+ :raises AssertionError: If the input array has more than one dimension.
322
+ :return: The synchronized audio signal.
323
+ :rtype: np.ndarray
324
+ """
325
+
326
+ assert np.ndim(file_to_sync) == 1, "Input array has more than one dimension"
327
+
328
+ if any(np.isnan(x) for x in [onset_base, onset]):
329
+ return file_to_sync
330
+
331
+ diff = int(round(abs(onset_base * sr - onset * sr), 0))
332
+
333
+ if onset_base > onset:
334
+ return np.pad(file_to_sync, (diff, 0), mode="constant", constant_values=0)
335
+ else:
336
+ return file_to_sync[diff:]
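
Finally, a hedged sketch of the config-driven helpers above: parse_config plus init_transforms building the augmentation pipeline from a YAML block. The YAML keys mirror the Transform classes in transforms.py; the file layout and import paths are assumptions for illustration.

import tempfile
from pathlib import Path

from src.modeling import transforms, utils   # import paths assumed

yaml_text = """
train_transforms:
  RepeatAudio:
    max_repeats: 2
  MaskFrequency:
    max_mask_length: 20
  MaskTime:
    max_mask_length: 100
"""

config_path = Path(tempfile.mkdtemp()) / "config.yaml"
config_path.write_text(yaml_text)

config = utils.parse_config(config_path)                       # SimpleNamespace with .train_transforms
augment = utils.init_transforms(config.train_transforms, transforms)

print(augment.transforms)                                      # ComposeTransforms wrapping the three transform objects
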