arnavkumar24 committed
Commit 89040ed • 1 Parent(s): ebbe80d
Addon
This view is limited to 50 files because it contains too many changes. See raw diff.
- AudioSep_Colab.ipynb +128 -0
- CONTRIBUTING.md +92 -0
- Dockerfile +22 -0
- LICENSE +21 -0
- assets/results.png +0 -0
- benchmark.py +116 -0
- callbacks/base.py +35 -0
- checkpoint/audiosep_base_4M_steps.ckpt +3 -0
- checkpoint/music_speech_audioset_epoch_15_esc_89.98.pt +3 -0
- cog.yaml +21 -0
- config/audiosep_base.yaml +41 -0
- data/audiotext_dataset.py +91 -0
- data/datamodules.py +122 -0
- data/waveform_mixers.py +127 -0
- datafiles/template.json +8 -0
- environment.yml +326 -0
- evaluation/evaluate_audiocaps.py +110 -0
- evaluation/evaluate_audioset.py +155 -0
- evaluation/evaluate_clotho.py +102 -0
- evaluation/evaluate_esc50.py +102 -0
- evaluation/evaluate_music.py +118 -0
- evaluation/evaluate_vggsound.py +114 -0
- evaluation/metadata/audiocaps_eval.csv +0 -0
- evaluation/metadata/audioset_eval.csv +0 -0
- evaluation/metadata/class_labels_indices.csv +528 -0
- evaluation/metadata/clotho_eval.csv +0 -0
- evaluation/metadata/esc50_eval.csv +0 -0
- evaluation/metadata/music_eval.csv +0 -0
- evaluation/metadata/vggsound_eval.csv +0 -0
- losses.py +17 -0
- models/CLAP/__init__.py +0 -0
- models/CLAP/__pycache__/__init__.cpython-310.pyc +0 -0
- models/CLAP/open_clip/__init__.py +25 -0
- models/CLAP/open_clip/__pycache__/__init__.cpython-310.pyc +0 -0
- models/CLAP/open_clip/__pycache__/factory.cpython-310.pyc +0 -0
- models/CLAP/open_clip/__pycache__/feature_fusion.cpython-310.pyc +0 -0
- models/CLAP/open_clip/__pycache__/htsat.cpython-310.pyc +0 -0
- models/CLAP/open_clip/__pycache__/loss.cpython-310.pyc +0 -0
- models/CLAP/open_clip/__pycache__/model.cpython-310.pyc +0 -0
- models/CLAP/open_clip/__pycache__/openai.cpython-310.pyc +0 -0
- models/CLAP/open_clip/__pycache__/pann_model.cpython-310.pyc +0 -0
- models/CLAP/open_clip/__pycache__/pretrained.cpython-310.pyc +0 -0
- models/CLAP/open_clip/__pycache__/timm_model.cpython-310.pyc +0 -0
- models/CLAP/open_clip/__pycache__/tokenizer.cpython-310.pyc +0 -0
- models/CLAP/open_clip/__pycache__/transform.cpython-310.pyc +0 -0
- models/CLAP/open_clip/__pycache__/utils.cpython-310.pyc +0 -0
- models/CLAP/open_clip/bert.py +40 -0
- models/CLAP/open_clip/bpe_simple_vocab_16e6.txt.gz +3 -0
- models/CLAP/open_clip/factory.py +277 -0
- models/CLAP/open_clip/feature_fusion.py +192 -0
AudioSep_Colab.ipynb
ADDED
@@ -0,0 +1,128 @@
{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "from pathlib import Path\n",
        "\n",
        "repo_path = Path(\"/content/AudioSep\")\n",
        "if not repo_path.exists():\n",
        "    !git clone https://github.com/Audio-AGI/AudioSep.git\n",
        "\n",
        "%cd /content/AudioSep"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "pjIhw5ECS_3_"
      },
      "outputs": [],
      "source": [
        "!pip install torchlibrosa==0.1.0 gradio==3.47.1 gdown lightning transformers==4.28.1 ftfy braceexpand webdataset soundfile wget h5py"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "t6h9KB3CcjBd"
      },
      "outputs": [],
      "source": [
        "checkpoints_dir = Path(\"checkpoint\")\n",
        "checkpoints_dir.mkdir(exist_ok=True)\n",
        "\n",
        "models = (\n",
        "    (\n",
        "        \"https://huggingface.co/spaces/badayvedat/AudioSep/resolve/main/checkpoint/audiosep_base_4M_steps.ckpt\",\n",
        "        checkpoints_dir / \"audiosep_base_4M_steps.ckpt\"\n",
        "    ),\n",
        "    (\n",
        "        \"https://huggingface.co/spaces/badayvedat/AudioSep/resolve/main/checkpoint/music_speech_audioset_epoch_15_esc_89.98.pt\",\n",
        "        checkpoints_dir / \"music_speech_audioset_epoch_15_esc_89.98.pt\"\n",
        "    )\n",
        ")\n",
        "\n",
        "for model_url, model_path in models:\n",
        "    if not model_path.exists():\n",
        "        !wget {model_url} -O {model_path}"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "3uDrzCQyY58h"
      },
      "outputs": [],
      "source": [
        "!wget \"https://audio-agi.github.io/Separate-Anything-You-Describe/demos/exp31_water drops_mixture.wav\""
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "0nr77CGXTwO1"
      },
      "outputs": [],
      "source": [
        "import torch\n",
        "from pipeline import build_audiosep, inference\n",
        "\n",
        "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
        "\n",
        "model = build_audiosep(\n",
        "    config_yaml='config/audiosep_base.yaml',\n",
        "    checkpoint_path=str(models[0][1]),\n",
        "    device=device)\n",
        "\n",
        "audio_file = 'exp31_water drops_mixture.wav'\n",
        "text = 'water drops'\n",
        "output_file = 'separated_audio.wav'\n",
        "\n",
        "# AudioSep processes the audio at 32 kHz sampling rate\n",
        "inference(model, audio_file, text, output_file, device)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "kssOe0pbPSWp"
      },
      "outputs": [],
      "source": [
        "print(f\"The separated audio is saved to: '{output_file}' file.\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "sl35U3dAR6KN"
      },
      "outputs": [],
      "source": []
    }
  ],
  "metadata": {
    "colab": {
      "provenance": []
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}
CONTRIBUTING.md
ADDED
@@ -0,0 +1,92 @@
# 🎵 Contributing to AudioSep

Welcome to the AudioSep repository, where your contributions can harmonize the world of audio separation. To ensure a harmonious and organized collaboration, please follow the contribution guidelines outlined below.

## **Submitting Contributions**

To contribute to this project, please adhere to the following steps:

### **1. Choose or Create an Issue**

- Start by reviewing the existing issues to identify areas where your contributions can make a significant impact.
- If you have ideas for new features, enhancements, or bug fixes, feel free to create a new issue to propose your contributions. Provide comprehensive details for clarity.

### **2. Fork the Repository**

- To initiate your contribution, fork the primary repository by clicking the "Fork" button. This will create a copy of the repository in your personal GitHub account.

### **3. Clone Your Forked Repository**

- Clone your forked repository to your local development environment using the following command:

```bash
git clone https://github.com/your-username/AudioSep.git
```

### **4. Set Up the Upstream Remote**

- Maintain a reference to the primary project by adding it as the upstream remote:

```bash
cd AudioSep
git remote add upstream https://github.com/Audio-AGI/AudioSep
git remote -v
```

### **5. Create a New Branch**

- Before starting your contribution, establish a new branch dedicated to your specific task:

```bash
git checkout -b my-contribution
```

## **Working on Your Contribution**

Now that your development environment is ready and a new branch is established, you can start working on your contribution. Please ensure you adhere to the following guidelines:

### **6. Make Changes**

- Implement the necessary changes, including code additions, enhancements, or bug fixes. Ensure your contributions are well-structured, documented, and aligned with the project's objectives.

### **7. Commit Your Changes**

- Commit your changes using informative commit messages that clearly convey the purpose of your contributions:

```bash
git commit -m "Add a descriptive message here"
```

### **8. Push Your Changes**

- Push the committed changes to your remote repository on GitHub:

```bash
git push origin my-contribution
```

### **9. Create a Pull Request**

- Visit your repository on GitHub and click the "New Pull Request" button to initiate a pull request from your branch to the primary repository.

### **10. Await Review**

- Your pull request will undergo review, and feedback will be provided by the project maintainers or fellow contributors. Be prepared to address any suggested changes or refinements.

## **Community Engagement**

While contributing, please consider engaging with the community in the following ways:

### **11. Join Discussions**

- Participate in discussions related to audio separation techniques and their applications. Share your insights, experiences, and expertise in the audio field.

### **12. Share Ideas**

- If you have innovative ideas for advancing the project or optimizing audio separation, such as new algorithms or research findings, feel free to open issues to initiate productive discussions.

## **Acknowledgment**

We appreciate your dedication to the world of audio separation. Your contributions play a crucial role in harmonizing audio and improving the listening experience for all. If you have questions or require assistance, please don't hesitate to contact the project maintainers.

Thank you for your valuable contributions, and we eagerly anticipate collaborating with you on AudioSep! 🎶🙌
Dockerfile
ADDED
@@ -0,0 +1,22 @@
FROM python:3.10.11

# Copy the current directory contents into the container at .
COPY . .

# Set the working directory to /
WORKDIR /

# Install requirements.txt
RUN pip install --no-cache-dir --upgrade -r /requirements.txt

RUN useradd -m -u 1000 user
USER user
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH

WORKDIR $HOME/app

COPY --chown=user . $HOME/app

# Start the FastAPI app on port 7860, the default port expected by Spaces
CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "7860"]
LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) Xubo Liu

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE
assets/results.png
ADDED
benchmark.py
ADDED
@@ -0,0 +1,116 @@
import os
from tqdm import tqdm
import numpy as np
from evaluation.evaluate_audioset import AudioSetEvaluator
from evaluation.evaluate_audiocaps import AudioCapsEvaluator
from evaluation.evaluate_vggsound import VGGSoundEvaluator
from evaluation.evaluate_music import MUSICEvaluator
from evaluation.evaluate_esc50 import ESC50Evaluator
from evaluation.evaluate_clotho import ClothoEvaluator
from models.clap_encoder import CLAP_Encoder

from utils import (
    load_ss_model,
    calculate_sdr,
    calculate_sisdr,
    parse_yaml,
    get_mean_sdr_from_dict,
)


def eval(checkpoint_path, config_yaml='config/audiosep_base.yaml'):

    log_dir = 'eval_logs'
    os.makedirs(log_dir, exist_ok=True)

    device = "cuda"

    configs = parse_yaml(config_yaml)

    # AudioSet Evaluators
    audioset_evaluator = AudioSetEvaluator()
    # AudioCaps Evaluator
    audiocaps_evaluator = AudioCapsEvaluator()
    # VGGSound+ Evaluator
    vggsound_evaluator = VGGSoundEvaluator()
    # Clotho Evaluator
    clotho_evaluator = ClothoEvaluator()
    # MUSIC Evaluator
    music_evaluator = MUSICEvaluator()
    # ESC-50 Evaluator
    esc50_evaluator = ESC50Evaluator()

    # Load model
    query_encoder = CLAP_Encoder().eval()

    pl_model = load_ss_model(
        configs=configs,
        checkpoint_path=checkpoint_path,
        query_encoder=query_encoder
    ).to(device)

    print(f'------- Start Evaluation -------')

    # evaluation on Clotho
    SISDR, SDRi = clotho_evaluator(pl_model)
    msg_clotho = "Clotho Avg SDRi: {:.3f}, SISDR: {:.3f}".format(SDRi, SISDR)
    print(msg_clotho)

    # evaluation on VGGSound+ (YAN)
    SISDR, SDRi = vggsound_evaluator(pl_model)
    msg_vgg = "VGGSound Avg SDRi: {:.3f}, SISDR: {:.3f}".format(SDRi, SISDR)
    print(msg_vgg)

    # evaluation on MUSIC
    SISDR, SDRi = music_evaluator(pl_model)
    msg_music = "MUSIC Avg SDRi: {:.3f}, SISDR: {:.3f}".format(SDRi, SISDR)
    print(msg_music)

    # evaluation on ESC-50
    SISDR, SDRi = esc50_evaluator(pl_model)
    msg_esc50 = "ESC-50 Avg SDRi: {:.3f}, SISDR: {:.3f}".format(SDRi, SISDR)
    print(msg_esc50)

    # evaluation on AudioSet
    stats_dict = audioset_evaluator(pl_model=pl_model)
    median_sdris = {}
    median_sisdrs = {}

    for class_id in range(527):
        median_sdris[class_id] = np.nanmedian(stats_dict["sdris_dict"][class_id])
        median_sisdrs[class_id] = np.nanmedian(stats_dict["sisdrs_dict"][class_id])

    SDRi = get_mean_sdr_from_dict(median_sdris)
    SISDR = get_mean_sdr_from_dict(median_sisdrs)
    msg_audioset = "AudioSet Avg SDRi: {:.3f}, SISDR: {:.3f}".format(SDRi, SISDR)
    print(msg_audioset)

    # evaluation on AudioCaps
    SISDR, SDRi = audiocaps_evaluator(pl_model)
    msg_audiocaps = "AudioCaps Avg SDRi: {:.3f}, SISDR: {:.3f}".format(SDRi, SISDR)
    print(msg_audiocaps)

    # evaluation on Clotho
    SISDR, SDRi = clotho_evaluator(pl_model)
    msg_clotho = "Clotho Avg SDRi: {:.3f}, SISDR: {:.3f}".format(SDRi, SISDR)
    print(msg_clotho)

    msgs = [msg_audioset, msg_vgg, msg_audiocaps, msg_clotho, msg_music, msg_esc50]

    # open file in write mode
    log_path = os.path.join(log_dir, 'eval_results.txt')
    with open(log_path, 'w') as fp:
        for msg in msgs:
            fp.write(msg + '\n')
        print(f'Eval log is written to {log_path} ...')
    print('------------------------- Done ---------------------------')


if __name__ == '__main__':
    eval(checkpoint_path='checkpoint/audiosep_base.ckpt')
callbacks/base.py
ADDED
@@ -0,0 +1,35 @@
import os
import lightning.pytorch as pl
from lightning.pytorch.utilities import rank_zero_only


class CheckpointEveryNSteps(pl.Callback):
    def __init__(
        self,
        checkpoints_dir,
        save_step_frequency,
    ) -> None:
        r"""Save a checkpoint every N steps.

        Args:
            checkpoints_dir (str): directory to save checkpoints
            save_step_frequency (int): save checkpoint every N step
        """

        self.checkpoints_dir = checkpoints_dir
        self.save_step_frequency = save_step_frequency

    @rank_zero_only
    def on_train_batch_end(self, *args, **kwargs) -> None:
        r"""Save a checkpoint every N steps."""

        trainer = args[0]
        global_step = trainer.global_step

        if global_step == 1 or global_step % self.save_step_frequency == 0:

            ckpt_path = os.path.join(
                self.checkpoints_dir,
                "step={}.ckpt".format(global_step))
            trainer.save_checkpoint(ckpt_path)
            print("Save checkpoint to {}".format(ckpt_path))
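A minimal usage sketch for the callback above. The `pl.Trainer` arguments and the output directory are illustrative placeholders, not settings taken from this commit:

```python
# Sketch: attach CheckpointEveryNSteps to a Lightning Trainer.
# Trainer arguments and paths below are hypothetical, not the repo's real training setup.
import lightning.pytorch as pl
from callbacks.base import CheckpointEveryNSteps

checkpoint_callback = CheckpointEveryNSteps(
    checkpoints_dir="checkpoints/my_run",   # hypothetical output directory
    save_step_frequency=20000,               # matches save_step_frequency in audiosep_base.yaml
)

trainer = pl.Trainer(
    accelerator="gpu",
    devices=1,
    max_steps=100000,
    callbacks=[checkpoint_callback],
)
# trainer.fit(model, datamodule=data_module) would then write step=20000.ckpt, step=40000.ckpt, ...
```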
checkpoint/audiosep_base_4M_steps.ckpt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f8cda01bfd0ebd141eef45d41db7a3ada23a56568465840d3cff04b8010ce82c
size 1264844076
checkpoint/music_speech_audioset_epoch_15_esc_89.98.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:51c68f12f9d7ea25fdaaccf741ec7f81e93ee594455410f3bca4f47f88d8e006
size 2352471003
cog.yaml
ADDED
@@ -0,0 +1,21 @@
# Configuration for Cog ⚙️
# Reference: https://github.com/replicate/cog/blob/main/docs/yaml.md

build:
  gpu: true
  python_version: "3.11"
  python_packages:
    - "torchlibrosa==0.1.0"
    - "lightning==2.1.0"
    - "torch==2.0.1"
    - "transformers==4.28.1"
    - "braceexpand==0.1.7"
    - "webdataset==0.2.60"
    - "soundfile==0.12.1"
    - "torchaudio==2.0.2"
    - "torchvision==0.15.2"
    - "h5py==3.10.0"
    - "ftfy==6.1.1"
    - "pandas==2.1.1"
    - "wget==3.2"
predict: "predict.py:Predictor"
config/audiosep_base.yaml
ADDED
@@ -0,0 +1,41 @@
---
task_name: AudioSep

data:
  datafiles:
    - 'datafiles/template.json'

  sampling_rate: 32000
  segment_seconds: 5
  loudness_norm:
    lower_db: -10
    higher_db: 10
  max_mix_num: 2

model:
  query_net: CLAP
  condition_size: 512
  model_type: ResUNet30
  input_channels: 1
  output_channels: 1
  resume_checkpoint: ""
  use_text_ratio: 1.0

train:
  optimizer:
    optimizer_type: AdamW
    learning_rate: 1e-3
    warm_up_steps: 10000
    reduce_lr_steps: 1000000
    lr_lambda_type: constant_warm_up
  num_nodes: 1
  num_workers: 6
  loss_type: l1_wav
  sync_batchnorm: True
  batch_size_per_device: 12
  steps_per_epoch: 10000  # Every 10000 steps is called an `epoch`.
  evaluate_step_frequency: 10000  # Evaluate every #evaluate_step_frequency steps.
  save_step_frequency: 20000  # Save every #save_step_frequency steps.
  early_stop_steps: 10000001
  random_seed: 1234
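The YAML above drives both training and evaluation. A minimal sketch of reading it, assuming `utils.parse_yaml` returns the file as a nested Python dict (this is how benchmark.py appears to consume it; the helper itself is not shown in this commit):

```python
# Sketch: load the training config. Assumes parse_yaml returns a nested dict.
from utils import parse_yaml

configs = parse_yaml('config/audiosep_base.yaml')

sampling_rate = configs['data']['sampling_rate']               # 32000
segment_seconds = configs['data']['segment_seconds']           # 5
max_mix_num = configs['data']['max_mix_num']                   # 2
lower_db = configs['data']['loudness_norm']['lower_db']        # -10
batch_size = configs['train']['batch_size_per_device']         # 12
```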
data/audiotext_dataset.py
ADDED
@@ -0,0 +1,91 @@
import json
import random
import torch
import torchaudio
from torch.utils.data import Dataset


class AudioTextDataset(Dataset):
    """Can sample data from audio-text databases
    Params:
    sampling_rate: audio sampling rate
    max_clip_len: max length (seconds) of audio clip to be sampled
    """
    def __init__(
        self,
        datafiles=[''],
        sampling_rate=32000,
        max_clip_len=5,
    ):
        all_data_json = []
        for datafile in datafiles:
            with open(datafile, 'r') as fp:
                data_json = json.load(fp)['data']
                all_data_json.extend(data_json)
        self.all_data_json = all_data_json

        self.sampling_rate = sampling_rate
        self.max_length = max_clip_len * sampling_rate

    def __len__(self):
        return len(self.all_data_json)

    def _cut_or_randomcrop(self, waveform):
        # waveform: [1, samples]
        # random crop
        if waveform.size(1) > self.max_length:
            random_idx = random.randint(0, waveform.size(1) - self.max_length)
            waveform = waveform[:, random_idx:random_idx + self.max_length]
        else:
            temp_wav = torch.zeros(1, self.max_length)
            temp_wav[:, 0:waveform.size(1)] = waveform
            waveform = temp_wav

        assert waveform.size(1) == self.max_length, \
            f"number of audio samples is {waveform.size(1)}"

        return waveform

    def _read_audio(self, index):
        try:
            audio_path = self.all_data_json[index]['wav']
            audio_data, audio_rate = torchaudio.load(audio_path, channels_first=True)
            text = self.all_data_json[index]['caption']

            # drop short utterance
            if audio_data.size(1) < self.sampling_rate * 1:
                raise Exception(f'{audio_path} is too short, drop it ...')

            return text, audio_data, audio_rate

        except Exception as e:
            print(f'error: {e} occurs, when loading {audio_path}')
            random_index = random.randint(0, len(self.all_data_json) - 1)
            return self._read_audio(index=random_index)

    def __getitem__(self, index):
        # create an audio tensor
        text, audio_data, audio_rate = self._read_audio(index)
        audio_len = audio_data.shape[1] / audio_rate
        # convert stereo to single channel
        if audio_data.shape[0] > 1:
            # audio_data: [samples]
            audio_data = (audio_data[0] + audio_data[1]) / 2
        else:
            audio_data = audio_data.squeeze(0)

        # resample audio clip
        if audio_rate != self.sampling_rate:
            audio_data = torchaudio.functional.resample(audio_data, orig_freq=audio_rate, new_freq=self.sampling_rate)

        audio_data = audio_data.unsqueeze(0)

        audio_data = self._cut_or_randomcrop(audio_data)

        data_dict = {
            'text': text,
            'waveform': audio_data,
            'modality': 'audio_text'
        }

        return data_dict
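A minimal sketch of exercising the dataset above on its own; the datafile path is a hypothetical file following the template.json format:

```python
# Sketch: load one item from AudioTextDataset.
# 'datafiles/my_dataset.json' is a hypothetical datafile in the template.json format.
from data.audiotext_dataset import AudioTextDataset

dataset = AudioTextDataset(
    datafiles=['datafiles/my_dataset.json'],
    sampling_rate=32000,
    max_clip_len=5,
)

item = dataset[0]
print(item['text'])             # the caption string
print(item['waveform'].shape)   # torch.Size([1, 160000]) -> 5 s at 32 kHz
print(item['modality'])         # 'audio_text'
```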
data/datamodules.py
ADDED
@@ -0,0 +1,122 @@
from typing import Dict, List, Optional, NoReturn
import torch
import lightning.pytorch as pl
from torch.utils.data import DataLoader
from data.audiotext_dataset import AudioTextDataset


class DataModule(pl.LightningDataModule):
    def __init__(
        self,
        train_dataset: object,
        batch_size: int,
        num_workers: int
    ):
        r"""Data module. To get one batch of data:

        .. code-block:: python

            data_module.setup()

            for batch_data_dict in data_module.train_dataloader():
                print(batch_data_dict.keys())
                break

        Args:
            train_sampler: Sampler object
            train_dataset: Dataset object
            num_workers: int
            distributed: bool
        """
        super().__init__()
        self._train_dataset = train_dataset
        self.num_workers = num_workers
        self.batch_size = batch_size
        self.collate_fn = collate_fn

    def prepare_data(self):
        # download, split, etc...
        # only called on 1 GPU/TPU in distributed
        pass

    def setup(self, stage: Optional[str] = None) -> NoReturn:
        r"""Called on every device."""

        # make assignments here (val/train/test split)
        # called on every process in DDP

        # SegmentSampler is used for selecting segments for training.
        # On multiple devices, each SegmentSampler samples a part of mini-batch
        # data.
        self.train_dataset = self._train_dataset

    def train_dataloader(self) -> torch.utils.data.DataLoader:
        r"""Get train loader."""
        train_loader = DataLoader(
            dataset=self.train_dataset,
            batch_size=self.batch_size,
            collate_fn=self.collate_fn,
            num_workers=self.num_workers,
            pin_memory=True,
            persistent_workers=False,
            shuffle=True
        )

        return train_loader

    def val_dataloader(self):
        # val_split = Dataset(...)
        # return DataLoader(val_split)
        pass

    def test_dataloader(self):
        # test_split = Dataset(...)
        # return DataLoader(test_split)
        pass

    def teardown(self):
        # clean up after fit or test
        # called on every process in DDP
        pass


def collate_fn(list_data_dict):
    r"""Collate mini-batch data to inputs and targets for training.

    Args:
        list_data_dict: e.g., [
            {
                'text': 'a sound of dog',
                'waveform': (1, samples),
                'modality': 'audio_text'
            }
            ...
        ]
    Returns:
        data_dict: e.g.
            'audio_text': {
                'text': ['a sound of dog', ...]
                'waveform': (batch_size, 1, samples)
            }
    """

    at_list_data_dict = [data_dict for data_dict in list_data_dict if data_dict['modality'] == 'audio_text']

    at_data_dict = {}

    if len(at_list_data_dict) > 0:
        for key in at_list_data_dict[0].keys():
            at_data_dict[key] = [at_data_dict[key] for at_data_dict in at_list_data_dict]
            if key == 'waveform':
                at_data_dict[key] = torch.stack(at_data_dict[key])
            elif key == 'text':
                at_data_dict[key] = [text for text in at_data_dict[key]]

    data_dict = {
        'audio_text': at_data_dict
    }

    return data_dict
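Putting the two data classes together, a minimal sketch of pulling one collated batch, mirroring the docstring above (the datafile path is hypothetical):

```python
# Sketch: one collated batch from DataModule + AudioTextDataset.
# 'datafiles/my_dataset.json' is a hypothetical datafile in the template.json format.
from data.audiotext_dataset import AudioTextDataset
from data.datamodules import DataModule

train_dataset = AudioTextDataset(datafiles=['datafiles/my_dataset.json'])
data_module = DataModule(train_dataset=train_dataset, batch_size=4, num_workers=0)
data_module.setup()

for batch in data_module.train_dataloader():
    texts = batch['audio_text']['text']          # list of 4 caption strings
    waveforms = batch['audio_text']['waveform']  # tensor of shape (4, 1, 160000)
    print(len(texts), waveforms.shape)
    break
```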
data/waveform_mixers.py
ADDED
@@ -0,0 +1,127 @@
import random
import sre_compile
import numpy as np
import torch
import torch.nn as nn
import pyloudnorm as pyln


class SegmentMixer(nn.Module):
    def __init__(self, max_mix_num, lower_db, higher_db):
        super(SegmentMixer, self).__init__()

        self.max_mix_num = max_mix_num
        self.loudness_param = {
            'lower_db': lower_db,
            'higher_db': higher_db,
        }

    def __call__(self, waveforms):

        batch_size = waveforms.shape[0]

        data_dict = {
            'segment': [],
            'mixture': [],
        }

        for n in range(0, batch_size):

            segment = waveforms[n].clone()

            # create zero tensors as the background template
            noise = torch.zeros_like(segment)

            mix_num = random.randint(2, self.max_mix_num)
            assert mix_num >= 2

            for i in range(1, mix_num):
                next_segment = waveforms[(n + i) % batch_size]
                rescaled_next_segment = dynamic_loudnorm(audio=next_segment, reference=segment, **self.loudness_param)
                noise += rescaled_next_segment

            # randomly normalize background noise
            noise = dynamic_loudnorm(audio=noise, reference=segment, **self.loudness_param)

            # create audio mixture
            mixture = segment + noise

            # declipping if need be
            max_value = torch.max(torch.abs(mixture))
            if max_value > 1:
                segment *= 0.9 / max_value
                mixture *= 0.9 / max_value

            data_dict['segment'].append(segment)
            data_dict['mixture'].append(mixture)

        for key in data_dict.keys():
            data_dict[key] = torch.stack(data_dict[key], dim=0)

        # return data_dict
        return data_dict['mixture'], data_dict['segment']


def rescale_to_match_energy(segment1, segment2):

    ratio = get_energy_ratio(segment1, segment2)
    rescaled_segment1 = segment1 / ratio
    return rescaled_segment1


def get_energy(x):
    return torch.mean(x ** 2)


def get_energy_ratio(segment1, segment2):

    energy1 = get_energy(segment1)
    energy2 = max(get_energy(segment2), 1e-10)
    ratio = (energy1 / energy2) ** 0.5
    ratio = torch.clamp(ratio, 0.02, 50)
    return ratio


def dynamic_loudnorm(audio, reference, lower_db=-10, higher_db=10):
    rescaled_audio = rescale_to_match_energy(audio, reference)

    delta_loudness = random.randint(lower_db, higher_db)

    gain = np.power(10.0, delta_loudness / 20.0)

    return gain * rescaled_audio


def torch_to_numpy(tensor):
    """Convert a PyTorch tensor to a NumPy array."""
    if isinstance(tensor, torch.Tensor):
        return tensor.detach().cpu().numpy()
    else:
        raise ValueError("Input must be a PyTorch tensor.")


def numpy_to_torch(array):
    """Convert a NumPy array to a PyTorch tensor."""
    if isinstance(array, np.ndarray):
        return torch.from_numpy(array)
    else:
        raise ValueError("Input must be a NumPy array.")


# decayed
def random_loudness_norm(audio, lower_db=-35, higher_db=-15, sr=32000):
    device = audio.device
    audio = torch_to_numpy(audio.squeeze(0))
    # randomly select a norm volume
    norm_vol = random.randint(lower_db, higher_db)

    # measure the loudness first
    meter = pyln.Meter(sr)  # create BS.1770 meter
    loudness = meter.integrated_loudness(audio)
    # loudness normalize audio
    normalized_audio = pyln.normalize.loudness(audio, loudness, norm_vol)

    normalized_audio = numpy_to_torch(normalized_audio).unsqueeze(0)

    return normalized_audio.to(device)
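A minimal sketch of how the mixer above turns a training batch into (mixture, target) pairs; the random tensor is only a stand-in for a real batch from the dataloader:

```python
# Sketch: build mixtures from a batch of 5-second, 32 kHz clips.
# The random batch is a stand-in for batch['audio_text']['waveform'].
import torch
from data.waveform_mixers import SegmentMixer

mixer = SegmentMixer(max_mix_num=2, lower_db=-10, higher_db=10)  # values from audiosep_base.yaml

waveforms = torch.randn(4, 1, 5 * 32000) * 0.1   # (batch, channels, samples)
mixtures, segments = mixer(waveforms)

# Each mixture is its target segment plus 1..(max_mix_num-1) other clips from the
# same batch, each energy-matched to the target and offset by a random
# -10..+10 dB gain, then rescaled if the sum would clip.
print(mixtures.shape, segments.shape)  # torch.Size([4, 1, 160000]) twice
```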
datafiles/template.json
ADDED
@@ -0,0 +1,8 @@
{
    "data": [
        {
            "wav": "path_to_audio_file",
            "caption": "textual_desciptions"
        }
    ]
}
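The template above is the datafile format consumed by `AudioTextDataset`: each entry pairs a wav path with a caption. A minimal sketch of generating such a file from a hypothetical folder of clips and a caption lookup:

```python
# Sketch: write a datafile in the template.json format.
# 'my_audio/' and the captions dict are hypothetical stand-ins for real data.
import json
from pathlib import Path

captions = {"dog.wav": "a dog barking", "rain.wav": "rain falling on a roof"}

entries = [
    {"wav": str(path), "caption": captions[path.name]}
    for path in Path("my_audio").glob("*.wav")
    if path.name in captions
]

with open("datafiles/my_dataset.json", "w") as fp:
    json.dump({"data": entries}, fp, indent=4)
```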
environment.yml
ADDED
@@ -0,0 +1,326 @@
name: AudioSep
channels:
  - pytorch
  - nvidia
  - defaults
dependencies:
  - _libgcc_mutex=0.1=main
  - _openmp_mutex=5.1=1_gnu
  - backcall=0.2.0=pyhd3eb1b0_0
  - blas=1.0=mkl
  - boltons=23.0.0=py310h06a4308_0
  - brotlipy=0.7.0=py310h7f8727e_1002
  - bzip2=1.0.8=h7b6447c_0
  - ca-certificates=2023.01.10=h06a4308_0
  - certifi=2022.12.7=py310h06a4308_0
  - cffi=1.15.1=py310h5eee18b_3
  - charset-normalizer=2.0.4=pyhd3eb1b0_0
  - comm=0.1.2=py310h06a4308_0
  - conda=23.3.1=py310h06a4308_0
  - conda-content-trust=0.1.3=py310h06a4308_0
  - conda-package-handling=2.0.2=py310h06a4308_0
  - conda-package-streaming=0.7.0=py310h06a4308_0
  - cryptography=38.0.4=py310h9ce1e76_0
  - cuda=11.6.1=0
  - cuda-cccl=11.6.55=hf6102b2_0
  - cuda-command-line-tools=11.6.2=0
  - cuda-compiler=11.6.2=0
  - cuda-cudart=11.6.55=he381448_0
  - cuda-cudart-dev=11.6.55=h42ad0f4_0
  - cuda-cuobjdump=11.6.124=h2eeebcb_0
  - cuda-cupti=11.6.124=h86345e5_0
  - cuda-cuxxfilt=11.6.124=hecbf4f6_0
  - cuda-driver-dev=11.6.55=0
  - cuda-gdb=12.1.55=0
  - cuda-libraries=11.6.1=0
  - cuda-libraries-dev=11.6.1=0
  - cuda-memcheck=11.8.86=0
  - cuda-nsight=12.1.55=0
  - cuda-nsight-compute=12.1.0=0
  - cuda-nvcc=11.6.124=hbba6d2d_0
  - cuda-nvdisasm=12.1.55=0
  - cuda-nvml-dev=11.6.55=haa9ef22_0
  - cuda-nvprof=12.1.55=0
  - cuda-nvprune=11.6.124=he22ec0a_0
  - cuda-nvrtc=11.6.124=h020bade_0
  - cuda-nvrtc-dev=11.6.124=h249d397_0
  - cuda-nvtx=11.6.124=h0630a44_0
  - cuda-nvvp=12.1.55=0
  - cuda-runtime=11.6.1=0
  - cuda-samples=11.6.101=h8efea70_0
  - cuda-sanitizer-api=12.1.55=0
  - cuda-toolkit=11.6.1=0
  - cuda-tools=11.6.1=0
  - cuda-visual-tools=11.6.1=0
  - debugpy=1.5.1=py310h295c915_0
  - decorator=5.1.1=pyhd3eb1b0_0
  - flit-core=3.8.0=py310h06a4308_0
  - freetype=2.12.1=h4a9f257_0
  - gds-tools=1.6.0.25=0
  - giflib=5.2.1=h5eee18b_3
  - gmp=6.2.1=h295c915_3
  - gnutls=3.6.15=he1e5248_0
  - idna=3.4=py310h06a4308_0
  - intel-openmp=2021.4.0=h06a4308_3561
  - ipykernel=6.19.2=py310h2f386ee_0
  - ipython=8.12.0=py310h06a4308_0
  - jpeg=9e=h5eee18b_1
  - jsonpatch=1.32=pyhd3eb1b0_0
  - jsonpointer=2.1=pyhd3eb1b0_0
  - jupyter_client=8.1.0=py310h06a4308_0
  - jupyter_core=5.3.0=py310h06a4308_0
  - lame=3.100=h7b6447c_0
  - lcms2=2.12=h3be6417_0
  - ld_impl_linux-64=2.38=h1181459_1
  - lerc=3.0=h295c915_0
  - libcublas=11.9.2.110=h5e84587_0
  - libcublas-dev=11.9.2.110=h5c901ab_0
  - libcufft=10.7.1.112=hf425ae0_0
  - libcufft-dev=10.7.1.112=ha5ce4c0_0
  - libcufile=1.6.0.25=0
  - libcufile-dev=1.6.0.25=0
  - libcurand=10.3.2.56=0
  - libcurand-dev=10.3.2.56=0
  - libcusolver=11.3.4.124=h33c3c4e_0
  - libcusparse=11.7.2.124=h7538f96_0
  - libcusparse-dev=11.7.2.124=hbbe9722_0
  - libdeflate=1.17=h5eee18b_0
  - libffi=3.4.2=h6a678d5_6
  - libgcc-ng=11.2.0=h1234567_1
  - libgomp=11.2.0=h1234567_1
  - libiconv=1.16=h7f8727e_2
  - libidn2=2.3.2=h7f8727e_0
  - libnpp=11.6.3.124=hd2722f0_0
  - libnpp-dev=11.6.3.124=h3c42840_0
  - libnvjpeg=11.6.2.124=hd473ad6_0
  - libnvjpeg-dev=11.6.2.124=hb5906b9_0
  - libpng=1.6.39=h5eee18b_0
  - libsodium=1.0.18=h7b6447c_0
  - libstdcxx-ng=11.2.0=h1234567_1
  - libtasn1=4.19.0=h5eee18b_0
  - libtiff=4.5.0=h6a678d5_2
  - libunistring=0.9.10=h27cfd23_0
  - libuuid=1.41.5=h5eee18b_0
  - libwebp=1.2.4=h11a3e52_1
  - libwebp-base=1.2.4=h5eee18b_1
  - lz4-c=1.9.4=h6a678d5_0
  - matplotlib-inline=0.1.6=py310h06a4308_0
  - mkl=2021.4.0=h06a4308_640
  - mkl-service=2.4.0=py310h7f8727e_0
  - mkl_fft=1.3.1=py310hd6ae3a3_0
  - mkl_random=1.2.2=py310h00e6091_0
  - ncurses=6.4=h6a678d5_0
  - nest-asyncio=1.5.6=py310h06a4308_0
  - nettle=3.7.3=hbbd107a_1
  - nsight-compute=2023.1.0.15=0
  - numpy=1.23.5=py310hd5efca6_0
  - numpy-base=1.23.5=py310h8e6c178_0
  - openh264=2.1.1=h4ff587b_0
  - openssl=1.1.1t=h7f8727e_0
  - packaging=23.0=py310h06a4308_0
  - parso=0.8.3=pyhd3eb1b0_0
  - pexpect=4.8.0=pyhd3eb1b0_3
  - pickleshare=0.7.5=pyhd3eb1b0_1003
  - pip=22.3.1=py310h06a4308_0
  - platformdirs=2.5.2=py310h06a4308_0
  - pluggy=1.0.0=py310h06a4308_1
  - psutil=5.9.0=py310h5eee18b_0
  - ptyprocess=0.7.0=pyhd3eb1b0_2
  - pure_eval=0.2.2=pyhd3eb1b0_0
  - pycosat=0.6.4=py310h5eee18b_0
  - pycparser=2.21=pyhd3eb1b0_0
  - pyopenssl=22.0.0=pyhd3eb1b0_0
  - pysocks=1.7.1=py310h06a4308_0
  - python=3.10.9=h7a1cb2a_0
  - python-dateutil=2.8.2=pyhd3eb1b0_0
  - pytorch=1.13.1=py3.10_cuda11.6_cudnn8.3.2_0
  - pytorch-cuda=11.6=h867d48c_1
  - pytorch-mutex=1.0=cuda
  - pyzmq=23.2.0=py310h6a678d5_0
  - readline=8.2=h5eee18b_0
  - requests=2.28.1=py310h06a4308_0
  - ruamel.yaml=0.17.21=py310h5eee18b_0
  - ruamel.yaml.clib=0.2.6=py310h5eee18b_1
  - setuptools=65.6.3=py310h06a4308_0
  - six=1.16.0=pyhd3eb1b0_1
  - sqlite=3.40.1=h5082296_0
  - stack_data=0.2.0=pyhd3eb1b0_0
  - tk=8.6.12=h1ccaba5_0
  - toolz=0.12.0=py310h06a4308_0
  - torchaudio=0.13.1=py310_cu116
  - torchvision=0.14.1=py310_cu116
  - tornado=6.2=py310h5eee18b_0
  - tqdm=4.64.1=py310h06a4308_0
  - typing_extensions=4.4.0=py310h06a4308_0
  - tzdata=2022g=h04d1e81_0
  - urllib3=1.26.14=py310h06a4308_0
  - wheel=0.37.1=pyhd3eb1b0_0
  - xz=5.2.10=h5eee18b_1
  - zeromq=4.3.4=h2531618_0
  - zlib=1.2.13=h5eee18b_0
  - zstandard=0.18.0=py310h5eee18b_0
  - zstd=1.5.4=hc292b87_0
  - pip:
    - absl-py==1.4.0
    - aiohttp==3.8.4
    - aiosignal==1.3.1
    - anyio==3.6.2
    - appdirs==1.4.4
    - arrow==1.2.3
    - asttokens==2.2.1
    - async-generator==1.10
    - async-timeout==4.0.2
    - attrs==22.2.0
    - audioread==3.0.0
    - av==10.0.0
    - beartype==0.12.0
    - beautifulsoup4==4.12.2
    - blessed==1.20.0
    - braceexpand==0.1.7
    - cachetools==5.3.0
    - click==8.1.3
    - contourpy==1.0.7
    - croniter==1.3.10
    - cycler==0.11.0
    - dataclasses-json==0.5.8
    - dateutils==0.6.12
    - decord==0.6.0
    - deepdiff==6.3.0
    - dtk==0.2
    - exceptiongroup==1.1.1
    - executing==1.2.0
    - fastapi==0.88.0
    - ffmpeg==1.4
    - ffmpeg-python==0.2.0
    - filelock==3.12.0
    - fonttools==4.39.3
    - frozenlist==1.3.3
    - fsspec==2023.4.0
    - ftfy==6.1.1
    - future==0.18.3
    - gammatone==1.0
    - google-auth==2.17.3
    - google-auth-oauthlib==1.0.0
    - greenlet==2.0.2
    - grpcio==1.54.0
    - h11==0.14.0
    - h5py==3.8.0
    - hickle==5.0.2
    - huggingface-hub==0.14.1
    - humanize==4.6.0
    - imageio==2.27.0
    - inquirer==3.1.3
    - ipdb==0.13.13
    - itsdangerous==2.1.2
    - jedi==0.18.2
    - jinja2==3.1.2
    - joblib==1.2.0
    - kiwisolver==1.4.4
    - langchain==0.0.216
    - langchainplus-sdk==0.0.17
    - lazy-loader==0.2
    - librosa==0.10.0.post2
    - lightning==2.0.0
    - lightning-cloud==0.5.33
    - lightning-utilities==0.8.0
    - llvmlite==0.39.1
    - markdown==3.4.3
    - markdown-it-py==2.2.0
    - markupsafe==2.1.2
    - marshmallow==3.19.0
    - marshmallow-enum==1.5.1
    - matplotlib==3.7.1
    - mdurl==0.1.2
    - mergedeep==1.3.4
    - mock==5.0.2
    - msgpack==1.0.5
    - msgpack-numpy==0.4.8
    - multidict==6.0.4
    - musdb==0.4.0
    - mypy-extensions==1.0.0
    - networkx==3.1
    - nose==1.3.7
    - numba==0.56.4
    - numexpr==2.8.4
    - oauthlib==3.2.2
    - openai==0.27.8
    - openapi-schema-pydantic==1.2.4
    - opencv-python==4.7.0.72
    - ordered-set==4.1.0
    - outcome==1.2.0
    - pandas==1.5.3
    - panns-inference==0.1.0
    - pesq==0.0.4
    - pillow==9.5.0
    - pooch==1.6.0
    - prompt-toolkit==3.0.38
    - protobuf==4.22.3
    - pyaml==23.5.9
    - pyasn1==0.5.0
    - pyasn1-modules==0.3.0
    - pydantic==1.10.7
    - pygments==2.14.0
    - pyjwt==2.6.0
    - pyloudnorm==0.1.1
    - pyparsing==3.0.9
    - pystoi==0.3.3
    - python-editor==1.0.4
    - python-multipart==0.0.6
    - pytorch-ignite==0.3.0
    - pytorch-lightning==2.0.1.post0
    - pytz==2023.3
    - pywavelets==1.4.1
    - pyyaml==6.0
    - readchar==4.0.5
    - regex==2023.3.23
    - requests-oauthlib==1.3.1
    - resampy==0.4.2
    - rich==13.3.3
    - rsa==4.9
    - scikit-image==0.20.0
    - scikit-learn==1.2.2
    - scipy==1.10.1
    - selenium==4.8.3
    - simplejpeg==1.6.6
    - sniffio==1.3.0
    - sortedcontainers==2.4.0
    - soundfile==0.12.1
    - soupsieve==2.4
    - soxr==0.3.5
    - sqlalchemy==2.0.17
    - stack-data==0.6.2
    - starlette==0.22.0
    - starsessions==1.3.0
    - stempeg==0.2.3
    - tenacity==8.2.2
    - tensorboard==2.12.2
    - tensorboard-data-server==0.7.0
    - tensorboard-plugin-wit==1.8.1
    - termcolor==1.1.0
    - threadpoolctl==3.1.0
    - tifffile==2023.3.21
    - timm==0.3.2
    - tokenizers==0.13.3
    - tomli==2.0.1
    - torchfile==0.1.0
    - torchlibrosa==0.1.0
    - torchmetrics==0.11.4
    - traitlets==5.9.0
    - transformers==4.28.1
    - trio==0.22.0
    - trio-websocket==0.10.2
    - typeguard==3.0.2
    - typing-extensions==4.5.0
    - typing-inspect==0.9.0
    - uvicorn==0.21.1
    - visdom==0.1.8.9
    - wcwidth==0.2.6
    - webdataset==0.2.48
    - websocket-client==1.5.1
    - websockets==11.0.1
    - werkzeug==2.2.3
    - wget==3.2
    - wsproto==1.2.0
    - yarl==1.8.2
    - zenodo-get==1.3.4
    - zsvision==0.7.8
evaluation/evaluate_audiocaps.py
ADDED
@@ -0,0 +1,110 @@
import os
import sys
import re
from typing import Dict, List

import csv
import pandas as pd
import numpy as np
import torch
from tqdm import tqdm
import pathlib
import librosa
import lightning.pytorch as pl
from models.clap_encoder import CLAP_Encoder

sys.path.append('../AudioSep/')
from utils import (
    load_ss_model,
    calculate_sdr,
    calculate_sisdr,
    parse_yaml,
    get_mean_sdr_from_dict,
)


class AudioCapsEvaluator:
    def __init__(
        self,
        query='caption',
        sampling_rate=32000,
    ) -> None:
        r"""AudioCaps evaluator.

        Args:
            query (str): type of query, 'caption' or 'labels'
        Returns:
            None
        """

        self.query = query
        self.sampling_rate = sampling_rate

        with open(f'evaluation/metadata/audiocaps_eval.csv') as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=',')
            eval_list = [row for row in csv_reader][1:]

        self.eval_list = eval_list
        self.audio_dir = f'evaluation/data/audiocaps'

    def __call__(
        self,
        pl_model: pl.LightningModule
    ) -> Dict:
        r"""Evaluate."""

        print(f'Evaluation on AudioCaps with [{self.query}] queries.')

        pl_model.eval()
        device = pl_model.device

        sisdrs_list = []
        sdris_list = []

        with torch.no_grad():
            for eval_data in tqdm(self.eval_list):

                idx, caption, labels, _, _ = eval_data

                source_path = os.path.join(self.audio_dir, f'segment-{idx}.wav')
                mixture_path = os.path.join(self.audio_dir, f'mixture-{idx}.wav')

                source, fs = librosa.load(source_path, sr=self.sampling_rate, mono=True)
                mixture, fs = librosa.load(mixture_path, sr=self.sampling_rate, mono=True)

                sdr_no_sep = calculate_sdr(ref=source, est=mixture)

                if self.query == 'caption':
                    text = [caption]
                elif self.query == 'labels':
                    text = [labels]

                conditions = pl_model.query_encoder.get_query_embed(
                    modality='text',
                    text=text,
                    device=device
                )

                input_dict = {
                    "mixture": torch.Tensor(mixture)[None, None, :].to(device),
                    "condition": conditions,
                }

                sep_segment = pl_model.ss_model(input_dict)["waveform"]
                # sep_segment: (batch_size=1, channels_num=1, segment_samples)

                sep_segment = sep_segment.squeeze(0).squeeze(0).data.cpu().numpy()
                # sep_segment: (segment_samples,)

                sdr = calculate_sdr(ref=source, est=sep_segment)
                sdri = sdr - sdr_no_sep
                sisdr = calculate_sisdr(ref=source, est=sep_segment)

                sisdrs_list.append(sisdr)
                sdris_list.append(sdri)

        mean_sisdr = np.mean(sisdrs_list)
        mean_sdri = np.mean(sdris_list)

        return mean_sisdr, mean_sdri
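The evaluators report SDRi (SDR of the separated signal minus SDR of the unprocessed mixture) and SI-SDR. A hedged sketch of the textbook definitions these metrics normally follow; `utils.calculate_sdr` and `utils.calculate_sisdr` are not part of this commit, so their exact implementation may differ:

```python
# Sketch: standard SDR and SI-SDR, assumed to mirror utils.calculate_sdr /
# utils.calculate_sisdr (helpers not shown in this diff; details may differ).
import numpy as np

def sdr(ref: np.ndarray, est: np.ndarray, eps: float = 1e-10) -> float:
    # Ratio of reference energy to residual energy, in dB.
    noise = est - ref
    return 10 * np.log10((np.sum(ref ** 2) + eps) / (np.sum(noise ** 2) + eps))

def si_sdr(ref: np.ndarray, est: np.ndarray, eps: float = 1e-10) -> float:
    # Scale-invariant SDR: project the estimate onto the reference first.
    alpha = np.dot(est, ref) / (np.dot(ref, ref) + eps)
    target = alpha * ref
    noise = est - target
    return 10 * np.log10((np.sum(target ** 2) + eps) / (np.sum(noise ** 2) + eps))

# SDRi as used by the evaluators: improvement over the unseparated mixture.
# sdri = sdr(ref=source, est=separated) - sdr(ref=source, est=mixture)
```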
evaluation/evaluate_audioset.py
ADDED
@@ -0,0 +1,155 @@
1 |
+
import os
|
2 |
+
import sys
|
3 |
+
import re
|
4 |
+
from typing import Dict, List
|
5 |
+
|
6 |
+
import pandas as pd
|
7 |
+
import numpy as np
|
8 |
+
import torch
|
9 |
+
from tqdm import tqdm
|
10 |
+
import pathlib
|
11 |
+
import librosa
|
12 |
+
import lightning.pytorch as pl
|
13 |
+
from models.clap_encoder import CLAP_Encoder
|
14 |
+
|
15 |
+
sys.path.append('../AudioSep/')
|
16 |
+
from utils import (
|
17 |
+
load_ss_model,
|
18 |
+
calculate_sdr,
|
19 |
+
calculate_sisdr,
|
20 |
+
parse_yaml,
|
21 |
+
get_mean_sdr_from_dict,
|
22 |
+
)
|
23 |
+
|
24 |
+
|
25 |
+
meta_csv_file = "evaluation/metadata/class_labels_indices.csv"
|
26 |
+
df = pd.read_csv(meta_csv_file, sep=',')
|
27 |
+
|
28 |
+
IDS = df['mid'].tolist()
|
29 |
+
LABELS = df['display_name'].tolist()
|
30 |
+
|
31 |
+
CLASSES_NUM = len(LABELS)
|
32 |
+
|
33 |
+
IX_TO_LB = {i : label for i, label in enumerate(LABELS)}
|
34 |
+
|
35 |
+
|
36 |
+
class AudioSetEvaluator:
|
37 |
+
def __init__(
|
38 |
+
self,
|
        audios_dir='evaluation/data/audioset',
        classes_num=527,
        sampling_rate=32000,
        number_per_class=10,
    ) -> None:
        r"""AudioSet evaluator.

        Args:
            audios_dir (str): directory of evaluation segments
            classes_num (int): the number of sound classes
            sampling_rate (int): sampling rate of the evaluation audio
            number_per_class (int): the number of samples to evaluate for each sound class

        Returns:
            None
        """

        self.audios_dir = audios_dir
        self.classes_num = classes_num
        self.number_per_class = number_per_class
        self.sampling_rate = sampling_rate

    @torch.no_grad()
    def __call__(
        self,
        pl_model: pl.LightningModule
    ) -> Dict:
        r"""Evaluate."""

        pl_model.eval()

        sisdrs_dict = {class_id: [] for class_id in range(self.classes_num)}
        sdris_dict = {class_id: [] for class_id in range(self.classes_num)}

        print('Evaluation on AudioSet with [text label] queries.')

        for class_id in tqdm(range(self.classes_num)):

            sub_dir = os.path.join(
                self.audios_dir,
                "class_id={}".format(class_id))

            audio_names = self._get_audio_names(audios_dir=sub_dir)

            for audio_index, audio_name in enumerate(audio_names):

                if audio_index == self.number_per_class:
                    break

                source_path = os.path.join(
                    sub_dir, "{},source.wav".format(audio_name))
                mixture_path = os.path.join(
                    sub_dir, "{},mixture.wav".format(audio_name))

                source, fs = librosa.load(source_path, sr=self.sampling_rate, mono=True)
                mixture, fs = librosa.load(mixture_path, sr=self.sampling_rate, mono=True)

                sdr_no_sep = calculate_sdr(ref=source, est=mixture)

                device = pl_model.device

                text = [IX_TO_LB[class_id]]

                conditions = pl_model.query_encoder.get_query_embed(
                    modality='text',
                    text=text,
                    device=device
                )

                input_dict = {
                    "mixture": torch.Tensor(mixture)[None, None, :].to(device),
                    "condition": conditions,
                }

                sep_segment = pl_model.ss_model(input_dict)["waveform"]
                # sep_segment: (batch_size=1, channels_num=1, segment_samples)

                sep_segment = sep_segment.squeeze(0).squeeze(0).data.cpu().numpy()
                # sep_segment: (segment_samples,)

                sdr = calculate_sdr(ref=source, est=sep_segment)
                sdri = sdr - sdr_no_sep
                sisdr = calculate_sisdr(ref=source, est=sep_segment)

                sisdrs_dict[class_id].append(sisdr)
                sdris_dict[class_id].append(sdri)

        stats_dict = {
            "sisdrs_dict": sisdrs_dict,
            "sdris_dict": sdris_dict,
        }

        return stats_dict

    def _get_audio_names(self, audios_dir: str) -> List[str]:
        r"""Get evaluation audio names."""
        audio_names = sorted(os.listdir(audios_dir))

        audio_names = [audio_name for audio_name in audio_names if '.wav' in audio_name]

        audio_names = [
            re.search(
                "(.*),(mixture|source).wav",
                audio_name).group(1) for audio_name in audio_names]

        audio_names = sorted(list(set(audio_names)))

        return audio_names

    @staticmethod
    def get_median_metrics(stats_dict, metric_type):
        class_ids = stats_dict[metric_type].keys()
        median_stats_dict = {
            class_id: np.nanmedian(
                stats_dict[metric_type][class_id]) for class_id in class_ids}
        return median_stats_dict
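The stats_dict returned above holds raw per-class SI-SDR and SDRi lists. Below is a minimal aggregation sketch that is not part of this commit; it assumes `pl_model` is an already-loaded AudioSep LightningModule and that `get_mean_sdr_from_dict` in `utils` averages the values of a per-class dict.

from utils import get_mean_sdr_from_dict
from evaluation.evaluate_audioset import AudioSetEvaluator

evaluator = AudioSetEvaluator(audios_dir='evaluation/data/audioset')  # assumed data layout
stats_dict = evaluator(pl_model=pl_model)  # assumption: pl_model is a loaded AudioSep model

# Median metric per class, then mean over the 527 AudioSet classes.
median_sisdrs = AudioSetEvaluator.get_median_metrics(stats_dict, 'sisdrs_dict')
median_sdris = AudioSetEvaluator.get_median_metrics(stats_dict, 'sdris_dict')
print('SI-SDR:', get_mean_sdr_from_dict(median_sisdrs))
print('SDRi:', get_mean_sdr_from_dict(median_sdris))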
evaluation/evaluate_clotho.py
ADDED
@@ -0,0 +1,102 @@
import os
import sys
import re
from typing import Dict, List

import csv
import pandas as pd
import numpy as np
import torch
from tqdm import tqdm
import pathlib
import librosa
import lightning.pytorch as pl
from models.clap_encoder import CLAP_Encoder

sys.path.append('../AudioSep/')
from utils import (
    load_ss_model,
    calculate_sdr,
    calculate_sisdr,
    parse_yaml,
    get_mean_sdr_from_dict,
)


class ClothoEvaluator:
    def __init__(
        self,
        sampling_rate=32000,
    ) -> None:
        r"""Clotho evaluator.

        Returns:
            None
        """

        self.sampling_rate = sampling_rate

        with open('evaluation/metadata/clotho_eval.csv') as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=',')
            eval_list = [row for row in csv_reader][1:]

        self.eval_list = eval_list
        self.audio_dir = 'evaluation/data/clotho'

    def __call__(
        self,
        pl_model: pl.LightningModule
    ) -> Dict:
        r"""Evaluate."""

        print('Evaluation on Clotho Evaluation with [caption] queries.')

        pl_model.eval()
        device = pl_model.device

        sisdrs_list = []
        sdris_list = []

        with torch.no_grad():
            for eval_data in tqdm(self.eval_list):

                idx, caption, _, _, _ = eval_data

                source_path = os.path.join(self.audio_dir, f'segment-{idx}.wav')
                mixture_path = os.path.join(self.audio_dir, f'mixture-{idx}.wav')

                source, fs = librosa.load(source_path, sr=self.sampling_rate, mono=True)
                mixture, fs = librosa.load(mixture_path, sr=self.sampling_rate, mono=True)

                sdr_no_sep = calculate_sdr(ref=source, est=mixture)

                text = [caption]

                conditions = pl_model.query_encoder.get_query_embed(
                    modality='text',
                    text=text,
                    device=device
                )

                input_dict = {
                    "mixture": torch.Tensor(mixture)[None, None, :].to(device),
                    "condition": conditions,
                }

                sep_segment = pl_model.ss_model(input_dict)["waveform"]
                # sep_segment: (batch_size=1, channels_num=1, segment_samples)

                sep_segment = sep_segment.squeeze(0).squeeze(0).data.cpu().numpy()
                # sep_segment: (segment_samples,)

                sdr = calculate_sdr(ref=source, est=sep_segment)
                sdri = sdr - sdr_no_sep
                sisdr = calculate_sisdr(ref=source, est=sep_segment)

                sisdrs_list.append(sisdr)
                sdris_list.append(sdri)

        mean_sisdr = np.mean(sisdrs_list)
        mean_sdri = np.mean(sdris_list)

        return mean_sisdr, mean_sdri
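A rough end-to-end usage sketch for the evaluator above. The config and checkpoint paths come from elsewhere in this Space; the exact `CLAP_Encoder` and `load_ss_model` signatures are assumptions, not verified here.

import torch
from models.clap_encoder import CLAP_Encoder
from utils import parse_yaml, load_ss_model
from evaluation.evaluate_clotho import ClothoEvaluator

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
configs = parse_yaml('config/audiosep_base.yaml')
query_encoder = CLAP_Encoder().eval()  # assumption: the encoder defaults to the bundled CLAP checkpoint
pl_model = load_ss_model(
    configs=configs,
    checkpoint_path='checkpoint/audiosep_base_4M_steps.ckpt',
    query_encoder=query_encoder,
).to(device)

mean_sisdr, mean_sdri = ClothoEvaluator()(pl_model)
print(f'Clotho: SI-SDR {mean_sisdr:.2f} dB, SDRi {mean_sdri:.2f} dB')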
evaluation/evaluate_esc50.py
ADDED
@@ -0,0 +1,102 @@
import os
import sys
import re
from typing import Dict, List

import csv
import pandas as pd
import numpy as np
import torch
from tqdm import tqdm
import pathlib
import librosa
import lightning.pytorch as pl
from models.clap_encoder import CLAP_Encoder

sys.path.append('../AudioSep/')
from utils import (
    load_ss_model,
    calculate_sdr,
    calculate_sisdr,
    parse_yaml,
    get_mean_sdr_from_dict,
)


class ESC50Evaluator:
    def __init__(
        self,
        sampling_rate=32000
    ) -> None:
        r"""ESC-50 evaluator.

        Returns:
            None
        """

        self.sampling_rate = sampling_rate

        with open('evaluation/metadata/esc50_eval.csv') as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=',')
            eval_list = [row for row in csv_reader][1:]

        self.eval_list = eval_list
        self.audio_dir = 'evaluation/data/esc50'

    def __call__(
        self,
        pl_model: pl.LightningModule
    ) -> Dict:
        r"""Evaluate."""

        print('Evaluation on ESC-50 with [text label] queries.')

        pl_model.eval()
        device = pl_model.device

        sisdrs_list = []
        sdris_list = []

        with torch.no_grad():
            for eval_data in tqdm(self.eval_list):

                idx, caption, _, _ = eval_data

                source_path = os.path.join(self.audio_dir, f'segment-{idx}.wav')
                mixture_path = os.path.join(self.audio_dir, f'mixture-{idx}.wav')

                source, fs = librosa.load(source_path, sr=self.sampling_rate, mono=True)
                mixture, fs = librosa.load(mixture_path, sr=self.sampling_rate, mono=True)

                sdr_no_sep = calculate_sdr(ref=source, est=mixture)

                text = [caption]

                conditions = pl_model.query_encoder.get_query_embed(
                    modality='text',
                    text=text,
                    device=device
                )

                input_dict = {
                    "mixture": torch.Tensor(mixture)[None, None, :].to(device),
                    "condition": conditions,
                }

                sep_segment = pl_model.ss_model(input_dict)["waveform"]
                # sep_segment: (batch_size=1, channels_num=1, segment_samples)

                sep_segment = sep_segment.squeeze(0).squeeze(0).data.cpu().numpy()
                # sep_segment: (segment_samples,)

                sdr = calculate_sdr(ref=source, est=sep_segment)
                sdri = sdr - sdr_no_sep
                sisdr = calculate_sisdr(ref=source, est=sep_segment)

                sisdrs_list.append(sisdr)
                sdris_list.append(sdri)

        mean_sdri = np.mean(sdris_list)
        mean_sisdr = np.mean(sisdrs_list)

        return mean_sisdr, mean_sdri
evaluation/evaluate_music.py
ADDED
@@ -0,0 +1,118 @@
import os
import sys
import re
from typing import Dict, List

import csv
import pandas as pd
import numpy as np
import torch
from tqdm import tqdm
import pathlib
import librosa
import lightning.pytorch as pl
from models.clap_encoder import CLAP_Encoder

sys.path.append('../AudioSep/')
from utils import (
    load_ss_model,
    calculate_sdr,
    calculate_sisdr,
    parse_yaml,
    get_mean_sdr_from_dict,
)


class MUSICEvaluator:
    def __init__(
        self,
        sampling_rate=32000
    ) -> None:

        self.sampling_rate = sampling_rate

        with open('evaluation/metadata/music_eval.csv') as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=',')
            eval_list = [row for row in csv_reader][1:]

        self.eval_list = eval_list
        self.audio_dir = 'evaluation/data/music'

        self.source_types = [
            "acoustic guitar",
            "violin",
            "accordion",
            "xylophone",
            "erhu",
            "trumpet",
            "tuba",
            "cello",
            "flute",
            "saxophone"]

    def __call__(
        self,
        pl_model: pl.LightningModule
    ) -> Dict:
        r"""Evaluate."""

        print('Evaluation on MUSIC Test with [text label] queries.')

        pl_model.eval()
        device = pl_model.device

        sisdrs_list = {source_type: [] for source_type in self.source_types}
        sdris_list = {source_type: [] for source_type in self.source_types}

        with torch.no_grad():
            for eval_data in tqdm(self.eval_list):

                idx, caption, _, _ = eval_data

                source_path = os.path.join(self.audio_dir, f'segment-{idx}.wav')
                mixture_path = os.path.join(self.audio_dir, f'mixture-{idx}.wav')

                source, fs = librosa.load(source_path, sr=self.sampling_rate, mono=True)
                mixture, fs = librosa.load(mixture_path, sr=self.sampling_rate, mono=True)

                sdr_no_sep = calculate_sdr(ref=source, est=mixture)

                text = [caption]

                conditions = pl_model.query_encoder.get_query_embed(
                    modality='text',
                    text=text,
                    device=device
                )

                input_dict = {
                    "mixture": torch.Tensor(mixture)[None, None, :].to(device),
                    "condition": conditions,
                }

                sep_segment = pl_model.ss_model(input_dict)["waveform"]
                # sep_segment: (batch_size=1, channels_num=1, segment_samples)

                sep_segment = sep_segment.squeeze(0).squeeze(0).data.cpu().numpy()
                # sep_segment: (segment_samples,)

                sdr = calculate_sdr(ref=source, est=sep_segment)
                sdri = sdr - sdr_no_sep
                sisdr = calculate_sisdr(ref=source, est=sep_segment)

                sisdrs_list[caption].append(sisdr)
                sdris_list[caption].append(sdri)

        mean_sisdr_list = []
        mean_sdri_list = []

        for source_class in self.source_types:
            sisdr = np.mean(sisdrs_list[source_class])
            sdri = np.mean(sdris_list[source_class])
            mean_sisdr_list.append(sisdr)
            mean_sdri_list.append(sdri)

        mean_sdri = np.mean(mean_sdri_list)
        mean_sisdr = np.mean(mean_sisdr_list)

        return mean_sisdr, mean_sdri
evaluation/evaluate_vggsound.py
ADDED
@@ -0,0 +1,114 @@
import os
import sys
import re
from typing import Dict, List

import csv
import pandas as pd
import numpy as np
import torch
from tqdm import tqdm
import pathlib
import librosa
import lightning.pytorch as pl
from models.clap_encoder import CLAP_Encoder

sys.path.append('../AudioSep/')
from utils import (
    load_ss_model,
    calculate_sdr,
    calculate_sisdr,
    parse_yaml,
    get_mean_sdr_from_dict,
)


class VGGSoundEvaluator:
    def __init__(
        self,
        sampling_rate=32000
    ) -> None:
        r"""VGGSound evaluator.

        Returns:
            None
        """

        self.sampling_rate = sampling_rate

        with open('evaluation/metadata/vggsound_eval.csv') as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=',')
            eval_list = [row for row in csv_reader][1:]

        self.eval_list = eval_list
        self.audio_dir = 'evaluation/data/vggsound'

    def __call__(
        self,
        pl_model: pl.LightningModule
    ) -> Dict:
        r"""Evaluate."""

        print('Evaluation on VGGSound+ with [text label] queries.')

        pl_model.eval()
        device = pl_model.device

        sisdrs_list = []
        sdris_list = []
        sisdris_list = []

        with torch.no_grad():
            for eval_data in tqdm(self.eval_list):

                # labels, source_path, mixture_path = eval_data
                file_id, mix_wav, s0_wav, s0_text, s1_wav, s1_text = eval_data

                labels = s0_text

                mixture_path = os.path.join(self.audio_dir, mix_wav)
                source_path = os.path.join(self.audio_dir, s0_wav)

                source, fs = librosa.load(source_path, sr=self.sampling_rate, mono=True)
                mixture, fs = librosa.load(mixture_path, sr=self.sampling_rate, mono=True)

                sdr_no_sep = calculate_sdr(ref=source, est=mixture)

                text = [labels]
                conditions = pl_model.query_encoder.get_query_embed(
                    modality='text',
                    text=text,
                    device=device
                )

                input_dict = {
                    "mixture": torch.Tensor(mixture)[None, None, :].to(device),
                    "condition": conditions,
                }

                sep_segment = pl_model.ss_model(input_dict)["waveform"]
                # sep_segment: (batch_size=1, channels_num=1, segment_samples)

                sep_segment = sep_segment.squeeze(0).squeeze(0).data.cpu().numpy()
                # sep_segment: (segment_samples,)

                sdr = calculate_sdr(ref=source, est=sep_segment)
                sdri = sdr - sdr_no_sep

                sisdr_no_sep = calculate_sisdr(ref=source, est=mixture)
                sisdr = calculate_sisdr(ref=source, est=sep_segment)
                sisdri = sisdr - sisdr_no_sep

                sisdrs_list.append(sisdr)
                sdris_list.append(sdri)
                sisdris_list.append(sisdri)

        mean_sisdr = np.mean(sisdrs_list)
        mean_sdri = np.mean(sdris_list)

        return mean_sisdr, mean_sdri
evaluation/metadata/audiocaps_eval.csv
ADDED
The diff for this file is too large to render.
evaluation/metadata/audioset_eval.csv
ADDED
The diff for this file is too large to render.
evaluation/metadata/class_labels_indices.csv
ADDED
@@ -0,0 +1,528 @@
1 |
+
index,mid,display_name
|
2 |
+
0,/m/09x0r,"Speech"
|
3 |
+
1,/m/05zppz,"Male speech, man speaking"
|
4 |
+
2,/m/02zsn,"Female speech, woman speaking"
|
5 |
+
3,/m/0ytgt,"Child speech, kid speaking"
|
6 |
+
4,/m/01h8n0,"Conversation"
|
7 |
+
5,/m/02qldy,"Narration, monologue"
|
8 |
+
6,/m/0261r1,"Babbling"
|
9 |
+
7,/m/0brhx,"Speech synthesizer"
|
10 |
+
8,/m/07p6fty,"Shout"
|
11 |
+
9,/m/07q4ntr,"Bellow"
|
12 |
+
10,/m/07rwj3x,"Whoop"
|
13 |
+
11,/m/07sr1lc,"Yell"
|
14 |
+
12,/m/04gy_2,"Battle cry"
|
15 |
+
13,/t/dd00135,"Children shouting"
|
16 |
+
14,/m/03qc9zr,"Screaming"
|
17 |
+
15,/m/02rtxlg,"Whispering"
|
18 |
+
16,/m/01j3sz,"Laughter"
|
19 |
+
17,/t/dd00001,"Baby laughter"
|
20 |
+
18,/m/07r660_,"Giggle"
|
21 |
+
19,/m/07s04w4,"Snicker"
|
22 |
+
20,/m/07sq110,"Belly laugh"
|
23 |
+
21,/m/07rgt08,"Chuckle, chortle"
|
24 |
+
22,/m/0463cq4,"Crying, sobbing"
|
25 |
+
23,/t/dd00002,"Baby cry, infant cry"
|
26 |
+
24,/m/07qz6j3,"Whimper"
|
27 |
+
25,/m/07qw_06,"Wail, moan"
|
28 |
+
26,/m/07plz5l,"Sigh"
|
29 |
+
27,/m/015lz1,"Singing"
|
30 |
+
28,/m/0l14jd,"Choir"
|
31 |
+
29,/m/01swy6,"Yodeling"
|
32 |
+
30,/m/02bk07,"Chant"
|
33 |
+
31,/m/01c194,"Mantra"
|
34 |
+
32,/t/dd00003,"Male singing"
|
35 |
+
33,/t/dd00004,"Female singing"
|
36 |
+
34,/t/dd00005,"Child singing"
|
37 |
+
35,/t/dd00006,"Synthetic singing"
|
38 |
+
36,/m/06bxc,"Rapping"
|
39 |
+
37,/m/02fxyj,"Humming"
|
40 |
+
38,/m/07s2xch,"Groan"
|
41 |
+
39,/m/07r4k75,"Grunt"
|
42 |
+
40,/m/01w250,"Whistling"
|
43 |
+
41,/m/0lyf6,"Breathing"
|
44 |
+
42,/m/07mzm6,"Wheeze"
|
45 |
+
43,/m/01d3sd,"Snoring"
|
46 |
+
44,/m/07s0dtb,"Gasp"
|
47 |
+
45,/m/07pyy8b,"Pant"
|
48 |
+
46,/m/07q0yl5,"Snort"
|
49 |
+
47,/m/01b_21,"Cough"
|
50 |
+
48,/m/0dl9sf8,"Throat clearing"
|
51 |
+
49,/m/01hsr_,"Sneeze"
|
52 |
+
50,/m/07ppn3j,"Sniff"
|
53 |
+
51,/m/06h7j,"Run"
|
54 |
+
52,/m/07qv_x_,"Shuffle"
|
55 |
+
53,/m/07pbtc8,"Walk, footsteps"
|
56 |
+
54,/m/03cczk,"Chewing, mastication"
|
57 |
+
55,/m/07pdhp0,"Biting"
|
58 |
+
56,/m/0939n_,"Gargling"
|
59 |
+
57,/m/01g90h,"Stomach rumble"
|
60 |
+
58,/m/03q5_w,"Burping, eructation"
|
61 |
+
59,/m/02p3nc,"Hiccup"
|
62 |
+
60,/m/02_nn,"Fart"
|
63 |
+
61,/m/0k65p,"Hands"
|
64 |
+
62,/m/025_jnm,"Finger snapping"
|
65 |
+
63,/m/0l15bq,"Clapping"
|
66 |
+
64,/m/01jg02,"Heart sounds, heartbeat"
|
67 |
+
65,/m/01jg1z,"Heart murmur"
|
68 |
+
66,/m/053hz1,"Cheering"
|
69 |
+
67,/m/028ght,"Applause"
|
70 |
+
68,/m/07rkbfh,"Chatter"
|
71 |
+
69,/m/03qtwd,"Crowd"
|
72 |
+
70,/m/07qfr4h,"Hubbub, speech noise, speech babble"
|
73 |
+
71,/t/dd00013,"Children playing"
|
74 |
+
72,/m/0jbk,"Animal"
|
75 |
+
73,/m/068hy,"Domestic animals, pets"
|
76 |
+
74,/m/0bt9lr,"Dog"
|
77 |
+
75,/m/05tny_,"Bark"
|
78 |
+
76,/m/07r_k2n,"Yip"
|
79 |
+
77,/m/07qf0zm,"Howl"
|
80 |
+
78,/m/07rc7d9,"Bow-wow"
|
81 |
+
79,/m/0ghcn6,"Growling"
|
82 |
+
80,/t/dd00136,"Whimper (dog)"
|
83 |
+
81,/m/01yrx,"Cat"
|
84 |
+
82,/m/02yds9,"Purr"
|
85 |
+
83,/m/07qrkrw,"Meow"
|
86 |
+
84,/m/07rjwbb,"Hiss"
|
87 |
+
85,/m/07r81j2,"Caterwaul"
|
88 |
+
86,/m/0ch8v,"Livestock, farm animals, working animals"
|
89 |
+
87,/m/03k3r,"Horse"
|
90 |
+
88,/m/07rv9rh,"Clip-clop"
|
91 |
+
89,/m/07q5rw0,"Neigh, whinny"
|
92 |
+
90,/m/01xq0k1,"Cattle, bovinae"
|
93 |
+
91,/m/07rpkh9,"Moo"
|
94 |
+
92,/m/0239kh,"Cowbell"
|
95 |
+
93,/m/068zj,"Pig"
|
96 |
+
94,/t/dd00018,"Oink"
|
97 |
+
95,/m/03fwl,"Goat"
|
98 |
+
96,/m/07q0h5t,"Bleat"
|
99 |
+
97,/m/07bgp,"Sheep"
|
100 |
+
98,/m/025rv6n,"Fowl"
|
101 |
+
99,/m/09b5t,"Chicken, rooster"
|
102 |
+
100,/m/07st89h,"Cluck"
|
103 |
+
101,/m/07qn5dc,"Crowing, cock-a-doodle-doo"
|
104 |
+
102,/m/01rd7k,"Turkey"
|
105 |
+
103,/m/07svc2k,"Gobble"
|
106 |
+
104,/m/09ddx,"Duck"
|
107 |
+
105,/m/07qdb04,"Quack"
|
108 |
+
106,/m/0dbvp,"Goose"
|
109 |
+
107,/m/07qwf61,"Honk"
|
110 |
+
108,/m/01280g,"Wild animals"
|
111 |
+
109,/m/0cdnk,"Roaring cats (lions, tigers)"
|
112 |
+
110,/m/04cvmfc,"Roar"
|
113 |
+
111,/m/015p6,"Bird"
|
114 |
+
112,/m/020bb7,"Bird vocalization, bird call, bird song"
|
115 |
+
113,/m/07pggtn,"Chirp, tweet"
|
116 |
+
114,/m/07sx8x_,"Squawk"
|
117 |
+
115,/m/0h0rv,"Pigeon, dove"
|
118 |
+
116,/m/07r_25d,"Coo"
|
119 |
+
117,/m/04s8yn,"Crow"
|
120 |
+
118,/m/07r5c2p,"Caw"
|
121 |
+
119,/m/09d5_,"Owl"
|
122 |
+
120,/m/07r_80w,"Hoot"
|
123 |
+
121,/m/05_wcq,"Bird flight, flapping wings"
|
124 |
+
122,/m/01z5f,"Canidae, dogs, wolves"
|
125 |
+
123,/m/06hps,"Rodents, rats, mice"
|
126 |
+
124,/m/04rmv,"Mouse"
|
127 |
+
125,/m/07r4gkf,"Patter"
|
128 |
+
126,/m/03vt0,"Insect"
|
129 |
+
127,/m/09xqv,"Cricket"
|
130 |
+
128,/m/09f96,"Mosquito"
|
131 |
+
129,/m/0h2mp,"Fly, housefly"
|
132 |
+
130,/m/07pjwq1,"Buzz"
|
133 |
+
131,/m/01h3n,"Bee, wasp, etc."
|
134 |
+
132,/m/09ld4,"Frog"
|
135 |
+
133,/m/07st88b,"Croak"
|
136 |
+
134,/m/078jl,"Snake"
|
137 |
+
135,/m/07qn4z3,"Rattle"
|
138 |
+
136,/m/032n05,"Whale vocalization"
|
139 |
+
137,/m/04rlf,"Music"
|
140 |
+
138,/m/04szw,"Musical instrument"
|
141 |
+
139,/m/0fx80y,"Plucked string instrument"
|
142 |
+
140,/m/0342h,"Guitar"
|
143 |
+
141,/m/02sgy,"Electric guitar"
|
144 |
+
142,/m/018vs,"Bass guitar"
|
145 |
+
143,/m/042v_gx,"Acoustic guitar"
|
146 |
+
144,/m/06w87,"Steel guitar, slide guitar"
|
147 |
+
145,/m/01glhc,"Tapping (guitar technique)"
|
148 |
+
146,/m/07s0s5r,"Strum"
|
149 |
+
147,/m/018j2,"Banjo"
|
150 |
+
148,/m/0jtg0,"Sitar"
|
151 |
+
149,/m/04rzd,"Mandolin"
|
152 |
+
150,/m/01bns_,"Zither"
|
153 |
+
151,/m/07xzm,"Ukulele"
|
154 |
+
152,/m/05148p4,"Keyboard (musical)"
|
155 |
+
153,/m/05r5c,"Piano"
|
156 |
+
154,/m/01s0ps,"Electric piano"
|
157 |
+
155,/m/013y1f,"Organ"
|
158 |
+
156,/m/03xq_f,"Electronic organ"
|
159 |
+
157,/m/03gvt,"Hammond organ"
|
160 |
+
158,/m/0l14qv,"Synthesizer"
|
161 |
+
159,/m/01v1d8,"Sampler"
|
162 |
+
160,/m/03q5t,"Harpsichord"
|
163 |
+
161,/m/0l14md,"Percussion"
|
164 |
+
162,/m/02hnl,"Drum kit"
|
165 |
+
163,/m/0cfdd,"Drum machine"
|
166 |
+
164,/m/026t6,"Drum"
|
167 |
+
165,/m/06rvn,"Snare drum"
|
168 |
+
166,/m/03t3fj,"Rimshot"
|
169 |
+
167,/m/02k_mr,"Drum roll"
|
170 |
+
168,/m/0bm02,"Bass drum"
|
171 |
+
169,/m/011k_j,"Timpani"
|
172 |
+
170,/m/01p970,"Tabla"
|
173 |
+
171,/m/01qbl,"Cymbal"
|
174 |
+
172,/m/03qtq,"Hi-hat"
|
175 |
+
173,/m/01sm1g,"Wood block"
|
176 |
+
174,/m/07brj,"Tambourine"
|
177 |
+
175,/m/05r5wn,"Rattle (instrument)"
|
178 |
+
176,/m/0xzly,"Maraca"
|
179 |
+
177,/m/0mbct,"Gong"
|
180 |
+
178,/m/016622,"Tubular bells"
|
181 |
+
179,/m/0j45pbj,"Mallet percussion"
|
182 |
+
180,/m/0dwsp,"Marimba, xylophone"
|
183 |
+
181,/m/0dwtp,"Glockenspiel"
|
184 |
+
182,/m/0dwt5,"Vibraphone"
|
185 |
+
183,/m/0l156b,"Steelpan"
|
186 |
+
184,/m/05pd6,"Orchestra"
|
187 |
+
185,/m/01kcd,"Brass instrument"
|
188 |
+
186,/m/0319l,"French horn"
|
189 |
+
187,/m/07gql,"Trumpet"
|
190 |
+
188,/m/07c6l,"Trombone"
|
191 |
+
189,/m/0l14_3,"Bowed string instrument"
|
192 |
+
190,/m/02qmj0d,"String section"
|
193 |
+
191,/m/07y_7,"Violin, fiddle"
|
194 |
+
192,/m/0d8_n,"Pizzicato"
|
195 |
+
193,/m/01xqw,"Cello"
|
196 |
+
194,/m/02fsn,"Double bass"
|
197 |
+
195,/m/085jw,"Wind instrument, woodwind instrument"
|
198 |
+
196,/m/0l14j_,"Flute"
|
199 |
+
197,/m/06ncr,"Saxophone"
|
200 |
+
198,/m/01wy6,"Clarinet"
|
201 |
+
199,/m/03m5k,"Harp"
|
202 |
+
200,/m/0395lw,"Bell"
|
203 |
+
201,/m/03w41f,"Church bell"
|
204 |
+
202,/m/027m70_,"Jingle bell"
|
205 |
+
203,/m/0gy1t2s,"Bicycle bell"
|
206 |
+
204,/m/07n_g,"Tuning fork"
|
207 |
+
205,/m/0f8s22,"Chime"
|
208 |
+
206,/m/026fgl,"Wind chime"
|
209 |
+
207,/m/0150b9,"Change ringing (campanology)"
|
210 |
+
208,/m/03qjg,"Harmonica"
|
211 |
+
209,/m/0mkg,"Accordion"
|
212 |
+
210,/m/0192l,"Bagpipes"
|
213 |
+
211,/m/02bxd,"Didgeridoo"
|
214 |
+
212,/m/0l14l2,"Shofar"
|
215 |
+
213,/m/07kc_,"Theremin"
|
216 |
+
214,/m/0l14t7,"Singing bowl"
|
217 |
+
215,/m/01hgjl,"Scratching (performance technique)"
|
218 |
+
216,/m/064t9,"Pop music"
|
219 |
+
217,/m/0glt670,"Hip hop music"
|
220 |
+
218,/m/02cz_7,"Beatboxing"
|
221 |
+
219,/m/06by7,"Rock music"
|
222 |
+
220,/m/03lty,"Heavy metal"
|
223 |
+
221,/m/05r6t,"Punk rock"
|
224 |
+
222,/m/0dls3,"Grunge"
|
225 |
+
223,/m/0dl5d,"Progressive rock"
|
226 |
+
224,/m/07sbbz2,"Rock and roll"
|
227 |
+
225,/m/05w3f,"Psychedelic rock"
|
228 |
+
226,/m/06j6l,"Rhythm and blues"
|
229 |
+
227,/m/0gywn,"Soul music"
|
230 |
+
228,/m/06cqb,"Reggae"
|
231 |
+
229,/m/01lyv,"Country"
|
232 |
+
230,/m/015y_n,"Swing music"
|
233 |
+
231,/m/0gg8l,"Bluegrass"
|
234 |
+
232,/m/02x8m,"Funk"
|
235 |
+
233,/m/02w4v,"Folk music"
|
236 |
+
234,/m/06j64v,"Middle Eastern music"
|
237 |
+
235,/m/03_d0,"Jazz"
|
238 |
+
236,/m/026z9,"Disco"
|
239 |
+
237,/m/0ggq0m,"Classical music"
|
240 |
+
238,/m/05lls,"Opera"
|
241 |
+
239,/m/02lkt,"Electronic music"
|
242 |
+
240,/m/03mb9,"House music"
|
243 |
+
241,/m/07gxw,"Techno"
|
244 |
+
242,/m/07s72n,"Dubstep"
|
245 |
+
243,/m/0283d,"Drum and bass"
|
246 |
+
244,/m/0m0jc,"Electronica"
|
247 |
+
245,/m/08cyft,"Electronic dance music"
|
248 |
+
246,/m/0fd3y,"Ambient music"
|
249 |
+
247,/m/07lnk,"Trance music"
|
250 |
+
248,/m/0g293,"Music of Latin America"
|
251 |
+
249,/m/0ln16,"Salsa music"
|
252 |
+
250,/m/0326g,"Flamenco"
|
253 |
+
251,/m/0155w,"Blues"
|
254 |
+
252,/m/05fw6t,"Music for children"
|
255 |
+
253,/m/02v2lh,"New-age music"
|
256 |
+
254,/m/0y4f8,"Vocal music"
|
257 |
+
255,/m/0z9c,"A capella"
|
258 |
+
256,/m/0164x2,"Music of Africa"
|
259 |
+
257,/m/0145m,"Afrobeat"
|
260 |
+
258,/m/02mscn,"Christian music"
|
261 |
+
259,/m/016cjb,"Gospel music"
|
262 |
+
260,/m/028sqc,"Music of Asia"
|
263 |
+
261,/m/015vgc,"Carnatic music"
|
264 |
+
262,/m/0dq0md,"Music of Bollywood"
|
265 |
+
263,/m/06rqw,"Ska"
|
266 |
+
264,/m/02p0sh1,"Traditional music"
|
267 |
+
265,/m/05rwpb,"Independent music"
|
268 |
+
266,/m/074ft,"Song"
|
269 |
+
267,/m/025td0t,"Background music"
|
270 |
+
268,/m/02cjck,"Theme music"
|
271 |
+
269,/m/03r5q_,"Jingle (music)"
|
272 |
+
270,/m/0l14gg,"Soundtrack music"
|
273 |
+
271,/m/07pkxdp,"Lullaby"
|
274 |
+
272,/m/01z7dr,"Video game music"
|
275 |
+
273,/m/0140xf,"Christmas music"
|
276 |
+
274,/m/0ggx5q,"Dance music"
|
277 |
+
275,/m/04wptg,"Wedding music"
|
278 |
+
276,/t/dd00031,"Happy music"
|
279 |
+
277,/t/dd00032,"Funny music"
|
280 |
+
278,/t/dd00033,"Sad music"
|
281 |
+
279,/t/dd00034,"Tender music"
|
282 |
+
280,/t/dd00035,"Exciting music"
|
283 |
+
281,/t/dd00036,"Angry music"
|
284 |
+
282,/t/dd00037,"Scary music"
|
285 |
+
283,/m/03m9d0z,"Wind"
|
286 |
+
284,/m/09t49,"Rustling leaves"
|
287 |
+
285,/t/dd00092,"Wind noise (microphone)"
|
288 |
+
286,/m/0jb2l,"Thunderstorm"
|
289 |
+
287,/m/0ngt1,"Thunder"
|
290 |
+
288,/m/0838f,"Water"
|
291 |
+
289,/m/06mb1,"Rain"
|
292 |
+
290,/m/07r10fb,"Raindrop"
|
293 |
+
291,/t/dd00038,"Rain on surface"
|
294 |
+
292,/m/0j6m2,"Stream"
|
295 |
+
293,/m/0j2kx,"Waterfall"
|
296 |
+
294,/m/05kq4,"Ocean"
|
297 |
+
295,/m/034srq,"Waves, surf"
|
298 |
+
296,/m/06wzb,"Steam"
|
299 |
+
297,/m/07swgks,"Gurgling"
|
300 |
+
298,/m/02_41,"Fire"
|
301 |
+
299,/m/07pzfmf,"Crackle"
|
302 |
+
300,/m/07yv9,"Vehicle"
|
303 |
+
301,/m/019jd,"Boat, Water vehicle"
|
304 |
+
302,/m/0hsrw,"Sailboat, sailing ship"
|
305 |
+
303,/m/056ks2,"Rowboat, canoe, kayak"
|
306 |
+
304,/m/02rlv9,"Motorboat, speedboat"
|
307 |
+
305,/m/06q74,"Ship"
|
308 |
+
306,/m/012f08,"Motor vehicle (road)"
|
309 |
+
307,/m/0k4j,"Car"
|
310 |
+
308,/m/0912c9,"Vehicle horn, car horn, honking"
|
311 |
+
309,/m/07qv_d5,"Toot"
|
312 |
+
310,/m/02mfyn,"Car alarm"
|
313 |
+
311,/m/04gxbd,"Power windows, electric windows"
|
314 |
+
312,/m/07rknqz,"Skidding"
|
315 |
+
313,/m/0h9mv,"Tire squeal"
|
316 |
+
314,/t/dd00134,"Car passing by"
|
317 |
+
315,/m/0ltv,"Race car, auto racing"
|
318 |
+
316,/m/07r04,"Truck"
|
319 |
+
317,/m/0gvgw0,"Air brake"
|
320 |
+
318,/m/05x_td,"Air horn, truck horn"
|
321 |
+
319,/m/02rhddq,"Reversing beeps"
|
322 |
+
320,/m/03cl9h,"Ice cream truck, ice cream van"
|
323 |
+
321,/m/01bjv,"Bus"
|
324 |
+
322,/m/03j1ly,"Emergency vehicle"
|
325 |
+
323,/m/04qvtq,"Police car (siren)"
|
326 |
+
324,/m/012n7d,"Ambulance (siren)"
|
327 |
+
325,/m/012ndj,"Fire engine, fire truck (siren)"
|
328 |
+
326,/m/04_sv,"Motorcycle"
|
329 |
+
327,/m/0btp2,"Traffic noise, roadway noise"
|
330 |
+
328,/m/06d_3,"Rail transport"
|
331 |
+
329,/m/07jdr,"Train"
|
332 |
+
330,/m/04zmvq,"Train whistle"
|
333 |
+
331,/m/0284vy3,"Train horn"
|
334 |
+
332,/m/01g50p,"Railroad car, train wagon"
|
335 |
+
333,/t/dd00048,"Train wheels squealing"
|
336 |
+
334,/m/0195fx,"Subway, metro, underground"
|
337 |
+
335,/m/0k5j,"Aircraft"
|
338 |
+
336,/m/014yck,"Aircraft engine"
|
339 |
+
337,/m/04229,"Jet engine"
|
340 |
+
338,/m/02l6bg,"Propeller, airscrew"
|
341 |
+
339,/m/09ct_,"Helicopter"
|
342 |
+
340,/m/0cmf2,"Fixed-wing aircraft, airplane"
|
343 |
+
341,/m/0199g,"Bicycle"
|
344 |
+
342,/m/06_fw,"Skateboard"
|
345 |
+
343,/m/02mk9,"Engine"
|
346 |
+
344,/t/dd00065,"Light engine (high frequency)"
|
347 |
+
345,/m/08j51y,"Dental drill, dentist's drill"
|
348 |
+
346,/m/01yg9g,"Lawn mower"
|
349 |
+
347,/m/01j4z9,"Chainsaw"
|
350 |
+
348,/t/dd00066,"Medium engine (mid frequency)"
|
351 |
+
349,/t/dd00067,"Heavy engine (low frequency)"
|
352 |
+
350,/m/01h82_,"Engine knocking"
|
353 |
+
351,/t/dd00130,"Engine starting"
|
354 |
+
352,/m/07pb8fc,"Idling"
|
355 |
+
353,/m/07q2z82,"Accelerating, revving, vroom"
|
356 |
+
354,/m/02dgv,"Door"
|
357 |
+
355,/m/03wwcy,"Doorbell"
|
358 |
+
356,/m/07r67yg,"Ding-dong"
|
359 |
+
357,/m/02y_763,"Sliding door"
|
360 |
+
358,/m/07rjzl8,"Slam"
|
361 |
+
359,/m/07r4wb8,"Knock"
|
362 |
+
360,/m/07qcpgn,"Tap"
|
363 |
+
361,/m/07q6cd_,"Squeak"
|
364 |
+
362,/m/0642b4,"Cupboard open or close"
|
365 |
+
363,/m/0fqfqc,"Drawer open or close"
|
366 |
+
364,/m/04brg2,"Dishes, pots, and pans"
|
367 |
+
365,/m/023pjk,"Cutlery, silverware"
|
368 |
+
366,/m/07pn_8q,"Chopping (food)"
|
369 |
+
367,/m/0dxrf,"Frying (food)"
|
370 |
+
368,/m/0fx9l,"Microwave oven"
|
371 |
+
369,/m/02pjr4,"Blender"
|
372 |
+
370,/m/02jz0l,"Water tap, faucet"
|
373 |
+
371,/m/0130jx,"Sink (filling or washing)"
|
374 |
+
372,/m/03dnzn,"Bathtub (filling or washing)"
|
375 |
+
373,/m/03wvsk,"Hair dryer"
|
376 |
+
374,/m/01jt3m,"Toilet flush"
|
377 |
+
375,/m/012xff,"Toothbrush"
|
378 |
+
376,/m/04fgwm,"Electric toothbrush"
|
379 |
+
377,/m/0d31p,"Vacuum cleaner"
|
380 |
+
378,/m/01s0vc,"Zipper (clothing)"
|
381 |
+
379,/m/03v3yw,"Keys jangling"
|
382 |
+
380,/m/0242l,"Coin (dropping)"
|
383 |
+
381,/m/01lsmm,"Scissors"
|
384 |
+
382,/m/02g901,"Electric shaver, electric razor"
|
385 |
+
383,/m/05rj2,"Shuffling cards"
|
386 |
+
384,/m/0316dw,"Typing"
|
387 |
+
385,/m/0c2wf,"Typewriter"
|
388 |
+
386,/m/01m2v,"Computer keyboard"
|
389 |
+
387,/m/081rb,"Writing"
|
390 |
+
388,/m/07pp_mv,"Alarm"
|
391 |
+
389,/m/07cx4,"Telephone"
|
392 |
+
390,/m/07pp8cl,"Telephone bell ringing"
|
393 |
+
391,/m/01hnzm,"Ringtone"
|
394 |
+
392,/m/02c8p,"Telephone dialing, DTMF"
|
395 |
+
393,/m/015jpf,"Dial tone"
|
396 |
+
394,/m/01z47d,"Busy signal"
|
397 |
+
395,/m/046dlr,"Alarm clock"
|
398 |
+
396,/m/03kmc9,"Siren"
|
399 |
+
397,/m/0dgbq,"Civil defense siren"
|
400 |
+
398,/m/030rvx,"Buzzer"
|
401 |
+
399,/m/01y3hg,"Smoke detector, smoke alarm"
|
402 |
+
400,/m/0c3f7m,"Fire alarm"
|
403 |
+
401,/m/04fq5q,"Foghorn"
|
404 |
+
402,/m/0l156k,"Whistle"
|
405 |
+
403,/m/06hck5,"Steam whistle"
|
406 |
+
404,/t/dd00077,"Mechanisms"
|
407 |
+
405,/m/02bm9n,"Ratchet, pawl"
|
408 |
+
406,/m/01x3z,"Clock"
|
409 |
+
407,/m/07qjznt,"Tick"
|
410 |
+
408,/m/07qjznl,"Tick-tock"
|
411 |
+
409,/m/0l7xg,"Gears"
|
412 |
+
410,/m/05zc1,"Pulleys"
|
413 |
+
411,/m/0llzx,"Sewing machine"
|
414 |
+
412,/m/02x984l,"Mechanical fan"
|
415 |
+
413,/m/025wky1,"Air conditioning"
|
416 |
+
414,/m/024dl,"Cash register"
|
417 |
+
415,/m/01m4t,"Printer"
|
418 |
+
416,/m/0dv5r,"Camera"
|
419 |
+
417,/m/07bjf,"Single-lens reflex camera"
|
420 |
+
418,/m/07k1x,"Tools"
|
421 |
+
419,/m/03l9g,"Hammer"
|
422 |
+
420,/m/03p19w,"Jackhammer"
|
423 |
+
421,/m/01b82r,"Sawing"
|
424 |
+
422,/m/02p01q,"Filing (rasp)"
|
425 |
+
423,/m/023vsd,"Sanding"
|
426 |
+
424,/m/0_ksk,"Power tool"
|
427 |
+
425,/m/01d380,"Drill"
|
428 |
+
426,/m/014zdl,"Explosion"
|
429 |
+
427,/m/032s66,"Gunshot, gunfire"
|
430 |
+
428,/m/04zjc,"Machine gun"
|
431 |
+
429,/m/02z32qm,"Fusillade"
|
432 |
+
430,/m/0_1c,"Artillery fire"
|
433 |
+
431,/m/073cg4,"Cap gun"
|
434 |
+
432,/m/0g6b5,"Fireworks"
|
435 |
+
433,/g/122z_qxw,"Firecracker"
|
436 |
+
434,/m/07qsvvw,"Burst, pop"
|
437 |
+
435,/m/07pxg6y,"Eruption"
|
438 |
+
436,/m/07qqyl4,"Boom"
|
439 |
+
437,/m/083vt,"Wood"
|
440 |
+
438,/m/07pczhz,"Chop"
|
441 |
+
439,/m/07pl1bw,"Splinter"
|
442 |
+
440,/m/07qs1cx,"Crack"
|
443 |
+
441,/m/039jq,"Glass"
|
444 |
+
442,/m/07q7njn,"Chink, clink"
|
445 |
+
443,/m/07rn7sz,"Shatter"
|
446 |
+
444,/m/04k94,"Liquid"
|
447 |
+
445,/m/07rrlb6,"Splash, splatter"
|
448 |
+
446,/m/07p6mqd,"Slosh"
|
449 |
+
447,/m/07qlwh6,"Squish"
|
450 |
+
448,/m/07r5v4s,"Drip"
|
451 |
+
449,/m/07prgkl,"Pour"
|
452 |
+
450,/m/07pqc89,"Trickle, dribble"
|
453 |
+
451,/t/dd00088,"Gush"
|
454 |
+
452,/m/07p7b8y,"Fill (with liquid)"
|
455 |
+
453,/m/07qlf79,"Spray"
|
456 |
+
454,/m/07ptzwd,"Pump (liquid)"
|
457 |
+
455,/m/07ptfmf,"Stir"
|
458 |
+
456,/m/0dv3j,"Boiling"
|
459 |
+
457,/m/0790c,"Sonar"
|
460 |
+
458,/m/0dl83,"Arrow"
|
461 |
+
459,/m/07rqsjt,"Whoosh, swoosh, swish"
|
462 |
+
460,/m/07qnq_y,"Thump, thud"
|
463 |
+
461,/m/07rrh0c,"Thunk"
|
464 |
+
462,/m/0b_fwt,"Electronic tuner"
|
465 |
+
463,/m/02rr_,"Effects unit"
|
466 |
+
464,/m/07m2kt,"Chorus effect"
|
467 |
+
465,/m/018w8,"Basketball bounce"
|
468 |
+
466,/m/07pws3f,"Bang"
|
469 |
+
467,/m/07ryjzk,"Slap, smack"
|
470 |
+
468,/m/07rdhzs,"Whack, thwack"
|
471 |
+
469,/m/07pjjrj,"Smash, crash"
|
472 |
+
470,/m/07pc8lb,"Breaking"
|
473 |
+
471,/m/07pqn27,"Bouncing"
|
474 |
+
472,/m/07rbp7_,"Whip"
|
475 |
+
473,/m/07pyf11,"Flap"
|
476 |
+
474,/m/07qb_dv,"Scratch"
|
477 |
+
475,/m/07qv4k0,"Scrape"
|
478 |
+
476,/m/07pdjhy,"Rub"
|
479 |
+
477,/m/07s8j8t,"Roll"
|
480 |
+
478,/m/07plct2,"Crushing"
|
481 |
+
479,/t/dd00112,"Crumpling, crinkling"
|
482 |
+
480,/m/07qcx4z,"Tearing"
|
483 |
+
481,/m/02fs_r,"Beep, bleep"
|
484 |
+
482,/m/07qwdck,"Ping"
|
485 |
+
483,/m/07phxs1,"Ding"
|
486 |
+
484,/m/07rv4dm,"Clang"
|
487 |
+
485,/m/07s02z0,"Squeal"
|
488 |
+
486,/m/07qh7jl,"Creak"
|
489 |
+
487,/m/07qwyj0,"Rustle"
|
490 |
+
488,/m/07s34ls,"Whir"
|
491 |
+
489,/m/07qmpdm,"Clatter"
|
492 |
+
490,/m/07p9k1k,"Sizzle"
|
493 |
+
491,/m/07qc9xj,"Clicking"
|
494 |
+
492,/m/07rwm0c,"Clickety-clack"
|
495 |
+
493,/m/07phhsh,"Rumble"
|
496 |
+
494,/m/07qyrcz,"Plop"
|
497 |
+
495,/m/07qfgpx,"Jingle, tinkle"
|
498 |
+
496,/m/07rcgpl,"Hum"
|
499 |
+
497,/m/07p78v5,"Zing"
|
500 |
+
498,/t/dd00121,"Boing"
|
501 |
+
499,/m/07s12q4,"Crunch"
|
502 |
+
500,/m/028v0c,"Silence"
|
503 |
+
501,/m/01v_m0,"Sine wave"
|
504 |
+
502,/m/0b9m1,"Harmonic"
|
505 |
+
503,/m/0hdsk,"Chirp tone"
|
506 |
+
504,/m/0c1dj,"Sound effect"
|
507 |
+
505,/m/07pt_g0,"Pulse"
|
508 |
+
506,/t/dd00125,"Inside, small room"
|
509 |
+
507,/t/dd00126,"Inside, large room or hall"
|
510 |
+
508,/t/dd00127,"Inside, public space"
|
511 |
+
509,/t/dd00128,"Outside, urban or manmade"
|
512 |
+
510,/t/dd00129,"Outside, rural or natural"
|
513 |
+
511,/m/01b9nn,"Reverberation"
|
514 |
+
512,/m/01jnbd,"Echo"
|
515 |
+
513,/m/096m7z,"Noise"
|
516 |
+
514,/m/06_y0by,"Environmental noise"
|
517 |
+
515,/m/07rgkc5,"Static"
|
518 |
+
516,/m/06xkwv,"Mains hum"
|
519 |
+
517,/m/0g12c5,"Distortion"
|
520 |
+
518,/m/08p9q4,"Sidetone"
|
521 |
+
519,/m/07szfh9,"Cacophony"
|
522 |
+
520,/m/0chx_,"White noise"
|
523 |
+
521,/m/0cj0r,"Pink noise"
|
524 |
+
522,/m/07p_0gm,"Throbbing"
|
525 |
+
523,/m/01jwx6,"Vibration"
|
526 |
+
524,/m/07c52,"Television"
|
527 |
+
525,/m/06bz3,"Radio"
|
528 |
+
526,/m/07hvw1,"Field recording"
|
evaluation/metadata/clotho_eval.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
evaluation/metadata/esc50_eval.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
evaluation/metadata/music_eval.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
evaluation/metadata/vggsound_eval.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
losses.py
ADDED
@@ -0,0 +1,17 @@
import torch


def l1(output, target):
    return torch.mean(torch.abs(output - target))


def l1_wav(output_dict, target_dict):
    return l1(output_dict['segment'], target_dict['segment'])


def get_loss_function(loss_type):
    if loss_type == "l1_wav":
        return l1_wav

    else:
        raise NotImplementedError(f"Unsupported loss type: {loss_type}")
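A minimal sketch of how this loss factory is consumed (the tensor shapes are illustrative assumptions):

import torch
from losses import get_loss_function

loss_function = get_loss_function("l1_wav")

# Dummy separated and target waveforms: (batch_size, channels_num, segment_samples)
output_dict = {"segment": torch.randn(4, 1, 32000)}
target_dict = {"segment": torch.randn(4, 1, 32000)}

loss = loss_function(output_dict, target_dict)  # scalar mean absolute error over the waveform
print(loss.item())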
models/CLAP/__init__.py
ADDED
File without changes
models/CLAP/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (158 Bytes).
models/CLAP/open_clip/__init__.py
ADDED
@@ -0,0 +1,25 @@
from .factory import (
    list_models,
    create_model,
    create_model_and_transforms,
    add_model_config,
)
from .loss import ClipLoss, gather_features, LPLoss, lp_gather_features, LPMetrics
from .model import (
    CLAP,
    CLAPTextCfg,
    CLAPVisionCfg,
    CLAPAudioCfp,
    convert_weights_to_fp16,
    trace_model,
)
from .openai import load_openai_model, list_openai_models
from .pretrained import (
    list_pretrained,
    list_pretrained_tag_models,
    list_pretrained_model_tags,
    get_pretrained_url,
    download_pretrained,
)
from .tokenizer import SimpleTokenizer, tokenize
from .transform import image_transform
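For reference, a tiny sketch of the tokenizer exposed by this package; the output shape is an assumption based on the standard open_clip tokenizer, which pads or truncates to a fixed context length:

from models.CLAP.open_clip import tokenize

tokens = tokenize(["a man is speaking", "a dog barking"])
print(tokens.shape)  # expected: (2, context_length) LongTensor of token ids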
models/CLAP/open_clip/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (964 Bytes).
models/CLAP/open_clip/__pycache__/factory.cpython-310.pyc
ADDED
Binary file (6.65 kB).
models/CLAP/open_clip/__pycache__/feature_fusion.cpython-310.pyc
ADDED
Binary file (4.12 kB).
models/CLAP/open_clip/__pycache__/htsat.cpython-310.pyc
ADDED
Binary file (30.8 kB).
models/CLAP/open_clip/__pycache__/loss.cpython-310.pyc
ADDED
Binary file (7.97 kB).
models/CLAP/open_clip/__pycache__/model.cpython-310.pyc
ADDED
Binary file (24.1 kB).
models/CLAP/open_clip/__pycache__/openai.cpython-310.pyc
ADDED
Binary file (4.52 kB).
models/CLAP/open_clip/__pycache__/pann_model.cpython-310.pyc
ADDED
Binary file (13.1 kB).
models/CLAP/open_clip/__pycache__/pretrained.cpython-310.pyc
ADDED
Binary file (5.04 kB).
models/CLAP/open_clip/__pycache__/timm_model.cpython-310.pyc
ADDED
Binary file (3.44 kB).
models/CLAP/open_clip/__pycache__/tokenizer.cpython-310.pyc
ADDED
Binary file (7.36 kB).
models/CLAP/open_clip/__pycache__/transform.cpython-310.pyc
ADDED
Binary file (982 Bytes).
models/CLAP/open_clip/__pycache__/utils.cpython-310.pyc
ADDED
Binary file (10.5 kB).
models/CLAP/open_clip/bert.py
ADDED
@@ -0,0 +1,40 @@
# Note: each of the three blocks below rebinds the module-level `tokenizer` / `model`
# names, so after the whole module is imported all three helpers resolve to the most
# recently loaded pair (BART).
from transformers import BertTokenizer, BertModel

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")
text = "Replace me by any text you'd like."


def bert_embeddings(text):
    # text = "Replace me by any text you'd like."
    encoded_input = tokenizer(text, return_tensors="pt")
    output = model(**encoded_input)
    return output


from transformers import RobertaTokenizer, RobertaModel

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")
text = "Replace me by any text you'd like."


def Roberta_embeddings(text):
    # text = "Replace me by any text you'd like."
    encoded_input = tokenizer(text, return_tensors="pt")
    output = model(**encoded_input)
    return output


from transformers import BartTokenizer, BartModel

tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
model = BartModel.from_pretrained("facebook/bart-base")
text = "Replace me by any text you'd like."


def bart_embeddings(text):
    # text = "Replace me by any text you'd like."
    encoded_input = tokenizer(text, return_tensors="pt")
    output = model(**encoded_input)
    return output
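Illustrative only: each helper above returns the raw Hugging Face model output, and because the module-level names are rebound, the helpers use whichever tokenizer/model pair was loaded last. A sentence-level vector still has to be pooled from that output:

out = bert_embeddings("a dog barking in the distance")
sentence_vec = out.last_hidden_state.mean(dim=1)  # mean-pool over tokens -> (1, hidden_size)
print(sentence_vec.shape)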
models/CLAP/open_clip/bpe_simple_vocab_16e6.txt.gz
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:924691ac288e54409236115652ad4aa250f48203de50a9e4722a6ecd48d6804a
size 1356917
models/CLAP/open_clip/factory.py
ADDED
@@ -0,0 +1,277 @@
import json
import logging
import os
import pathlib
import re
from copy import deepcopy
from pathlib import Path

import torch

from .model import CLAP, convert_weights_to_fp16
from .openai import load_openai_model
from .pretrained import get_pretrained_url, download_pretrained
from .transform import image_transform

_MODEL_CONFIG_PATHS = [Path(__file__).parent / f"model_configs/"]
_MODEL_CONFIGS = {}  # dictionary (model_name: config) of model architecture configs


def _natural_key(string_):
    return [int(s) if s.isdigit() else s for s in re.split(r"(\d+)", string_.lower())]


def _rescan_model_configs():
    global _MODEL_CONFIGS

    config_ext = (".json",)
    config_files = []
    for config_path in _MODEL_CONFIG_PATHS:
        if config_path.is_file() and config_path.suffix in config_ext:
            config_files.append(config_path)
        elif config_path.is_dir():
            for ext in config_ext:
                config_files.extend(config_path.glob(f"*{ext}"))

    for cf in config_files:
        if os.path.basename(cf)[0] == ".":
            continue  # Ignore hidden files

        with open(cf, "r") as f:
            model_cfg = json.load(f)
            if all(a in model_cfg for a in ("embed_dim", "audio_cfg", "text_cfg")):
                _MODEL_CONFIGS[cf.stem] = model_cfg

    _MODEL_CONFIGS = {
        k: v
        for k, v in sorted(_MODEL_CONFIGS.items(), key=lambda x: _natural_key(x[0]))
    }


_rescan_model_configs()  # initial populate of model config registry


def load_state_dict(checkpoint_path: str, map_location="cpu", skip_params=True):
    checkpoint = torch.load(checkpoint_path, map_location=map_location)
    if isinstance(checkpoint, dict) and "state_dict" in checkpoint:
        state_dict = checkpoint["state_dict"]
    else:
        state_dict = checkpoint
    if skip_params:
        if next(iter(state_dict.items()))[0].startswith("module"):
            state_dict = {k[7:]: v for k, v in state_dict.items()}
    # for k in state_dict:
    #     if k.startswith('transformer'):
    #         v = state_dict.pop(k)
    #         state_dict['text_branch.' + k[12:]] = v
    return state_dict


def create_model(
    amodel_name: str,
    tmodel_name: str,
    pretrained: str = "",
    precision: str = "fp32",
    device: torch.device = torch.device("cpu"),
    jit: bool = False,
    force_quick_gelu: bool = False,
    openai_model_cache_dir: str = os.path.expanduser("~/.cache/clip"),
    skip_params=True,
    pretrained_audio: str = "",
    pretrained_text: str = "",
    enable_fusion: bool = False,
    fusion_type: str = "None"
    # pretrained_image: bool = False,
):
    amodel_name = amodel_name.replace(
        "/", "-"
    )  # for callers using old naming with / in ViT names
    pretrained_orig = pretrained
    pretrained = pretrained.lower()
    if pretrained == "openai":
        if amodel_name in _MODEL_CONFIGS:
            logging.info(f"Loading {amodel_name} model config.")
            model_cfg = deepcopy(_MODEL_CONFIGS[amodel_name])
        else:
            logging.error(
                f"Model config for {amodel_name} not found; available models {list_models()}."
            )
            raise RuntimeError(f"Model config for {amodel_name} not found.")

        logging.info(f"Loading pretrained ViT-B-16 text encoder from OpenAI.")
        # Hard Code in model name
        model_cfg["text_cfg"]["model_type"] = tmodel_name
        model = load_openai_model(
            "ViT-B-16",
            model_cfg,
            device=device,
            jit=jit,
            cache_dir=openai_model_cache_dir,
            enable_fusion=enable_fusion,
            fusion_type=fusion_type,
        )
        # See https://discuss.pytorch.org/t/valueerror-attemting-to-unscale-fp16-gradients/81372
        if precision == "amp" or precision == "fp32":
            model = model.float()
    else:
        if amodel_name in _MODEL_CONFIGS:
            logging.info(f"Loading {amodel_name} model config.")
            model_cfg = deepcopy(_MODEL_CONFIGS[amodel_name])
        else:
            logging.error(
                f"Model config for {amodel_name} not found; available models {list_models()}."
            )
            raise RuntimeError(f"Model config for {amodel_name} not found.")

        if force_quick_gelu:
            # override for use of QuickGELU on non-OpenAI transformer models
            model_cfg["quick_gelu"] = True

        # if pretrained_image:
        #     if 'timm_amodel_name' in model_cfg.get('vision_cfg', {}):
        #         # pretrained weight loading for timm models set via vision_cfg
        #         model_cfg['vision_cfg']['timm_model_pretrained'] = True
        #     else:
        #         assert False, 'pretrained image towers currently only supported for timm models'
        model_cfg["text_cfg"]["model_type"] = tmodel_name
        model_cfg["enable_fusion"] = enable_fusion
        model_cfg["fusion_type"] = fusion_type
        model = CLAP(**model_cfg)

        if pretrained:
            checkpoint_path = ""
            url = get_pretrained_url(amodel_name, pretrained)
            if url:
                checkpoint_path = download_pretrained(url, root=openai_model_cache_dir)
            elif os.path.exists(pretrained_orig):
                checkpoint_path = pretrained_orig
            if checkpoint_path:
                logging.info(
                    f"Loading pretrained {amodel_name}-{tmodel_name} weights ({pretrained})."
                )
                ckpt = load_state_dict(checkpoint_path, skip_params=True)
                model.load_state_dict(ckpt)
                param_names = [n for n, p in model.named_parameters()]
                # for n in param_names:
                #     print(n, "\t", "Loaded" if n in ckpt else "Unloaded")
            else:
                logging.warning(
                    f"Pretrained weights ({pretrained}) not found for model {amodel_name}."
                )
                raise RuntimeError(
                    f"Pretrained weights ({pretrained}) not found for model {amodel_name}."
                )

        if pretrained_audio:
            if amodel_name.startswith("PANN"):
                if "Cnn14_mAP" in pretrained_audio:  # official checkpoint
                    audio_ckpt = torch.load(pretrained_audio, map_location="cpu")
                    audio_ckpt = audio_ckpt["model"]
                    keys = list(audio_ckpt.keys())
                    for key in keys:
                        if (
                            "spectrogram_extractor" not in key
                            and "logmel_extractor" not in key
                        ):
                            v = audio_ckpt.pop(key)
                            audio_ckpt["audio_branch." + key] = v
                elif os.path.basename(pretrained_audio).startswith(
                    "PANN"
                ):  # checkpoint trained via HTSAT codebase
                    audio_ckpt = torch.load(pretrained_audio, map_location="cpu")
                    audio_ckpt = audio_ckpt["state_dict"]
                    keys = list(audio_ckpt.keys())
                    for key in keys:
                        if key.startswith("sed_model"):
                            v = audio_ckpt.pop(key)
                            audio_ckpt["audio_branch." + key[10:]] = v
                elif os.path.basename(pretrained_audio).startswith(
                    "finetuned"
                ):  # checkpoint trained via linear probe codebase
                    audio_ckpt = torch.load(pretrained_audio, map_location="cpu")
                else:
                    raise ValueError("Unknown audio checkpoint")
            elif amodel_name.startswith("HTSAT"):
                if "HTSAT_AudioSet_Saved" in pretrained_audio:  # official checkpoint
                    audio_ckpt = torch.load(pretrained_audio, map_location="cpu")
                    audio_ckpt = audio_ckpt["state_dict"]
                    keys = list(audio_ckpt.keys())
                    for key in keys:
                        if key.startswith("sed_model") and (
                            "spectrogram_extractor" not in key
                            and "logmel_extractor" not in key
                        ):
                            v = audio_ckpt.pop(key)
                            audio_ckpt["audio_branch." + key[10:]] = v
                elif os.path.basename(pretrained_audio).startswith(
                    "HTSAT"
                ):  # checkpoint trained via HTSAT codebase
                    audio_ckpt = torch.load(pretrained_audio, map_location="cpu")
                    audio_ckpt = audio_ckpt["state_dict"]
                    keys = list(audio_ckpt.keys())
                    for key in keys:
                        if key.startswith("sed_model"):
                            v = audio_ckpt.pop(key)
                            audio_ckpt["audio_branch." + key[10:]] = v
                elif os.path.basename(pretrained_audio).startswith(
                    "finetuned"
                ):  # checkpoint trained via linear probe codebase
                    audio_ckpt = torch.load(pretrained_audio, map_location="cpu")
                else:
                    raise ValueError("Unknown audio checkpoint")
            else:
                raise ValueError("this audio encoder pretrained checkpoint is not supported")

            model.load_state_dict(audio_ckpt, strict=False)
            logging.info(
                f"Loading pretrained {amodel_name} weights ({pretrained_audio})."
            )
            param_names = [n for n, p in model.named_parameters()]
            for n in param_names:
                print(n, "\t", "Loaded" if n in audio_ckpt else "Unloaded")

    model.to(device=device)
    if precision == "fp16":
        assert device.type != "cpu"
        convert_weights_to_fp16(model)

    if jit:
        model = torch.jit.script(model)

    return model, model_cfg


def create_model_and_transforms(
    model_name: str,
    pretrained: str = "",
    precision: str = "fp32",
    device: torch.device = torch.device("cpu"),
    jit: bool = False,
    force_quick_gelu: bool = False,
    # pretrained_image: bool = False,
):
    model = create_model(
        model_name,
        pretrained,
        precision,
        device,
        jit,
        force_quick_gelu=force_quick_gelu,
        # pretrained_image=pretrained_image
    )
    preprocess_train = image_transform(model.visual.image_size, is_train=True)
    preprocess_val = image_transform(model.visual.image_size, is_train=False)
    return model, preprocess_train, preprocess_val


def list_models():
    """enumerate available model architectures based on config files"""
    return list(_MODEL_CONFIGS.keys())


def add_model_config(path):
    """add model config path or file and update registry"""
    if not isinstance(path, Path):
        path = Path(path)
    _MODEL_CONFIG_PATHS.append(path)
    _rescan_model_configs()
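A hedged sketch of building a CLAP model through this factory. The architecture name, text-encoder name, and checkpoint path below are assumptions: the architecture name must match a JSON file under model_configs/, and the .pt file is the CLAP checkpoint shipped with this Space.

import torch
from models.CLAP.open_clip import create_model

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model, model_cfg = create_model(
    amodel_name='HTSAT-base',      # assumed audio-branch config name
    tmodel_name='roberta',         # assumed text-branch type
    pretrained='checkpoint/music_speech_audioset_epoch_15_esc_89.98.pt',
    precision='fp32',
    device=device,
    enable_fusion=False,
)
model.eval()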
models/CLAP/open_clip/feature_fusion.py
ADDED
@@ -0,0 +1,192 @@
"""
Feature Fusion for Variable-Length Data Processing
AFF/iAFF is referred and modified from https://github.com/YimianDai/open-aff/blob/master/aff_pytorch/aff_net/fusion.py
According to the paper: Yimian Dai et al, Attentional Feature Fusion, IEEE Winter Conference on Applications of Computer Vision, WACV 2021
"""

import torch
import torch.nn as nn


class DAF(nn.Module):
    """
    DirectAddFuse: fuse the two inputs by direct element-wise addition.
    """

    def __init__(self):
        super(DAF, self).__init__()

    def forward(self, x, residual):
        return x + residual


class iAFF(nn.Module):
    """
    iAFF: iterative attentional feature fusion of multiple features.
    """

    def __init__(self, channels=64, r=4, type="2D"):
        super(iAFF, self).__init__()
        inter_channels = int(channels // r)

        if type == "1D":
            # local attention
            self.local_att = nn.Sequential(
                nn.Conv1d(channels, inter_channels, kernel_size=1, stride=1, padding=0),
                nn.BatchNorm1d(inter_channels),
                nn.ReLU(inplace=True),
                nn.Conv1d(inter_channels, channels, kernel_size=1, stride=1, padding=0),
                nn.BatchNorm1d(channels),
            )

            # global attention
            self.global_att = nn.Sequential(
                nn.AdaptiveAvgPool1d(1),
                nn.Conv1d(channels, inter_channels, kernel_size=1, stride=1, padding=0),
                nn.BatchNorm1d(inter_channels),
                nn.ReLU(inplace=True),
                nn.Conv1d(inter_channels, channels, kernel_size=1, stride=1, padding=0),
                nn.BatchNorm1d(channels),
            )

            # second local attention
            self.local_att2 = nn.Sequential(
                nn.Conv1d(channels, inter_channels, kernel_size=1, stride=1, padding=0),
                nn.BatchNorm1d(inter_channels),
                nn.ReLU(inplace=True),
                nn.Conv1d(inter_channels, channels, kernel_size=1, stride=1, padding=0),
                nn.BatchNorm1d(channels),
            )
            # second global attention
            self.global_att2 = nn.Sequential(
                nn.AdaptiveAvgPool1d(1),
                nn.Conv1d(channels, inter_channels, kernel_size=1, stride=1, padding=0),
                nn.BatchNorm1d(inter_channels),
                nn.ReLU(inplace=True),
                nn.Conv1d(inter_channels, channels, kernel_size=1, stride=1, padding=0),
                nn.BatchNorm1d(channels),
            )
        elif type == "2D":
            # local attention
            self.local_att = nn.Sequential(
                nn.Conv2d(channels, inter_channels, kernel_size=1, stride=1, padding=0),
                nn.BatchNorm2d(inter_channels),
                nn.ReLU(inplace=True),
                nn.Conv2d(inter_channels, channels, kernel_size=1, stride=1, padding=0),
                nn.BatchNorm2d(channels),
            )

            # global attention
            self.global_att = nn.Sequential(
                nn.AdaptiveAvgPool2d(1),
                nn.Conv2d(channels, inter_channels, kernel_size=1, stride=1, padding=0),
                nn.BatchNorm2d(inter_channels),
                nn.ReLU(inplace=True),
                nn.Conv2d(inter_channels, channels, kernel_size=1, stride=1, padding=0),
                nn.BatchNorm2d(channels),
            )

            # second local attention
            self.local_att2 = nn.Sequential(
                nn.Conv2d(channels, inter_channels, kernel_size=1, stride=1, padding=0),
                nn.BatchNorm2d(inter_channels),
                nn.ReLU(inplace=True),
                nn.Conv2d(inter_channels, channels, kernel_size=1, stride=1, padding=0),
                nn.BatchNorm2d(channels),
            )
            # second global attention
            self.global_att2 = nn.Sequential(
                nn.AdaptiveAvgPool2d(1),
                nn.Conv2d(channels, inter_channels, kernel_size=1, stride=1, padding=0),
                nn.BatchNorm2d(inter_channels),
                nn.ReLU(inplace=True),
                nn.Conv2d(inter_channels, channels, kernel_size=1, stride=1, padding=0),
                nn.BatchNorm2d(channels),
            )
        else:
            raise ValueError("the type is not supported")

        self.sigmoid = nn.Sigmoid()

    def forward(self, x, residual):
        flag = False
        xa = x + residual
        if xa.size(0) == 1:
            xa = torch.cat([xa, xa], dim=0)
            flag = True
        xl = self.local_att(xa)
        xg = self.global_att(xa)
        xlg = xl + xg
        wei = self.sigmoid(xlg)
        xi = x * wei + residual * (1 - wei)

        xl2 = self.local_att2(xi)
        xg2 = self.global_att(xi)  # note: reuses global_att here; global_att2 is defined above but not used at this step
        xlg2 = xl2 + xg2
        wei2 = self.sigmoid(xlg2)
|
127 |
+
xo = x * wei2 + residual * (1 - wei2)
|
128 |
+
if flag:
|
129 |
+
xo = xo[0].unsqueeze(0)
|
130 |
+
return xo
|
131 |
+
|
132 |
+
|
133 |
+
class AFF(nn.Module):
|
134 |
+
"""
|
135 |
+
多特征融合 AFF
|
136 |
+
"""
|
137 |
+
|
138 |
+
def __init__(self, channels=64, r=4, type="2D"):
|
139 |
+
super(AFF, self).__init__()
|
140 |
+
inter_channels = int(channels // r)
|
141 |
+
|
142 |
+
if type == "1D":
|
143 |
+
self.local_att = nn.Sequential(
|
144 |
+
nn.Conv1d(channels, inter_channels, kernel_size=1, stride=1, padding=0),
|
145 |
+
nn.BatchNorm1d(inter_channels),
|
146 |
+
nn.ReLU(inplace=True),
|
147 |
+
nn.Conv1d(inter_channels, channels, kernel_size=1, stride=1, padding=0),
|
148 |
+
nn.BatchNorm1d(channels),
|
149 |
+
)
|
150 |
+
self.global_att = nn.Sequential(
|
151 |
+
nn.AdaptiveAvgPool1d(1),
|
152 |
+
nn.Conv1d(channels, inter_channels, kernel_size=1, stride=1, padding=0),
|
153 |
+
nn.BatchNorm1d(inter_channels),
|
154 |
+
nn.ReLU(inplace=True),
|
155 |
+
nn.Conv1d(inter_channels, channels, kernel_size=1, stride=1, padding=0),
|
156 |
+
nn.BatchNorm1d(channels),
|
157 |
+
)
|
158 |
+
elif type == "2D":
|
159 |
+
self.local_att = nn.Sequential(
|
160 |
+
nn.Conv2d(channels, inter_channels, kernel_size=1, stride=1, padding=0),
|
161 |
+
nn.BatchNorm2d(inter_channels),
|
162 |
+
nn.ReLU(inplace=True),
|
163 |
+
nn.Conv2d(inter_channels, channels, kernel_size=1, stride=1, padding=0),
|
164 |
+
nn.BatchNorm2d(channels),
|
165 |
+
)
|
166 |
+
self.global_att = nn.Sequential(
|
167 |
+
nn.AdaptiveAvgPool2d(1),
|
168 |
+
nn.Conv2d(channels, inter_channels, kernel_size=1, stride=1, padding=0),
|
169 |
+
nn.BatchNorm2d(inter_channels),
|
170 |
+
nn.ReLU(inplace=True),
|
171 |
+
nn.Conv2d(inter_channels, channels, kernel_size=1, stride=1, padding=0),
|
172 |
+
nn.BatchNorm2d(channels),
|
173 |
+
)
|
174 |
+
else:
|
175 |
+
raise f"the type is not supported."
|
176 |
+
|
177 |
+
self.sigmoid = nn.Sigmoid()
|
178 |
+
|
179 |
+
def forward(self, x, residual):
|
180 |
+
flag = False
|
181 |
+
xa = x + residual
|
182 |
+
if xa.size(0) == 1:
|
183 |
+
xa = torch.cat([xa, xa], dim=0)
|
184 |
+
flag = True
|
185 |
+
xl = self.local_att(xa)
|
186 |
+
xg = self.global_att(xa)
|
187 |
+
xlg = xl + xg
|
188 |
+
wei = self.sigmoid(xlg)
|
189 |
+
xo = 2 * x * wei + 2 * residual * (1 - wei)
|
190 |
+
if flag:
|
191 |
+
xo = xo[0].unsqueeze(0)
|
192 |
+
return xo
|
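For reference, here is a minimal sketch (not part of the commit) of how these fusion blocks could be exercised on dummy feature maps; the tensor shapes are arbitrary and the import path assumes this repo's package layout.

import torch
from models.CLAP.open_clip.feature_fusion import DAF, AFF, iAFF

# two feature maps of identical shape (batch, channels, height, width)
x = torch.randn(4, 64, 16, 16)
residual = torch.randn(4, 64, 16, 16)

fuse_add = DAF()                              # plain element-wise addition
fuse_aff = AFF(channels=64, r=4, type="2D")   # single attention-weighted fusion
fuse_iaff = iAFF(channels=64, r=4, type="2D") # iterative (two-stage) fusion

for fuse in (fuse_add, fuse_aff, fuse_iaff):
    fuse.eval()                               # BatchNorm in eval mode for a quick check
    with torch.no_grad():
        out = fuse(x, residual)
    print(type(fuse).__name__, tuple(out.shape))  # output keeps the input shape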