Spaces:

bpiyush
/

SoundOfWater

Running

+"""Defines the audio model for pitch estimation."""
+import torch
+import torch.nn as nn
+import einops
+import math
+import numpy as np
+import einops
+import pytorch_lightning as pl
+import shared.utils as su
class TimeEncodingDiscreteSinusoidal(nn.Module):
    """Sinusoidal encoding of timestamps discretised at a fixed frame rate.

    Each timestamp (in seconds) is first mapped to an integer frame index
    ``int(t * rate)`` and that index is then encoded with interleaved
    sine/cosine features, scaled by ``scale_factor``.
    """

    def __init__(self, d, v=10000, rate=49, scale_factor=0.01):
        """
        Args:
            d (int): Dimension of the encoding (even or odd)
            v (float): base of the geometric frequency progression
            rate (int): discretisation rate (frames per second)
                this means that each [1/49.] of a second will be
                encoded with a unique vector
            scale_factor (float): multiplier applied to the final encoding
        """
        super().__init__()
        self.d = d
        self.rate = rate
        self.v = v
        self.scale_factor = scale_factor

    def forward(self, t):
        """
        Takes in timestamps t (seconds) and outputs vectors that represent these.

        Args:
            t (torch.tensor): time stamps in seconds, [B, N]

        Returns:
            torch.tensor: encodings of shape [B, N, d] on the device of `t`
        """
        B, N = t.shape

        # Discretise time: truncate to the index of the 1/rate-second bin
        i = (t * self.rate).to(torch.long)

        pe = torch.zeros(B, N, self.d).to(t.device)
        div_term = torch.exp(
            (torch.arange(0, self.d, 2, dtype=torch.float) * -(math.log(self.v) / self.d))
        )
        div_term = div_term.to(t.device)

        # Shared argument of the sine and cosine features: [B, N, ceil(d / 2)]
        angles = i[:, :, None].float() * div_term
        pe[:, :, 0::2] = torch.sin(angles)
        # For an odd `d` there is one fewer cosine column than sine columns,
        # so only the first d // 2 frequencies are used here.
        pe[:, :, 1::2] = torch.cos(angles[:, :, : self.d // 2])

        return pe * self.scale_factor
class Wav2Vec2WithTimeEncoding(nn.Module):
    """Wav2Vec 2.0 backbone whose CNN features are augmented with a time encoding.

    The encoding of each frame's absolute timestamp is added to the output of
    the convolutional feature extractor before the feature projection and the
    transformer encoder are applied.
    """

    def __init__(
            self, model_name="facebook/wav2vec2-base-960h", use_time=True,
            d=512, v=10000, rate=49, scale_factor=0.01, layer_norm=False,
        ):
        """
        Args:
            model_name (str): Hugging Face identifier of the Wav2Vec 2.0 model
            use_time (bool): whether to add the time encoding to CNN features
            d (int): dimension of the time encoding (and of the optional
                layer norm); it is added to the CNN features, so it is
                assumed to equal their channel dimension
            v (float): frequency base of the sinusoidal encoding
            rate (int): discretisation rate of the time encoding (per second)
            scale_factor (float): multiplier applied to the time encoding
            layer_norm (bool): apply a LayerNorm(d) after adding the encoding
        """
        super().__init__()

        su.log.print_update(
            " [:::] Loading backbone Wav2Vec 2.0 ",
            pos="left",
            fillchar=".",
            color="cyan",
        )

        # Load pre-trained Wav2Vec 2.0 model
        from transformers import Wav2Vec2Model
        self.net = Wav2Vec2Model.from_pretrained(model_name)

        self.d = d
        self.v = v
        self.rate = rate
        self.sr = 16000
        self.use_time = use_time

        if self.use_time:
            self.time_encoding = TimeEncodingDiscreteSinusoidal(
                d=d, v=v, rate=rate, scale_factor=scale_factor,
            )
        else:
            print(" [:::] Not using time encoding.")
            self.time_encoding = None

        # Optional layer norm applied to the (time-encoded) CNN features
        self.layer_norm = nn.LayerNorm(d) if layer_norm else nn.Identity()

    def forward(self, x, t):
        """
        Args:
            x (torch.tensor): audio input, [B, NC, C, NS],
                NC: n.o. clips, NS: n.o. samples
            t (torch.tensor): time stamps in seconds, [B, NC, 2],
                start and end times for each clip

        Returns:
            torch.tensor: encoder features of shape [B, NC, F, D], where F is
                the number of frames produced by the CNN feature extractor
        """
        B, T, C, NS = x.shape
        assert C == 1, "Require a single-channel input."
        assert t.shape[1] == T, \
            "Number of timestamps should match number of clips."
        assert t.shape[0] == B, \
            "Batch size should match."
        assert t.shape[2] == 2, \
            "Timestamps should have start and end times."

        # Flatten batch and clip axes so that every clip is one sequence
        x = einops.rearrange(x, "B T 1 NS -> (B T) NS")
        t = einops.rearrange(t, "B T L -> (B T) L")

        # This forward is based on Huggingface's implementation of Wave2Vec2
        # https://github.com/huggingface/transformers/blob/main/src/
        # transformers/models/wav2vec2/modeling_wav2vec2.py

        # Encode through the CNN
        extract_features = self.net.feature_extractor(x)
        extract_features = extract_features.transpose(1, 2)

        if self.use_time:
            # Dense per-frame timestamps for EVERY flattened clip. `t` now has
            # B * T rows, so iterate over its rows rather than over range(B);
            # otherwise clips beyond the first B rows get no (or the wrong)
            # timestamps.
            NF = extract_features.shape[1]
            t_dense = torch.stack(
                [
                    torch.linspace(float(start), float(end), NF)
                    for start, end in t
                ]
            ).to(extract_features.device)

            # Add time encoding to the features
            t_dense_enc = self.time_encoding(t_dense)
            extract_features = extract_features + t_dense_enc

        # Apply layer norm (identity unless layer_norm=True)
        extract_features = self.layer_norm(extract_features)

        # Project into the feature space
        hidden_states, extract_features = self.net.feature_projection(
            extract_features
        )

        # Pass through the transformer encoder
        encoder_outputs = self.net.encoder(
            hidden_states,
            attention_mask=None,
            output_attentions=False,
            output_hidden_states=False,
            return_dict=True,
        )
        z = encoder_outputs[0]

        # Restore the separate batch and clip axes
        z = einops.rearrange(z, "(B T) F D -> B T F D", B=B, T=T)
        return z
def recursive_attr(module, attr):
    """Resolve a dotted attribute path such as ``"a.b.c"`` starting at `module`.

    Args:
        module: object on which the lookup starts
        attr (str): attribute name, optionally with dot-separated nesting

    Returns:
        The object found at the end of the attribute chain.
    """
    target = module
    for name in attr.split("."):
        target = getattr(target, name)
    return target
class WavelengthWithTime(pl.LightningModule):
    """Lightning module that predicts axial/radial distributions per audio frame.

    A (partially frozen) backbone produces frame features of shape
    [B, T, F, D]; optional hidden layers and one linear head per enabled
    target map them to `axial_bins` / `radial_bins` outputs.
    """

    def __init__(
            self,
            backbone,
            feat_dim=768,
            axial=True,
            axial_bins=512,
            radial=True,
            radial_bins=512,
            freeze_backbone=True,
            train_backbone_modules=None,
            prediction_head_hidden=None,
            act="softmax",
            criterion="kl_div",
            cfg_opt=None,
        ):
        """
        Args:
            backbone (nn.Module): feature extractor exposing `net.encoder.layers`
                and `layer_norm` (e.g. Wav2Vec2WithTimeEncoding)
            feat_dim (int): dimension of the backbone features
            axial (bool): whether to add the axial prediction head
            axial_bins (int): number of outputs of the axial head
            radial (bool): whether to add the radial prediction head
            radial_bins (int): number of outputs of the radial head
            freeze_backbone (bool): freeze all backbone parameters first
            train_backbone_modules (list): indices of encoder layers that are
                unfrozen again; defaults to [10, 11]
            prediction_head_hidden (list): sizes of hidden Linear+ReLU layers
                inserted before the heads; defaults to no hidden layers
            act (str): "softmax" applies a softmax over bins, anything else
                leaves the head outputs unchanged
            criterion (str): "kl_div" or "ce"
            cfg_opt (dict): optimiser config with keys `name` and `args`;
                defaults to Adam with lr=1e-4
        """
        super().__init__()

        # Resolve None sentinels (avoids shared mutable defaults). This is done
        # before save_hyperparameters() so that the effective values are stored.
        if train_backbone_modules is None:
            train_backbone_modules = [10, 11]
        if prediction_head_hidden is None:
            prediction_head_hidden = []
        if cfg_opt is None:
            cfg_opt = dict(name="Adam", args=dict(lr=1e-4))

        su.log.print_update(
            " [:::] Loading model WavelengthWithTime ",
            color="cyan",
            pos="left",
            fillchar=".",
        )

        # By default, freeze the entire backbone
        if freeze_backbone:
            self.freeze(backbone)

        # Unfreeze specific encoder layers. The argument itself is NOT rebound
        # to the layer modules, so save_hyperparameters() below records the
        # layer indices rather than nn.Module objects.
        for layer_index in train_backbone_modules:
            self.unfreeze(backbone.net.encoder.layers[int(layer_index)])

        # Make the layer norm in backbone trainable
        print("[>>>] Unfreezing layer norm in backbone")
        for param in backbone.layer_norm.parameters():
            param.requires_grad = True

        su.misc.num_trainable_params(backbone)

        self.backbone = backbone
        self.feat_dim = feat_dim

        # Add some intermediate layers before prediction heads
        head_in_dim = feat_dim
        if len(prediction_head_hidden) > 0:
            layers = []
            for hidden_dim in prediction_head_hidden:
                layers.append(nn.Linear(head_in_dim, hidden_dim))
                layers.append(nn.ReLU())
                head_in_dim = hidden_dim
            self.intermediate_layers = nn.Sequential(*layers)
        else:
            self.intermediate_layers = torch.nn.Identity()
        su.misc.num_trainable_params(self.intermediate_layers)

        assert axial or radial, \
            "At least one of axial or radial heads must be enabled."

        # Define axial head
        self.axial_head = None
        if axial:
            self.axial_head = nn.Linear(head_in_dim, axial_bins)
            su.misc.num_trainable_params(self.axial_head)

        # Define radial head
        self.radial_head = None
        if radial:
            self.radial_head = nn.Linear(head_in_dim, radial_bins)
            su.misc.num_trainable_params(self.radial_head)

        self.act = torch.nn.Softmax(dim=-1) if act == "softmax" else torch.nn.Identity()

        # Set criterion
        self.define_criterion(criterion)

        # Define optimization config
        self.cfg_opt = cfg_opt

        # Save hyperparameters
        self.save_hyperparameters(ignore=["backbone"])

    def freeze_backbone(self):
        """Disable gradients for every parameter of `self.backbone`."""
        for param in self.backbone.parameters():
            param.requires_grad = False

    def define_criterion(self, criterion):
        """Set `self.criterion` from its name ("kl_div" or "ce").

        Raises:
            NotImplementedError: for any other criterion name
        """
        if criterion == "kl_div":
            # NOTE(review): constructed with PyTorch's default reduction;
            # confirm that this (rather than "batchmean") is intended.
            self.criterion = nn.KLDivLoss()
        elif criterion == "ce":
            self.criterion = nn.CrossEntropyLoss()
        else:
            raise NotImplementedError(f"Criterion {criterion} not implemented.")

    def freeze(self, net):
        """Disable gradients for every parameter of `net`."""
        for p in net.parameters():
            p.requires_grad = False

    def unfreeze(self, module):
        """Enable gradients for every parameter of `module` (and log it)."""
        module_name = type(module).__name__
        print(f"[>>>] Unfreezing {module_name}")
        for p in module.parameters():
            p.requires_grad = True

    def forward(self, x, t):
        """
        Args:
            x (torch.Tensor): [B, T, C, NS], T: n.o. clips
            t (torch.Tensor): [B, T, 2], clip start and end times

        Returns:
            dict: head name ("axial" / "radial") -> activated head output
        """
        z = self.backbone.forward(x, t)

        # Intermediate layers
        h = self.intermediate_layers(z)

        # Prediction heads
        y_pred = dict()
        if self.axial_head is not None:
            y_pred["axial"] = self.act(self.axial_head(h))
        if self.radial_head is not None:
            y_pred["radial"] = self.act(self.radial_head(h))
        return y_pred

    def compute_loss(self, y_pred: dict, y_true: dict):
        """Compute one loss per predicted head plus their equally weighted sum.

        Args:
            y_pred (dict): head name -> prediction, as returned by `forward`
            y_true (dict): head name -> target laid out as "b t d f"; it is
                rearranged to "b t f d" to match the prediction

        Returns:
            dict: per-head losses and their weighted sum under key "total"
        """
        loss = dict()
        total_loss = 0.
        for key in y_pred:
            yt = y_true[key]
            yt = einops.rearrange(yt, "b t d f -> b t f d")
            yp = y_pred[key]
            if isinstance(self.criterion, nn.KLDivLoss):
                # Need to pass log to the loss function if it is KLDivLoss
                yp = yp.log()
                loss[key] = self.criterion(yp, yt)
            elif isinstance(self.criterion, nn.CrossEntropyLoss):
                yp = einops.rearrange(yp, "b t f d -> (b t f) d")
                yt = einops.rearrange(yt, "b t f d -> (b t f) d")
                loss[key] = self.criterion(yp, yt)
            else:
                raise NotImplementedError(f"Criterion {self.criterion} not implemented.")

            # For now, using hardcoded loss weights of 1/K where K is number of losses
            total_loss += loss[key] / len(y_pred)
        loss["total"] = total_loss
        return loss

    def step(self, batch, mode, log=True):
        """Run a forward pass on `batch`, compute and optionally log the loss.

        Args:
            batch (dict): must provide "audio_clips", "clips", "targets"
                and "metadata"
            mode (str): tag used in the logged metric name (e.g. "train")
            log (bool): whether to log the total loss

        Returns:
            The total loss.
        """
        x = batch["audio_clips"]
        t = batch["clips"]
        y_true = {**batch["targets"], **batch["metadata"]}
        y_pred = self.forward(x, t)
        losses = self.compute_loss(y_pred, y_true)
        loss = losses["total"]
        if log:
            self.log(f"batch/{mode}/loss_net", loss, prog_bar=True, sync_dist=True)
        return loss

    def training_step(self, batch, batch_idx):
        """Lightning hook: loss of one training batch."""
        return self.step(batch, "train")

    def validation_step(self, batch, batch_idx):
        """Lightning hook: loss of one validation batch."""
        return self.step(batch, "valid")

    def configure_optimizers(self):
        """Instantiate the `torch.optim` optimiser described by `self.cfg_opt`."""
        function = getattr(torch.optim, self.cfg_opt["name"])
        optimizer = function(self.parameters(), **self.cfg_opt["args"])
        return optimizer
if __name__ == "__main__":
    # Smoke test: runs the backbone and the full model on a sample audio file
    # and saves t-SNE plots of the features before/after loading a checkpoint.
    import os

    # Test backbone
    backbone = Wav2Vec2WithTimeEncoding()
    su.misc.num_params(backbone)

    # Test on a real audio clip
    path = "./media_assets/pouring_water_in_a_glass.wav"
    import torchaudio
    waveform, sr = torchaudio.load(path)
    waveform = torchaudio.functional.resample(waveform, sr, 16000)
    sr = 16000
    waveform = waveform.mean(dim=0, keepdim=True)

    # Forward pass a segment [s, e] (seconds) of the audio
    from transformers import Wav2Vec2Processor
    model_name = "facebook/wav2vec2-base-960h"
    processor = Wav2Vec2Processor.from_pretrained(model_name)
    s, e = 8, 22
    x = processor(
        waveform[:, int(s*sr):int(e*sr)], sampling_rate=16000, return_tensors="pt",
    ).input_values.unsqueeze(0)
    t = torch.tensor([[s, e]]).unsqueeze(0)
    z = backbone(x, t)

    # Let's look at the tsne
    z_flat = einops.rearrange(z, "B T F D -> (B T F) D")
    import matplotlib.pyplot as plt
    # Add serif
    plt.rcParams["font.family"] = "serif"
    su.visualize.show_temporal_tsne(z_flat.detach().numpy(), show=False)
    plt.savefig("./media_assets/tsne.png")
    plt.close()

    # Test model
    cfg_model = {
        "name": "WavelengthWithTime",
        "args": {
            "axial": True,
            "axial_bins": 64,
            "radial": True,
            "radial_bins": 64,
            "freeze_backbone": True,
            "train_backbone_modules": [6, 7, 8, 9, 10, 11],
            "act": "softmax",
            "criterion": "kl_div",
        }
    }
    # Look the class up by name among this module's globals instead of
    # evaluating the name as code.
    model_cls = globals()[cfg_model["name"]]
    model = model_cls(backbone=backbone, **cfg_model["args"])
    su.misc.num_trainable_params(model)

    # Load pre-trained checkpoint
    ckpt_dir = "/work/piyush/pretrained_checkpoints/SoundOfWater"
    ckpt_path = os.path.join(
        ckpt_dir,
        "dsr9mf13_ep100_step12423_real_finetuned_with_cosupervision.pth",
    )
    assert os.path.exists(ckpt_path), \
        f"Checkpoint not found at {ckpt_path}."
    print("Loading checkpoint from: ", ckpt_path)
    ckpt = torch.load(ckpt_path, map_location="cpu")
    msg = model.load_state_dict(ckpt)
    print(msg)

    # Check forward pass
    x_random = torch.randn(2, 1, 1, 16000)
    t_random = torch.tensor([[[0, 1]], [[2, 3]]])
    y_pred = model(x_random, t_random)
    print("Input: ", x_random.shape)
    for key in y_pred:
        print(key, y_pred[key].shape)

    # Plot features with the trained backbone and save as tsne_trained.png
    z = model.backbone(x, t)
    z_flat = einops.rearrange(z, "B T F D -> (B T F) D")
    su.visualize.show_temporal_tsne(z_flat.detach().numpy(), show=False)
    plt.savefig("./media_assets/tsne_trained.png")
    plt.close()

sound_of_water/cosupervision/README.md ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ This folder contains code for co-supervising the audio pitch detection network using
2	+ the visual height detection network.

sound_of_water/data/__pycache__/audio_loader.cpython-39.pyc ADDED Viewed

Binary file (12.8 kB). View file

sound_of_water/data/__pycache__/audio_transforms.cpython-39.pyc ADDED Viewed

Binary file (5.45 kB). View file

sound_of_water/data/__pycache__/csv_loader.cpython-39.pyc ADDED Viewed

Binary file (3.32 kB). View file

sound_of_water/data/audio_loader.py ADDED Viewed

	@@ -0,0 +1,646 @@

+"""Audio loading utils."""
+import os
+import numpy as np
+import torch
+import torchaudio
+import decord
+import librosa
+import einops
+import PIL
+import matplotlib.pyplot as plt
+# Add serif font
+plt.rcParams['font.family'] = 'serif'
+from PIL import Image, ImageOps
+import librosa.display
+import shared.utils as su
def read_info(path):
    """
    Probes a media file and returns its first audio and first video stream.

    Args:
        path (str): path to the audio file

    Returns:
        dict: keys `video` and `audio`, each holding the first stream of that
            codec type reported by ffmpeg.probe, or None if there is none
    """
    import ffmpeg

    def first_stream(streams, codec_type):
        # First stream whose codec type matches, None when absent
        for stream in streams:
            if stream['codec_type'] == codec_type:
                return stream
        return None

    streams = ffmpeg.probe(path)['streams']
    return {
        "video": first_stream(streams, 'video'),
        "audio": first_stream(streams, 'audio'),
    }
def load_audio_clips(
        audio_path,
        clips,
        sr,
        clip_len,
        backend='decord',
        load_entire=False,
        cut_to_clip_len=True,
    ):
    """
    Loads audio clips from the given audio file.

    Args:
        audio_path (str): path to the audio file
        clips (np.ndarray): sized [T, 2], where T is the number of clips
            and each row is a pair of start and end times of the clip
            (ignored when load_entire=True)
        sr (int): sample rate
        clip_len (float): length of the audio clip in seconds; if None,
            clips are neither padded nor cut
        backend (str): backend to use for loading audio clips,
            'torchaudio' or 'decord'
        load_entire (bool): whether to load the entire audio file
        cut_to_clip_len (bool): whether to cut the audio clip to clip_len

    Returns:
        list: mono waveform tensors, one per clip (a single element holding
            the whole audio when load_entire=True)

    Raises:
        ValueError: if `backend` is not supported
    """

    def to_mono(wave):
        # Average the channels of a multi-channel waveform
        if wave.shape[0] > 1:
            return wave.mean(dim=0, keepdim=True)
        return wave

    if backend == 'torchaudio':
        audio_info = read_info(audio_path)["audio"]
        true_sr = int(audio_info["sample_rate"])
        true_nf = audio_info["duration_ts"]
        audio_duration = true_nf / true_sr
        # Built once and reused for every clip
        resampler = torchaudio.transforms.Resample(true_sr, sr)
    elif backend == "decord":
        ar = decord.AudioReader(audio_path, sample_rate=sr, mono=True)
        # Mono=False gives NaNs in inputs.
        # This (https://gist.github.com/nateraw/fcc2bdb9c8738224957c8617c3360445) might
        # be a related issue. Ignoring for now. Need to use torchaudio for now.
        true_nf = ar.shape[1]
        audio_duration = true_nf / sr
    else:
        raise ValueError(f"Unknown backend: {backend}")

    if load_entire:
        # Load the entire audio as a single clip and return
        if backend == 'torchaudio':
            y, _ = torchaudio.load(audio_path)
            audio = resampler(to_mono(y))
        else:
            audio = ar.get_batch(np.arange(true_nf)).asnumpy()
            audio = torch.from_numpy(audio)
        return [audio]

    # Clip the clips to avoid going out of bounds
    clips = np.clip(clips, 0, audio_duration)

    # Target number of samples per clip (None disables padding/cutting)
    nf_reqd = None if clip_len is None else int(clip_len * sr)

    audio_clips = []
    for st, et in clips:
        # Load audio within the given time range
        if backend == 'torchaudio':
            sf = max(int(true_sr * st), 0)
            ef = min(int(true_sr * et), true_nf)
            y, _ = torchaudio.load(
                audio_path, frame_offset=sf, num_frames=ef - sf,
            )
            # Stereo to mono, then resample to the given sample rate
            audio = resampler(to_mono(y))
        else:
            # decord already returns mono audio at sample rate `sr`
            sf = max(int(st * sr), 0)
            ef = min(int(et * sr), true_nf)
            audio = ar.get_batch(np.arange(sf, ef)).asnumpy()
            audio = torch.from_numpy(audio)

        if nf_reqd is not None:
            nf_curr = audio.shape[1]
            if nf_curr < nf_reqd:
                # Zero-pad on the right up to clip_len
                audio = torch.nn.functional.pad(audio, (0, nf_reqd - nf_curr))
            elif (nf_curr > nf_reqd) and cut_to_clip_len:
                audio = audio[:, :nf_reqd]

        audio_clips.append(audio)
    return audio_clips
def show_audio_clips_waveform(
        audio_clips, clips, title=None, show=True, figsize=(10, 2),
    ):
    """
    Visualizes the waveforms of the given audio clips, one subplot per clip.

    Args:
        audio_clips (list): list of audio clips (tensors)
        clips (np.ndarray): sized [T, 2], start and end time of each clip
        title (str): title of the plot (accepted but currently not drawn)
        show (bool): whether to show the clips; if False, the figure is
            saved to 'audio_clips_waveform.png' instead
        figsize (tuple): figure size
    """
    clip_centers = (clips[:, 0] + clips[:, 1]) / 2

    fig, ax = plt.subplots(1, len(audio_clips), figsize=figsize)
    if len(audio_clips) == 1:
        ax = [ax]

    for i, audio in enumerate(audio_clips):
        # The samples of a clip span exactly [start, end]. (Using
        # centre -/+ duration would stretch the axis to twice the clip length.)
        timestamps = np.linspace(clips[i][0], clips[i][1], audio.shape[-1])
        ax[i].plot(timestamps, audio.squeeze().numpy(), alpha=0.5)
        ax[i].set_title(f'$t=$ {clip_centers[i]:.2f}')
        ax[i].grid(alpha=0.4)

    plt.tight_layout()
    if show:
        plt.show()
    else:
        plt.savefig('audio_clips_waveform.png')
+# TODO: preprocess audio clips (e.g., wav-to-spectrogram, etc.)
+# Note that this is different from transforms applied as augmentation
+# during training. This is more like a preprocessing step that is applied
+# to the entire audio before sampling the clips.
+import torchaudio.functional as TAF
+import torchaudio.transforms as TAT
def load_audio(path, sr=16000, **kwargs):
    """Load an audio file with torchaudio as a mono waveform resampled to `sr`.

    Args:
        path (str): path to the audio file
        sr (int): target sample rate
        **kwargs: forwarded to torchaudio.load

    Returns:
        tuple: (waveform with a single channel row, sr)
    """
    waveform, native_sr = torchaudio.load(path, **kwargs)
    # Average over channels, keeping the channel axis
    mono = waveform.mean(dim=0, keepdim=True)
    to_target_rate = torchaudio.transforms.Resample(native_sr, sr)
    return to_target_rate(mono), sr
def load_audio_librosa(path, sr=16000, **kwargs):
    """Load an audio file with librosa at sample rate `sr`.

    Args:
        path (str): path to the audio file
        sr (int): sample rate passed to librosa.load
        **kwargs: forwarded to librosa.load

    Returns:
        tuple: (tensor of the samples with a leading axis added, sr)
    """
    samples, _ = librosa.load(path, sr=sr, **kwargs)
    waveform = torch.from_numpy(samples).unsqueeze(0)
    return waveform, sr
def librosa_harmonic_spectrogram_db(
        y, sr=16000, n_fft=512, hop_length=256, margin=16., n_mels=64,
    ):
    """Compute a dB spectrogram of the harmonic component of a waveform.

    Args:
        y (torch.Tensor or np.ndarray): waveform; a 2D input is averaged
            over its first axis
        sr (int): sample rate (used for the mel projection)
        n_fft (int): number of FFT points
        hop_length (int): hop length
        margin (float): margin for harmonic-percussive source separation
        n_mels (int): number of mel bands; if None, the dB spectrogram
            (relative to its maximum) is returned without mel projection

    Returns:
        np.ndarray: the (mel-scaled) dB spectrogram
    """
    signal = y.numpy() if isinstance(y, torch.Tensor) else y
    if len(signal.shape) == 2:
        signal = signal.mean(axis=0)

    # center=True outputs 1 more frame than center=False
    # Currently, using just center=False
    stft = librosa.stft(
        signal, n_fft=n_fft, hop_length=hop_length, center=False,
    )
    # Only the harmonic part is kept; the percussive part is discarded
    harmonic, _ = librosa.decompose.hpss(stft, margin=margin)
    harmonic_amplitude = np.sqrt(2) * np.abs(harmonic)

    if n_mels is None:
        # Usual dB spectrogram
        return librosa.amplitude_to_db(harmonic_amplitude, ref=np.max)

    # Mel-scaled dB spectrogram
    harmonic_db = librosa.amplitude_to_db(harmonic_amplitude)
    return librosa.feature.melspectrogram(S=harmonic_db, n_mels=n_mels, sr=sr)
def show_logmelspectrogram(
        S,
        sr,
        n_fft=512,
        hop_length=256,
        figsize=(10, 3),
        ax=None,
        show=True,
        title="LogMelSpectrogram",
        xlabel="Time (s)",
        ylabel="Mel bins (Hz)",
        return_as_image=False,
    ):
    """Plot a mel spectrogram with librosa and optionally return it as an image.

    Args:
        S (np.ndarray): spectrogram to display
        sr (int): sample rate
        n_fft (int): number of FFT points
        hop_length (int): hop length
        figsize (tuple): size of the figure created when `ax` is None
        ax (matplotlib.axes.Axes): axes to draw on; a new figure is
            created if None
        show (bool): whether to call plt.show() (ignored when
            return_as_image=True)
        title (str): axes title
        xlabel (str): x-axis label
        ylabel (str): y-axis label
        return_as_image (bool): render the figure, close it and return it
            as an RGB PIL image

    Returns:
        PIL.Image.Image or None: the rendered figure if return_as_image
    """
    if ax is None:
        fig, ax = plt.subplots(1, 1, figsize=figsize)
    else:
        # Needed below when rendering a caller-supplied axes to an image
        fig = ax.figure

    librosa.display.specshow(
        S,
        sr=sr,
        hop_length=hop_length,
        n_fft=n_fft,
        y_axis='mel',
        x_axis='time',
        ax=ax,
        auto_aspect=True,
    )
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)

    if return_as_image:
        canvas = fig.canvas
        canvas.draw()
        if hasattr(canvas, "tostring_rgb"):
            image = PIL.Image.frombytes(
                'RGB', canvas.get_width_height(), canvas.tostring_rgb(),
            )
        else:
            # Newer matplotlib versions no longer provide tostring_rgb;
            # read the RGBA buffer and drop the alpha channel instead.
            image = PIL.Image.fromarray(
                np.asarray(canvas.buffer_rgba())
            ).convert("RGB")
        plt.close(fig)
        return image

    if show:
        plt.show()
def show_logspectrogram(
        S, sr, n_fft=512, hop_length=256, figsize=(10, 3), ax=None, show=True,
    ):
    """Plot a spectrogram on a linear frequency axis with librosa.

    Args:
        S (np.ndarray): spectrogram to display
        sr (int): sample rate
        n_fft (int): number of FFT points
        hop_length (int): hop length
        figsize (tuple): size of the figure created when `ax` is None
        ax (matplotlib.axes.Axes): axes to draw on (created if None)
        show (bool): whether to call plt.show()
    """
    if ax is None:
        _, ax = plt.subplots(1, 1, figsize=figsize)

    display_kwargs = dict(
        sr=sr,
        hop_length=hop_length,
        n_fft=n_fft,
        y_axis='linear',
        x_axis='time',
        ax=ax,
    )
    librosa.display.specshow(S, **display_kwargs)
    ax.set_title("LogSpectrogram")

    if show:
        plt.show()
def audio_clips_wav_to_spec(
        audio_clips, n_fft=512, hop_length=256, margin=16., n_mels=None,
    ):
    """
    Turns each waveform clip into a harmonic dB spectrogram tensor.

    Args:
        audio_clips (list): list of audio clips
        n_fft (int): number of FFT points
        hop_length (int): hop length
        margin (float): margin for harmonic-percussive source separation
        n_mels (int): number of mel bands (optional, if None, then dB spectrogram is returned)

    Returns:
        list: one tensor per clip, each with a leading axis of size 1
    """
    spec_kwargs = dict(
        n_fft=n_fft, hop_length=hop_length, margin=margin, n_mels=n_mels,
    )
    return [
        torch.from_numpy(
            librosa_harmonic_spectrogram_db(clip, **spec_kwargs)
        ).unsqueeze(0)
        for clip in audio_clips
    ]
def show_audio_clips_spec(
        audio_specs,
        clips,
        sr,
        n_fft=512,
        hop_length=256,
        margin=16.,
        cmap='magma',
        n_mels=None,
        show=True,
    ):
    """
    Visualizes the spectrograms of the given audio clips, one subplot per clip.

    Args:
        audio_specs (list): list of audio spectrograms
        clips (np.ndarray): sized [T, 2], where T is the number of clips
            and each row is a pair of start and end times of the clip
        sr (int): sample rate
        n_fft (int): number of FFT points
        hop_length (int): hop length
        margin (float): unused; kept for interface compatibility
        cmap (str): colormap
        n_mels (int): if None, a linear frequency axis is used,
            otherwise a mel axis
        show (bool): whether to show the clips; if False, the figure is
            saved to 'audio_clips_spec.png' instead
    """
    from matplotlib.ticker import FuncFormatter

    clip_centers = (clips[:, 0] + clips[:, 1]) / 2

    fig, ax = plt.subplots(1, len(audio_specs), figsize=(10, 4))
    if len(audio_specs) == 1:
        ax = [ax]

    y_axis = "linear" if n_mels is None else "mel"
    for i, spec in enumerate(audio_specs):
        clip_start = clips[i][0]

        if isinstance(spec, torch.Tensor):
            spec = spec.numpy()
        if len(spec.shape) == 3:
            spec = spec[0]

        librosa.display.specshow(
            data=spec,
            sr=sr,
            n_fft=n_fft,
            hop_length=hop_length,
            ax=ax[i],
            x_axis="time",
            y_axis=y_axis,
            cmap=cmap,
        )

        # Label ticks with absolute time (tick value + clip start). A
        # formatter keeps labels tied to the actual tick positions, unlike
        # fixed tick labels set without fixing the ticks themselves.
        # `offset` is bound as a default to avoid late binding in the loop.
        ax[i].xaxis.set_major_formatter(
            FuncFormatter(lambda x, pos, offset=clip_start: f'{x + offset:.1f}')
        )
        ax[i].set_title(f'$t=$ {clip_centers[i]:.2f}')

    plt.tight_layout()
    if show:
        plt.show()
    else:
        plt.savefig('audio_clips_spec.png')
def basic_pipeline_audio_clips(
        audio_clips,
        spec_args=None,
        audio_transform=None,
        stack=True,
    ):
    """Apply waveform transforms, optional spectrogram conversion and stacking.

    Args:
        audio_clips (list): list of audio clips (tensors)
        spec_args (dict): keyword arguments for audio_clips_wav_to_spec;
            if None, the clips stay waveforms
        audio_transform (dict): optional transforms under the keys 'wave'
            (applied to the waveforms) and 'spec' (applied to the
            spectrograms); None means no transforms
        stack (bool): whether to stack the clips into a single tensor

    Returns:
        torch.Tensor or list: the processed clips
    """
    # The default audio_transform=None means "no transforms"
    transforms = {} if audio_transform is None else audio_transform
    wave_transform = transforms.get('wave', None)
    spec_transform = transforms.get('spec', None)

    # Apply transforms to raw waveforms
    if wave_transform is not None:
        audio_clips = wave_transform(audio_clips)

    if spec_args is not None:
        # Convert waveforms to spectrograms
        audio_clips = audio_clips_wav_to_spec(audio_clips, **spec_args)

        # Apply transforms to spectrograms
        if spec_transform is not None:
            audio_clips = spec_transform(audio_clips)

    if stack:
        audio_clips = torch.stack(audio_clips)

    return audio_clips
def load_and_process_audio(
        audio_path,
        clips,
        cut_to_clip_len=True,
        load_entire=False,
        audio_transform=None,
        aload_args=None,
        apipe_args=None,
    ):
    """Loads and preprocess audio.

    Args:
        audio_path (str): path to the audio file
        clips (np.ndarray): sized [T, 2], start and end time of each clip
        cut_to_clip_len (bool): whether to cut the audio clip to clip_len
        load_entire (bool): whether to load the entire audio file
        audio_transform (dict): transforms passed to
            basic_pipeline_audio_clips
        aload_args (dict): extra keyword arguments for load_audio_clips
            (e.g. sr, clip_len, backend); None means no extra arguments
        apipe_args (dict): extra keyword arguments for
            basic_pipeline_audio_clips; None means no extra arguments

    Returns:
        The output of basic_pipeline_audio_clips.
    """
    # None sentinels instead of shared mutable dict defaults
    aload_args = {} if aload_args is None else aload_args
    apipe_args = {} if apipe_args is None else apipe_args

    # [C1] Load audio clips: List[torch.Tensor]
    audio_clips = load_audio_clips(
        audio_path=audio_path,
        clips=clips,
        load_entire=load_entire,
        cut_to_clip_len=cut_to_clip_len,
        **aload_args,
    )

    # [C2] Pipeline:  [Preprocessing -> Transform]
    audio_clips = basic_pipeline_audio_clips(
        audio_clips=audio_clips,
        audio_transform=audio_transform,
        **apipe_args,
    )
    return audio_clips
def crop_height(image, height):
    """Centre-crop `image` vertically so that it is `height` pixels tall.

    Raises:
        ValueError: if the image is shorter than `height`
    """
    width, curr_height = image.size
    excess = curr_height - height
    if excess < 0:
        raise ValueError(f"Height of the image is less than {height}")
    # Remove (roughly) the same amount from the top and the bottom
    top = excess // 2
    box = (0, top, width, top + height)
    return image.crop(box)
def pad_to_height(image, height):
    """Add black strips above and below `image` to reach `height` pixels.

    Raises:
        ValueError: if the image is already taller than `height`
    """
    _, curr_height = image.size
    deficit = height - curr_height
    if deficit < 0:
        raise ValueError(f"Height of the image is already greater than {height}")
    # Any odd leftover pixel goes to the bottom strip
    top = deficit // 2
    border = (0, top, 0, deficit - top)
    return ImageOps.expand(image, border, fill="black")
def crop_width(image, width):
    """Centre-crop `image` horizontally so that it is `width` pixels wide.

    Raises:
        ValueError: if the image is narrower than `width`
    """
    curr_width, height = image.size
    excess = curr_width - width
    if excess < 0:
        raise ValueError(f"Width of the image is less than {width}")
    # Remove (roughly) the same amount from the left and the right
    left = excess // 2
    box = (left, 0, left + width, height)
    return image.crop(box)
def crop_or_pad_height(image, height):
    """Return `image` padded or centre-cropped to exactly `height` pixels tall."""
    _, curr_height = image.size
    if curr_height < height:
        resize = pad_to_height
    elif curr_height > height:
        resize = crop_height
    else:
        # Already the requested height
        return image
    return resize(image, height)
def crop_or_pad_width(image, width):
    """Return `image` padded or centre-cropped to exactly `width` pixels wide."""
    curr_width, _ = image.size
    if curr_width < width:
        resize = pad_to_width
    elif curr_width > width:
        resize = crop_width
    else:
        # Already the requested width
        return image
    return resize(image, width)
def pad_to_width(image, width):
    """Add black strips left and right of `image` to reach `width` pixels.

    Raises:
        ValueError: if the image is already wider than `width`
    """
    curr_width, _ = image.size
    deficit = width - curr_width
    if deficit < 0:
        raise ValueError(f"Width of the image is already greater than {width}")
    # Any odd leftover pixel goes to the right strip
    left = deficit // 2
    border = (left, 0, deficit - left, 0)
    return ImageOps.expand(image, border, fill="black")
def crop_or_pad_to_size(image, size=(270, 480)):
    """Crop or pad `image` to `size`, given as (width, height).

    The height is adjusted first, then the width.
    """
    target_width, target_height = size[0], size[1]
    height_fixed = crop_or_pad_height(image, target_height)
    return crop_or_pad_width(height_fixed, target_width)
+if __name__ == "__main__":
+    import decord
+    import sound_of_water.data.audio_transforms as at
+    # Testing on a sample file
+    file_path = "media_assets/ayNzH0uygFw_9.0_21.0.mp4"
+    assert os.path.exists(file_path), f"File not found: {file_path}"
+    # Define audio transforms
+    cfg_transform = {
+        "audio": {
+            "wave": [
+                {
+                    "name": "AddNoise",
+                    "args": {
+                    "noise_level": 0.001
+                    },
+                    "augmentation": True,
+                },
+                {
+                    "name": "ChangeVolume",
+                    "args": {
+                    "volume_factor": [0.8, 1.2]
+                    },
+                    "augmentation": True,
+                },
+                {
+                    "name": "Wav2Vec2WaveformProcessor",
+                    "args": {
+                    "model_name": "facebook/wav2vec2-base-960h",
+                    "sr": 16000
+                    }
+                }
+            ],
+            "spec": None,
+        }
+    }
+    audio_transform = at.define_audio_transforms(
+        cfg_transform, augment=False,
+    )
+    # Define audio load arguments
+    aload_args = {
+        "sr": 16000,
+        "clip_len": None,
+        "backend": "decord",
+    }
+    # Define audio pipeline arguments
+    apipe_args = {
+        "spec_args": None,
+        "stack": True,
+    }
+    # Run the pipeline (this is used to pass to the model)
+    audio = load_and_process_audio(
+        audio_path=file_path,
+        clips=None,
+        load_entire=True,
+        cut_to_clip_len=False,
+        audio_transform=audio_transform,
+        aload_args=aload_args,
+        apipe_args=apipe_args,
+    )[0]
+    # This will be used to visualise
+    visualise_args = {
+        "sr": 16000,
+        "n_fft": 400,
+        "hop_length": 320,
+        "n_mels": 64,
+        "margin": 16.,
+        "C": 340 * 100.,
+        "audio_output_fps": 49.,
+    }
+    y = load_audio_clips(
+        audio_path=file_path,
+        clips=None,
+        load_entire=True,
+        cut_to_clip_len=False,
+        **aload_args,
+    )[0]
+    S = librosa_harmonic_spectrogram_db(
+        y,
+        sr=visualise_args["sr"],
+        n_fft=visualise_args["n_fft"],
+        hop_length=visualise_args["hop_length"],
+        n_mels=visualise_args['n_mels'],
+    )
+    # Load video frame
+    vr = decord.VideoReader(file_path, num_threads=1)
+    frame = PIL.Image.fromarray(vr[0].asnumpy())
+    """
+    # Cut to desired width
+    new_width, new_height = 270, 480
+    width, height = frame.size
+    if width > new_width:
+        # Crop the width
+        left = (width - new_width) // 2
+        right = left + new_width
+        frame = frame.crop((left, 0, right, height))
+    else:
+        # Resize along width to have the desired width
+        frame = su.visualize.resize_width(frame, new_width)
+    assert frame.size[0] == new_width, \
+        f"Width mismatch: {frame.size[0]} != {new_width}"
+    # Now pad/crop to desired height
+    if height > new_height:
+        # Crop the height
+        top = (height - new_height) // 2
+        bottom = top + new_height
+        frame = frame.crop((0, top, new_width, bottom))
+    else:
+        # Pad the height
+        frame = pad_to_height(frame, new_height)
+    assert frame.size[1] == new_height, \
+        f"Height mismatch: {frame.size[1]} != {new_height}"
+    """
+    frame = crop_or_pad_to_size(frame)
+    # frame.save("1.png")
+    # Visualise
+    fig, axes = plt.subplots(
+        1, 2, figsize=(13, 4), width_ratios=[0.25, 0.75],
+    )
+    ax = axes[0]
+    ax.imshow(frame, aspect="auto")
+    ax.set_title("Example frame")
+    ax.set_xticks([])
+    ax.set_yticks([])
+    ax = axes[1]
+    show_logmelspectrogram(
+        S=S,
+        ax=ax,
+        show=False,
+        sr=visualise_args["sr"],
+        n_fft=visualise_args["n_fft"],
+        hop_length=visualise_args["hop_length"],
+    )
+    plt.savefig("./media_assets/audio_visualisation.png", bbox_inches="tight")
+    plt.close()

sound_of_water/data/audio_transforms.py ADDED Viewed

	@@ -0,0 +1,183 @@

+"""Audio transforms."""
+import torchaudio
+import torchvision
+from torchvision.transforms import Compose, ToTensor
+import torchaudio.transforms as T
+import imgaug.augmenters as iaa
+import numpy as np
+import torch
+class AddNoise(object):
+    """Add noise to the waveform."""
+    def __init__(self, noise_level=0.1):
+        self.noise_level = noise_level
+    def __call__(self, waveform):
+        noise = torch.randn_like(waveform)
+        return waveform + self.noise_level * noise
+    def __repr__(self):
+        return self.__class__.__name__ + f"(noise_level={self.noise_level})"
+class ChangeVolume(object):
+    """Change the volume of the waveform."""
+    def __init__(self, volume_factor=[0.6, 1.2]):
+        self.volume_factor = volume_factor
+    def __call__(self, waveform):
+        return waveform * np.random.uniform(*self.volume_factor)
+    def __repr__(self):
+        return self.__class__.__name__ + f"(volume_factor={self.volume_factor})"
+def configure_transforms(cfg):
+    """
+    Given a transform config (List[dict]), return a Compose object that
+    applies the transforms in order.
+    """
+    transform = []
+    for a in cfg:
+        transform.append(eval(a["name"])(**a["args"]))
+    return Compose(transform)
+class AudioClipsTransform:
+    def __init__(self, audio_transform):
+        """Applies image transform to each frame of each video clip."""
+        self.audio_transform = audio_transform
+    def __call__(self, audio_clips):
+        """
+        Args:
+            audio_clips (list): list of audio clips, each tensor [1, M]
+                where M is number of samples in each clip
+        """
+        transformed_audio_clips = [self.audio_transform(x) for x in audio_clips]
+        # transformed_audio_clips = []
+        # for clip in audio_clips:
+        #     transformed_clip = [self.audio_transform(x) for x in clip]
+        #     transformed_audio_clips.append(transformed_clip)
+        return transformed_audio_clips
+    def __repr__(self):
+        return self.audio_transform.__repr__()
+class NumpyToTensor:
+    def __call__(self, x):
+        return torch.from_numpy(x).float()
+    def __repr__(self):
+        return self.__class__.__name__ + "()"
+# TODO: Might have to introduce normalisation
+# to have a consistent pipeline.
+class Wav2Vec2WaveformProcessor:
+    def __init__(self, model_name="facebook/wav2vec2-base-960h", sr=16000):
+        from transformers import Wav2Vec2Processor
+        self.processor = Wav2Vec2Processor.from_pretrained(model_name)
+        self.sr = sr
+    def __call__(self, x):
+        x = self.processor(
+            x, sampling_rate=self.sr, return_tensors="pt",
+        ).input_values
+        return x
+def define_audio_transforms(cfg_transform, augment=False):
+    wave_transforms = cfg_transform["audio"]["wave"]
+    wave_transforms_new = []
+    # Only pick augmentations if augment=True
+    for t in wave_transforms:
+        if "augmentation" not in t:
+            wave_transforms_new.append(t)
+        else:
+            if augment and t["augmentation"]:
+                wave_transforms_new.append(t)
+    # print(wave_transforms_new)
+    wave_transform = configure_transforms(wave_transforms_new)
+    wave_transform = AudioClipsTransform(wave_transform)
+    # wave_transform = configure_transforms(
+    #     cfg_transform["audio"]["wave"],
+    # )
+    # wave_transform = AudioClipsTransform(wave_transform)
+    # spec_transform = configure_transforms(
+    #     cfg_transform["audio"]["spec"],
+    # )
+    # spec_transform = AudioClipsTransform(spec_transform)
+    audio_transform = dict(
+        wave=wave_transform,
+        # spec=spec_transform,
+    )
+    return audio_transform
+if __name__ == "__main__":
+    # Testing it out
+    # Raw waveform transform
+    cfg = [
+        {
+            "name": "AddNoise",
+            "args": {"noise_level": 0.1},
+        },
+        {
+            "name": "ChangeVolume",
+            "args": {"volume_factor": [0.6, 1.2]},
+        },
+    ]
+    transform = configure_transforms(cfg)
+    x = torch.randn([1, 16000])
+    z = transform(x)
+    print(x.shape, z.shape)
+    import matplotlib.pyplot as plt
+    fig, ax = plt.subplots(2, 1, figsize=(8, 4))
+    ax[0].plot(x[0].numpy())
+    ax[1].plot(z[0].numpy())
+    plt.savefig("waveform_transform.png")
+    # Wav2Vec2 transform
+    cfg = [
+        {
+            "name": "Wav2Vec2WaveformProcessor",
+            "args": {"model_name": "facebook/wav2vec2-base-960h", "sr": 16000},
+        },
+    ]
+    transform = configure_transforms(cfg)
+    x = torch.randn([4, 16000])
+    z = transform(x)
+    print(x.shape, z.shape)
+    # Spectrogram transform
+    cfg = [
+        {
+            "name": "T.FrequencyMasking",
+            "args": {"freq_mask_param": 8},
+        },
+        {
+            "name": "T.TimeMasking",
+            "args": {"time_mask_param": 16},
+        },
+    ]
+    transform = configure_transforms(cfg)
+    x = torch.randn([1, 64, 251])
+    z = transform(x)
+    print(x.shape, z.shape)
+    fig, ax = plt.subplots(2, 1, figsize=(8, 4))
+    ax[0].imshow(x[0].numpy())
+    ax[1].imshow(z[0].numpy())
+    plt.savefig("spectrogram_transform.png")

sound_of_water/data/csv_loader.py ADDED Viewed

	@@ -0,0 +1,137 @@

+"""Utils to load CSV file of audio datasets."""
+import os
+import pandas as pd
+import shared.utils as su
+def configure_paths_sound_of_water(
+        data_root="/work/piyush/from_nfs2/datasets/SoundOfWater",
+    ):
+    paths = {
+        "data_dir": data_root,
+        "video_clip_dir": os.path.join(data_root, "videos"),
+        "audio_clip_dir": os.path.join(data_root, "videos"),
+        "annot_dir": os.path.join(data_root, "annotations"),
+        "split_dir": os.path.join(data_root, "splits"),
+    }
+    return paths
+def load_csv_sound_of_water(
+        paths: dict,
+        csv_filters=dict(),
+        csv_name="localisation.csv",
+        ds_name="SoundOfWater",
+        split=None,
+        check_first_frame_annots=True,
+    ):
+    """Loads CSV containing metadata of the dataset."""
+    su.log.print_update(
+        f" [:::] Loading {ds_name}.",
+        pos="left",
+        fillchar=".",
+    )
+    # Configure paths
+    video_clip_dir = paths["video_clip_dir"]
+    audio_clip_dir = paths["audio_clip_dir"]
+    # Load main CSV
+    path = os.path.join(
+        paths["annot_dir"], csv_name,
+    )
+    assert os.path.exists(path), \
+        f"CSV file not found at {path}."
+    print(" [:::] CSV path:", path)
+    df = pd.read_csv(path)
+    # Load side information: containers
+    container_path = os.path.join(
+        paths['annot_dir'], "containers.yaml",
+    )
+    assert os.path.exists(container_path)
+    containers = su.io.load_yml(container_path)
+    # Update CSV with container information (optional)
+    update_with_container_info = True
+    if update_with_container_info:
+        rows = []
+        for row in df.iterrows():
+            row = row[1].to_dict()
+            row.update(containers[row["container_id"]])
+            rows.append(row)
+        df = pd.DataFrame(rows)
+    print(" [:::] Shape of CSV: ", df.shape)
+    # 1. Update item_id
+    df["item_id"] = df.apply(
+        lambda d: f"{d['video_id']}_{d['start_time']:.1f}_{d['end_time']:.1f}",
+        axis=1,
+    )
+    # 2. Update video_clip_path
+    # df["video_path"] = df["video_id"].apply(
+    #     lambda d: os.path.join(
+    #         video_dir, f"{d}.mp4"
+    #     )
+    # )
+    df["video_clip_path"] = df["item_id"].apply(
+        lambda d: os.path.join(
+            video_clip_dir, f"{d}.mp4"
+        )
+    )
+    df = df[df["video_clip_path"].apply(os.path.exists)]
+    print(" [:::] Shape of CSV with available video: ", df.shape)
+    # 3. Update audio_clip_path
+    # df["audio_path"] = df["video_id"].apply(
+    #     lambda d: os.path.join(
+    #         audio_dir, f"{d}.mp4"
+    #     )
+    # )
+    df["audio_clip_path"] = df["item_id"].apply(
+        lambda d: os.path.join(
+            audio_clip_dir, f"{d}.mp4"
+        )
+    )
+    df = df[df["audio_clip_path"].apply(os.path.exists)]
+    print(" [:::] Shape of CSV with available audio: ", df.shape)
+    # Add first frame annotation paths
+    if check_first_frame_annots:
+        frame_annot_dir = os.path.join(paths["annot_dir"], "container_bboxes")
+        df["box_path"] = df["video_id"].apply(
+            lambda d: os.path.join(frame_annot_dir, f"{d}_box.npy"),
+        )
+        df["mask_path"] = df["video_id"].apply(
+            lambda d: os.path.join(frame_annot_dir, f"{d}_mask.npy"),
+        )
+        df = df[df["box_path"].apply(os.path.exists)]
+        df = df[df["mask_path"].apply(os.path.exists)]
+        print(" [:::] Shape of CSV with first frame annotations: ", df.shape)
+    # Add split filter
+    if split is not None and ("item_id" not in csv_filters):
+        assert "split_dir" in paths
+        split_path = os.path.join(paths["split_dir"], f"{split}")
+        assert os.path.exists(split_path), \
+            f"Split file not found at {split_path}."
+        item_ids = su.io.load_txt(split_path)
+        print(" [:::] Number of item_ids in split:", len(item_ids))
+        csv_filters["item_id"] = item_ids
+    # Apply filter to the CSV
+    if len(csv_filters) > 0:
+        df = su.pd_utils.apply_filters(df, csv_filters)
+        print(" [:::] Shape of CSV after filtering: ", df.shape)
+    return df
+if __name__ == "__main__":
+    paths = configure_paths_sound_of_water()
+    df = load_csv_sound_of_water(paths)
+    row = df.iloc[0].to_dict()
+    su.log.json_print(row)

sound_of_water/data/video_loader.py ADDED Viewed

File without changes

sound_of_water/data/video_transforms.py ADDED Viewed

File without changes

sound_of_water/video_height/README.md ADDED Viewed

	@@ -0,0 +1 @@


1	+ In this folder, we will store the code to train and evaluate models for liquid height detection from video.