Batched inference does not yield the same results as individual-sample inference

#8
by ladhiet - opened

Hi

First of all, thanks for the great work!

I wanted to use this model for batched inference. However, when doing so, I noticed that the results differ from running inference on the individual samples separately. The following adapted example code can be run as a minimal reproducible example:

import numpy as np
import pandas as pd
import random
import torch
import torch.nn as nn
from transformers import Wav2Vec2Processor
from transformers.models.wav2vec2.modeling_wav2vec2 import (
    Wav2Vec2Model,
    Wav2Vec2PreTrainedModel,
)


class RegressionHead(nn.Module):
    r"""Classification head."""

    def __init__(self, config):

        super().__init__()

        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.final_dropout)
        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, features, **kwargs):

        x = features
        x = self.dropout(x)
        x = self.dense(x)
        x = torch.tanh(x)
        x = self.dropout(x)
        x = self.out_proj(x)

        return x


class EmotionModel(Wav2Vec2PreTrainedModel):
    r"""Speech emotion classifier."""

    def __init__(self, config):

        super().__init__(config)

        self.config = config
        self.wav2vec2 = Wav2Vec2Model(config)
        self.classifier = RegressionHead(config)
        self.init_weights()

    def forward(
            self,
            input_values,
            attention_mask,
    ):

        outputs = self.wav2vec2(input_values, attention_mask=attention_mask)
        hidden_states = outputs[0]
        hidden_states = torch.mean(hidden_states, dim=1)
        logits = self.classifier(hidden_states)

        return hidden_states, logits



# load model from hub
device = 'cpu'
model_name = 'audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim'
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = EmotionModel.from_pretrained(model_name)

# dummy signal
sampling_rate = 16000
signals = []
for i in range(64):
    signal = np.random.rand(5 * 16000)
    # randomly truncate signal
    signal = signal[:int(random.uniform(0, 5) * 16000)]
    signals.append(signal)

def process_func(
    file_path: str,
    sampling_rate: int,
    embeddings: bool = False,
) -> np.ndarray:
    r"""Predict emotions or extract embeddings from raw audio signal."""

    inputs = processor(signals, sampling_rate=sampling_rate, return_tensors="pt", padding=True)

    # batched_inference
    with torch.no_grad():
        y = model(inputs["input_values"], inputs["attention_mask"])[1]

    y = y.detach().cpu().numpy()
    pd.DataFrame({"valence": y[:, 0], "arousal": y[:, 1]}).to_csv('batched_audio_predictions.csv')

    # non-batched inference without attention mask -> different outputs
    y2_valence = []
    y2_arousal = []
    for input_values, attention_mask in zip(inputs["input_values"], inputs["attention_mask"]):
        y2 = model(input_values[attention_mask].reshape(1, -1), None)
        y2_valence.append(y2[1][0][0].detach().cpu().numpy())
        y2_arousal.append(y2[1][0][1].detach().cpu().numpy())
    
    pd.DataFrame({"valence": y2_valence, "arousal": y2_arousal}).to_csv('non_batched_audio_predictions.csv')

    # non-batched inference with attention mask -> same outputs as the batched_inference
    y2_valence = []
    y2_arousal = []
    for input_values, attention_mask in zip(inputs["input_values"], inputs["attention_mask"]):
        y2 = model(input_values.unsqueeze(0), attention_mask.unsqueeze(0))
        y2_valence.append(y2[1][0][0].detach().cpu().numpy())
        y2_arousal.append(y2[1][0][1].detach().cpu().numpy())
    
    pd.DataFrame({"valence": y2_valence, "arousal": y2_arousal}).to_csv('non_batched_audio_with_mask_predictions.csv')

    return y


print(process_func("batched_audio_example.pt", sampling_rate))
#  Arousal    dominance valence
# [[0.5460754  0.6062266  0.40431657]]

Any ideas or experience as to why this is happening? It seems to me the attention mask should be passed when doing batched inference (as I do in the example above), since the "feat_extract_norm" parameter of the config is "layer", as stated in the Wav2Vec2 documentation:

[screenshot of the Wav2Vec2 documentation: attention_mask should be passed when feat_extract_norm == "layer"]
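
A quick way to confirm this for the checkpoint in question (a small sketch; the model name is taken from the code above):

from transformers import Wav2Vec2Config

# Inspect the checkpoint configuration; "layer" means the attention mask should be passed.
config = Wav2Vec2Config.from_pretrained('audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim')
print(config.feat_extract_norm)  # expected: 'layer'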

Thanks in advance for your time and answer!

audEERING GmbH org

Hi!
Thanks a lot for your interest in the model.

I see several issues in the code you submitted:

  1. EmotionModel.forward() passes the attention_mask to self.wav2vec2(), but it is not considered when computing the mean across the embeddings.
    To solve this, you can replace the line
        hidden_states = torch.mean(hidden_states, dim=1)

with

        if attention_mask is not None:
            attention_mask = self._get_feature_vector_attention_mask(hidden_states.shape[1], attention_mask)
            hidden_states = hidden_states * torch.reshape(attention_mask, (-1, attention_mask.shape[-1], 1))
            hidden_states = torch.sum(hidden_states, dim=1)
            attention_sum = torch.sum(attention_mask, dim=1)
            hidden_states = hidden_states / torch.reshape(attention_sum, (-1, 1))
        else:
            hidden_states = torch.mean(hidden_states, dim=1)
  2. In the second inference option (non-batched inference without attention mask), where the mask is meant to be applied before the model is called, attention_mask is of type torch.int32. With input_values[attention_mask], input_values is therefore indexed with 0/1 integer indices instead of being masked. To apply it as a mask, attention_mask first needs to be converted to boolean, i.e. attention_mask.bool() for a torch.Tensor (see the small indexing example after this list).
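
A minimal standalone illustration of the difference between integer indexing and boolean masking:

import torch

x = torch.tensor([10., 20., 30.])
mask = torch.tensor([1, 0, 1])   # integer mask (the processor's attention_mask behaves the same way here)
print(x[mask])                   # integer indexing: tensor([20., 10., 20.])
print(x[mask.bool()])            # boolean masking:  tensor([10., 30.])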

Applying these two changes results in identical predictions.
In fact, all three options produced "wrong" predictions in the original code, but issue 1 (which affects the 1st and the 3rd inference options) is less severe than issue 2.
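
A quick sanity check once both fixes are applied (a sketch; the CSV names come from the example code, and the tolerance is an assumption, since padding plus masked pooling can leave tiny float32 differences):

import numpy as np
import pandas as pd

batched = pd.read_csv('batched_audio_predictions.csv', index_col=0)
per_sample = pd.read_csv('non_batched_audio_predictions.csv', index_col=0)
assert np.allclose(batched.values, per_sample.values, atol=1e-4)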

There are two more problems:

  1. The order of the output logits is: Arousal, Dominance, Valence (your code assumes Valence, Arousal).
  2. The model has a minimum requirement on the input length. Thus, in some rare cases, inference might fail if one of the input signals has a very short duration (<< 1.0 s); a minimal padding guard is sketched after this list.
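
For the short-input case, one option is to zero-pad very short signals before calling the processor (a sketch; the 0.25 s threshold is an illustrative assumption, not a documented minimum):

import numpy as np

MIN_SAMPLES = int(0.25 * 16000)  # hypothetical lower bound at 16 kHz

def pad_if_too_short(signal: np.ndarray) -> np.ndarray:
    # Zero-pad signals that are too short for the convolutional feature extractor.
    if len(signal) < MIN_SAMPLES:
        signal = np.pad(signal, (0, MIN_SAMPLES - len(signal)))
    return signal

# e.g. signals = [pad_if_too_short(s) for s in signals]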

Please find the consolidated code below:

import numpy as np
import pandas as pd
import random
import torch
import torch.nn as nn
from transformers import Wav2Vec2Processor
from transformers.models.wav2vec2.modeling_wav2vec2 import (
    Wav2Vec2Model,
    Wav2Vec2PreTrainedModel,
)


class RegressionHead(nn.Module):
    r"""Classification head."""

    def __init__(self, config):

        super().__init__()

        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.final_dropout)
        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, features, **kwargs):

        x = features
        x = self.dropout(x)
        x = self.dense(x)
        x = torch.tanh(x)
        x = self.dropout(x)
        x = self.out_proj(x)

        return x


class EmotionModel(Wav2Vec2PreTrainedModel):
    r"""Speech emotion classifier."""

    def __init__(self, config):

        super().__init__(config)

        self.config = config
        self.wav2vec2 = Wav2Vec2Model(config)
        self.classifier = RegressionHead(config)
        self.init_weights()

    def forward(
            self,
            input_values,
            attention_mask,
    ):

        outputs = self.wav2vec2(input_values, attention_mask=attention_mask)
        hidden_states = outputs[0]
        if attention_mask is not None:
            attention_mask = self._get_feature_vector_attention_mask(hidden_states.shape[1], attention_mask)
            hidden_states = hidden_states * torch.reshape(attention_mask, (-1, attention_mask.shape[-1], 1))
            hidden_states = torch.sum(hidden_states, dim=1)
            attention_sum = torch.sum(attention_mask, dim=1)
            hidden_states = hidden_states / torch.reshape(attention_sum, (-1, 1))
        else:
            hidden_states = torch.mean(hidden_states, dim=1)
        logits = self.classifier(hidden_states)

        return hidden_states, logits


# load model from hub
device = 'cpu'
model_name = 'audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim'
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = EmotionModel.from_pretrained(model_name)

# dummy signal
sampling_rate = 16000
signals = []
for i in range(64):
    signal = np.random.rand(5 * 16000)
    # randomly truncate signal
    signal = signal[:int(random.uniform(0.1, 5) * 16000)]
    signals.append(signal)


def process_func(
    file_path: str,
    sampling_rate: int,
    embeddings: bool = False,
) -> np.ndarray:
    r"""Predict emotions or extract embeddings from raw audio signal."""

    inputs = processor(signals, sampling_rate=sampling_rate, return_tensors="pt", padding=True)

    # batched_inference
    with torch.no_grad():
        y = model(inputs["input_values"], inputs["attention_mask"])[1]

    y = y.detach().cpu().numpy()
    pd.DataFrame({"valence": y[:, 0], "arousal": y[:, 1]}).to_csv('batched_audio_predictions.csv')

    # non-batched inference without attention mask (mask applied to the signal) -> same outputs as the batched inference
    y2_arousal = []
    y2_dominance = []
    y2_valence = []
    for input_values, attention_mask in zip(inputs["input_values"], inputs["attention_mask"]):
        y2 = model(input_values[attention_mask.bool()].reshape(1, -1), None)
        y2_arousal.append(y2[1][0][0].detach().cpu().numpy())
        y2_dominance.append(y2[1][0][1].detach().cpu().numpy())
        y2_valence.append(y2[1][0][2].detach().cpu().numpy())

    pd.DataFrame({"arousal": y2_arousal, "dominance": y2_dominance, "valence": y2_valence}).to_csv('non_batched_audio_predictions.csv')

    # non-batched inference with attention mask -> same outputs as the batched inference
    y2_arousal = []
    y2_dominance = []
    y2_valence = []
    for input_values, attention_mask in zip(inputs["input_values"], inputs["attention_mask"]):
        y2 = model(input_values.unsqueeze(0), attention_mask.unsqueeze(0))
        y2_arousal.append(y2[1][0][0].detach().cpu().numpy())
        y2_dominance.append(y2[1][0][1].detach().cpu().numpy())
        y2_valence.append(y2[1][0][2].detach().cpu().numpy())

    pd.DataFrame({"arousal": y2_arousal, "dominance": y2_dominance, "valence": y2_valence}).to_csv('non_batched_audio_with_mask_predictions.csv')

    return y


print(process_func("batched_audio_example.pt", sampling_rate))
#  Arousal    Dominance  Valence
# [[0.5460754  0.6062266  0.40431657]]
