musicaudiopretrain committed e64d09b (1 parent: 79fe833)

Create README.md

Files changed (1): README.md (+37 −0)
README.md ADDED

A simple use case:

```python
from transformers import Wav2Vec2Processor, AutoModel
import torch
from torch import nn
from datasets import load_dataset

# load demo audio and set up the processor
dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
dataset = dataset.sort("id")
sampling_rate = dataset.features["audio"].sampling_rate
processor = Wav2Vec2Processor.from_pretrained("facebook/data2vec-audio-base-960h")

# load our model weights
model = AutoModel.from_pretrained("m-a-p/MERT-v0")

# the audio file is decoded on the fly
inputs = processor(dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs, output_hidden_states=True)

# take a look at the output shape: there are 13 layers of representation,
# and each layer performs differently on different downstream tasks, so choose empirically
all_layer_hidden_states = torch.stack(outputs.hidden_states).squeeze()
print(all_layer_hidden_states.shape)  # [13 layers, 292 timesteps, 768 feature_dim]

# for utterance-level classification tasks, you can simply reduce the representation over time
time_reduced_hidden_states = all_layer_hidden_states.mean(-2)
print(time_reduced_hidden_states.shape)  # [13, 768]

# you can even use a learnable weighted average over the layers
aggregator = nn.Conv1d(in_channels=13, out_channels=1, kernel_size=1)
weighted_avg_hidden_states = aggregator(time_reduced_hidden_states).squeeze()
print(weighted_avg_hidden_states.shape)  # [768]
```
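
Going one step further, the layer-weighted features can feed a small downstream head. Below is a minimal sketch of how such a head might look; the `ShallowHead` module, its parameter names, and the `num_classes=10` choice are illustrative assumptions, not part of the MERT release.

```python
import torch
from torch import nn

# a minimal sketch (names and sizes are illustrative, not from the MERT repo):
# learnable layer weighting + linear head on top of the [13, 768] features above
class ShallowHead(nn.Module):
    def __init__(self, num_layers=13, hidden_dim=768, num_classes=10):
        super().__init__()
        # same trick as above: a 1x1 conv learns a weighted average over layers
        self.aggregator = nn.Conv1d(in_channels=num_layers, out_channels=1, kernel_size=1)
        self.classifier = nn.Linear(hidden_dim, num_classes)

    def forward(self, layer_features):  # layer_features: [batch, 13, 768]
        x = self.aggregator(layer_features).squeeze(1)  # -> [batch, 768]
        return self.classifier(x)  # -> [batch, num_classes]

# usage with the time_reduced_hidden_states computed above
head = ShallowHead()
logits = head(time_reduced_hidden_states.unsqueeze(0))  # add a batch dimension
print(logits.shape)  # [1, 10]
```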
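
One practical note on inputs: the demo clip is already at the processor's sampling rate, but your own audio may not be. Here is a minimal resampling sketch with torchaudio, assuming the `processor` from the snippet above and a hypothetical local file path:

```python
import torchaudio
import torchaudio.transforms as T

# "your_audio.wav" is a placeholder path, not a file shipped with the model
waveform, file_sr = torchaudio.load("your_audio.wav")

# the processor's feature extractor records the expected sampling rate (16 kHz here)
target_sr = processor.feature_extractor.sampling_rate
if file_sr != target_sr:
    waveform = T.Resample(orig_freq=file_sr, new_freq=target_sr)(waveform)

# average the channels to mono and feed through the same processor as above
inputs = processor(waveform.mean(dim=0).numpy(), sampling_rate=target_sr, return_tensors="pt")
```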