{ "cells": [ { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/opt/conda/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n" ] } ], "source": [ "from datasets import load_dataset\n", "from transformers import ClapModel, AutoProcessor\n", "from IPython.display import Audio\n" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Found cached dataset parquet (/root/.cache/huggingface/datasets/ashraq___parquet/ashraq--esc50-1000c3b73cc1500f/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7)\n", "100%|██████████| 1/1 [00:00<00:00, 524.29it/s]\n" ] } ], "source": [ "dataset = load_dataset('ashraq/esc50')" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "audio_sample = dataset[\"train\"][\"audio\"][50]['array']\n", "Audio(audio_sample, rate=44100)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "model = ClapModel.from_pretrained(\"laion/clap-htsat-unfused\")" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "processor = AutoProcessor.from_pretrained(\"laion/clap-htsat-unfused\")" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "input_text = [\"Hospital elevator\", \"Water Drop\", \"Sound of water dropping\"]" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "It is strongly recommended to pass the 
`sampling_rate` argument to this function. Failing to do so can result in silent errors that might be hard to debug.\n" ] } ], "source": [ "inputs = processor(text=input_text, audios=audio_sample, sampling_rate=48000, return_tensors='pt', padding=True)" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "dict_keys(['input_ids', 'attention_mask', 'input_features'])\n" ] } ], "source": [ "print(inputs.keys())" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "tensor([[7.4354e-04, 4.5361e-02, 9.5390e-01]], grad_fn=)\n" ] } ], "source": [ "outputs = model(**inputs)\n", "logits_per_audio = outputs.logits_per_audio\n", "probs = logits_per_audio.softmax(dim=-1)\n", "print(probs)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "from transformers import ClapAudioModel, ClapAudioModelWithProjection" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Some weights of the model checkpoint at laion/clap-htsat-unfused were not used when initializing ClapAudioModel: ['audio_model.audio_encoder.layers.0.blocks.0.attention.self.value.weight', 'audio_model.audio_encoder.layers.2.blocks.0.attention.self.key.weight', 'audio_model.audio_encoder.layers.0.blocks.0.layernorm_after.weight', 'text_model.encoder.layer.0.attention.output.LayerNorm.weight', 'audio_model.audio_encoder.layers.1.blocks.0.intermediate.dense.weight', 'audio_model.audio_encoder.layers.0.downsample.norm.bias', 'audio_model.audio_encoder.layers.2.blocks.4.attention.self.relative_position_index', 'audio_model.audio_encoder.layers.2.blocks.4.attention.self.query.weight', 'audio_model.audio_encoder.layers.2.blocks.5.layernorm_after.bias', 'audio_model.audio_encoder.layers.2.blocks.3.layernorm_before.weight', 
'text_model.encoder.layer.7.attention.output.dense.bias', 'audio_model.audio_encoder.layers.1.blocks.1.intermediate.dense.bias', 'text_model.encoder.layer.0.intermediate.dense.bias', 'audio_model.audio_encoder.layers.0.blocks.1.attention.self.value.weight', 'audio_model.audio_encoder.layers.3.blocks.0.attention.self.value.bias', 'audio_model.audio_encoder.layers.2.blocks.5.attention.self.query.weight', 'audio_model.audio_encoder.layers.2.blocks.4.attention.self.relative_position_bias_table', 'text_model.encoder.layer.1.output.LayerNorm.weight', 'text_model.encoder.layer.5.attention.output.LayerNorm.weight', 'audio_model.audio_encoder.layers.2.blocks.5.attention.output.dense.bias', 'text_model.encoder.layer.11.intermediate.dense.weight', 'text_model.encoder.layer.10.attention.self.query.bias', 'audio_model.audio_encoder.layers.0.blocks.0.attention.self.relative_position_index', 'text_model.encoder.layer.2.attention.output.dense.bias', 'text_model.encoder.layer.11.attention.self.value.bias', 'audio_model.audio_encoder.layers.1.blocks.1.attention.self.key.bias', 'text_model.encoder.layer.4.attention.output.dense.bias', 'text_model.encoder.layer.7.attention.self.query.bias', 'text_model.encoder.layer.11.attention.output.LayerNorm.bias', 'text_model.encoder.layer.2.attention.output.dense.weight', 'audio_model.audio_encoder.layers.3.blocks.1.intermediate.dense.bias', 'text_model.encoder.layer.10.attention.self.key.bias', 'text_model.encoder.layer.6.attention.output.LayerNorm.weight', 'text_model.embeddings.position_embeddings.weight', 'audio_model.audio_encoder.layers.3.blocks.0.attention.self.relative_position_index', 'text_model.encoder.layer.5.attention.self.value.weight', 'text_model.encoder.layer.10.attention.self.query.weight', 'audio_model.audio_encoder.layers.2.blocks.0.layernorm_after.bias', 'text_model.encoder.layer.3.output.dense.bias', 'text_model.encoder.layer.2.output.dense.bias', 'audio_model.audio_encoder.layers.0.blocks.0.output.dense.bias', 
'audio_model.audio_encoder.layers.2.blocks.4.attention.output.dense.bias', 'text_model.encoder.layer.1.attention.self.key.bias', 'audio_model.audio_encoder.layers.3.blocks.1.attention.self.key.bias', 'audio_model.audio_encoder.layers.1.blocks.1.layernorm_before.bias', 'audio_model.audio_encoder.layers.2.blocks.0.layernorm_after.weight', 'text_model.encoder.layer.2.attention.self.query.bias', 'text_model.encoder.layer.5.attention.self.key.weight', 'text_model.encoder.layer.11.intermediate.dense.bias', 'audio_model.audio_encoder.layers.2.blocks.0.intermediate.dense.weight', 'audio_model.audio_encoder.layers.2.blocks.0.attention.self.query.bias', 'audio_model.audio_encoder.layers.3.blocks.0.attention.output.dense.bias', 'text_model.encoder.layer.6.attention.output.LayerNorm.bias', 'audio_model.audio_encoder.layers.2.blocks.0.output.dense.bias', 'audio_model.audio_encoder.layers.2.blocks.1.attention.output.dense.weight', 'audio_model.audio_encoder.layers.3.blocks.0.layernorm_after.weight', 'text_model.encoder.layer.3.output.LayerNorm.weight', 'text_model.encoder.layer.0.output.dense.bias', 'audio_model.audio_encoder.batch_norm.weight', 'audio_model.audio_encoder.layers.0.blocks.1.attention.self.query.bias', 'text_model.encoder.layer.10.output.LayerNorm.weight', 'text_model.encoder.layer.2.attention.self.key.bias', 'text_model.encoder.layer.6.output.dense.weight', 'text_model.encoder.layer.8.attention.output.LayerNorm.weight', 'text_model.encoder.layer.4.output.LayerNorm.bias', 'audio_model.audio_encoder.layers.2.blocks.3.intermediate.dense.weight', 'text_model.encoder.layer.8.output.dense.weight', 'audio_model.audio_encoder.layers.1.blocks.0.output.dense.bias', 'audio_model.audio_encoder.layers.2.blocks.3.attention.output.dense.bias', 'audio_model.audio_encoder.layers.2.blocks.4.output.dense.bias', 'audio_model.audio_encoder.patch_embed.proj.bias', 'audio_model.audio_encoder.layers.2.blocks.4.layernorm_after.bias', 
'audio_model.audio_encoder.layers.2.blocks.5.attention.self.key.weight', 'audio_model.audio_encoder.layers.2.blocks.3.attention.self.query.weight', 'audio_model.audio_encoder.layers.3.blocks.0.attention.output.dense.weight', 'text_model.encoder.layer.7.intermediate.dense.weight', 'audio_model.audio_encoder.layers.3.blocks.0.layernorm_before.weight', 'audio_model.audio_encoder.layers.2.blocks.2.attention.output.dense.bias', 'audio_model.audio_encoder.layers.2.blocks.5.attention.output.dense.weight', 'text_model.encoder.layer.1.attention.self.key.weight', 'audio_model.audio_encoder.layers.2.blocks.5.attention.self.relative_position_index', 'text_model.encoder.layer.6.attention.self.value.weight', 'audio_model.audio_encoder.layers.1.blocks.1.attention.self.value.bias', 'text_model.encoder.layer.11.output.LayerNorm.bias', 'text_model.encoder.layer.3.attention.self.value.bias', 'text_model.encoder.layer.3.attention.output.LayerNorm.bias', 'audio_model.audio_encoder.layers.0.downsample.norm.weight', 'audio_model.audio_encoder.layers.2.blocks.1.intermediate.dense.bias', 'audio_model.audio_encoder.layers.2.blocks.2.output.dense.weight', 'audio_model.audio_encoder.layers.3.blocks.1.attention.self.query.bias', 'audio_model.audio_encoder.layers.2.blocks.4.output.dense.weight', 'audio_model.audio_encoder.layers.0.blocks.0.layernorm_after.bias', 'audio_model.audio_encoder.layers.2.blocks.2.layernorm_before.bias', 'text_model.encoder.layer.4.intermediate.dense.weight', 'audio_model.audio_encoder.layers.1.downsample.norm.weight', 'audio_model.audio_encoder.layers.2.blocks.4.layernorm_before.bias', 'text_projection.linear2.bias', 'text_model.encoder.layer.4.attention.self.query.weight', 'text_model.encoder.layer.3.attention.output.LayerNorm.weight', 'audio_model.audio_encoder.layers.2.downsample.norm.bias', 'audio_model.audio_encoder.layers.2.blocks.3.attention.self.query.bias', 'text_model.encoder.layer.7.attention.output.LayerNorm.weight', 
'audio_model.audio_encoder.layers.0.blocks.0.attention.output.dense.weight', 'audio_model.audio_encoder.layers.2.blocks.3.layernorm_before.bias', 'text_model.encoder.layer.6.attention.self.key.weight', 'text_projection.linear1.bias', 'text_model.encoder.layer.8.attention.self.value.weight', 'text_model.encoder.layer.8.intermediate.dense.bias', 'audio_model.audio_encoder.layers.1.blocks.1.layernorm_after.bias', 'audio_model.audio_encoder.batch_norm.running_mean', 'text_model.encoder.layer.10.attention.self.key.weight', 'audio_model.audio_encoder.layers.1.blocks.1.attention.self.query.bias', 'audio_model.audio_encoder.layers.2.blocks.2.output.dense.bias', 'text_model.encoder.layer.4.output.dense.bias', 'audio_model.audio_encoder.layers.3.blocks.1.layernorm_before.weight', 'text_model.embeddings.word_embeddings.weight', 'audio_model.audio_encoder.layers.1.blocks.0.output.dense.weight', 'text_model.embeddings.token_type_ids', 'audio_model.audio_encoder.layers.2.blocks.2.attention.self.query.bias', 'text_model.encoder.layer.6.attention.output.dense.bias', 'text_model.encoder.layer.10.intermediate.dense.weight', 'text_model.encoder.layer.5.output.dense.bias', 'text_model.encoder.layer.11.attention.self.value.weight', 'text_model.encoder.layer.1.attention.self.query.bias', 'audio_model.audio_encoder.layers.1.blocks.0.layernorm_after.weight', 'text_model.encoder.layer.11.output.dense.weight', 'text_model.encoder.layer.11.attention.self.query.weight', 'text_model.encoder.layer.2.output.LayerNorm.bias', 'audio_model.audio_encoder.layers.0.blocks.1.attention.output.dense.bias', 'audio_model.audio_encoder.layers.0.blocks.1.output.dense.weight', 'audio_model.audio_encoder.layers.1.blocks.1.output.dense.bias', 'audio_model.audio_encoder.layers.1.blocks.0.attention.self.relative_position_index', 'text_model.encoder.layer.8.output.dense.bias', 'audio_model.audio_encoder.layers.2.blocks.2.attention.self.key.weight', 
'audio_model.audio_encoder.layers.2.blocks.2.layernorm_after.weight', 'audio_model.audio_encoder.layers.0.blocks.1.attention.self.key.weight', 'audio_model.audio_encoder.layers.2.blocks.2.attention.self.value.bias', 'text_model.encoder.layer.0.attention.output.dense.weight', 'text_model.encoder.layer.7.attention.self.value.weight', 'audio_model.audio_encoder.layers.0.blocks.1.layernorm_after.bias', 'audio_model.audio_encoder.layers.0.blocks.0.attention.self.query.bias', 'text_model.encoder.layer.2.attention.self.value.bias', 'audio_projection.linear1.bias', 'text_model.encoder.layer.0.attention.self.query.bias', 'audio_model.audio_encoder.layers.2.blocks.0.attention.self.relative_position_bias_table', 'text_model.encoder.layer.8.attention.self.query.weight', 'audio_model.audio_encoder.layers.2.blocks.2.attention.self.value.weight', 'audio_model.audio_encoder.layers.2.blocks.5.attention.self.query.bias', 'audio_model.audio_encoder.layers.0.blocks.1.attention.self.value.bias', 'audio_model.audio_encoder.layers.0.blocks.1.output.dense.bias', 'text_model.encoder.layer.9.intermediate.dense.weight', 'audio_model.audio_encoder.layers.2.blocks.1.attention.output.dense.bias', 'audio_model.audio_encoder.layers.2.blocks.4.attention.self.key.weight', 'audio_model.audio_encoder.layers.2.blocks.3.attention.self.relative_position_bias_table', 'audio_model.audio_encoder.layers.2.blocks.1.attention.self.query.bias', 'text_model.encoder.layer.8.attention.self.key.weight', 'text_model.encoder.layer.1.attention.self.query.weight', 'audio_model.audio_encoder.layers.2.blocks.1.attention.self.value.weight', 'audio_model.audio_encoder.layers.2.blocks.5.layernorm_before.bias', 'text_model.encoder.layer.4.attention.self.key.weight', 'audio_model.audio_encoder.layers.0.blocks.0.attention.self.relative_position_bias_table', 'audio_model.audio_encoder.layers.3.blocks.1.attention.output.dense.weight', 'audio_model.audio_encoder.norm.bias', 'text_model.encoder.layer.7.attention.self.value.bias', 
'text_model.encoder.layer.3.intermediate.dense.weight', 'text_model.encoder.layer.1.attention.output.dense.bias', 'audio_model.audio_encoder.layers.3.blocks.1.attention.self.value.weight', 'audio_model.audio_encoder.batch_norm.bias', 'text_model.encoder.layer.1.intermediate.dense.bias', 'audio_model.audio_encoder.batch_norm.num_batches_tracked', 'text_model.encoder.layer.1.output.dense.bias', 'audio_model.audio_encoder.layers.0.blocks.0.attention.output.dense.bias', 'audio_model.audio_encoder.layers.1.blocks.1.output.dense.weight', 'audio_model.audio_encoder.layers.3.blocks.0.attention.self.key.weight', 'audio_model.audio_encoder.layers.2.blocks.3.layernorm_after.bias', 'text_model.encoder.layer.6.intermediate.dense.weight', 'text_model.encoder.layer.8.intermediate.dense.weight', 'audio_model.audio_encoder.layers.2.blocks.0.attention.self.query.weight', 'text_model.encoder.layer.1.attention.output.LayerNorm.weight', 'text_model.encoder.layer.9.intermediate.dense.bias', 'audio_model.audio_encoder.layers.3.blocks.0.attention.self.query.bias', 'text_model.encoder.layer.6.attention.output.dense.weight', 'text_model.encoder.layer.6.attention.self.query.bias', 'audio_model.audio_encoder.layers.0.blocks.1.attention.self.relative_position_index', 'audio_model.audio_encoder.layers.3.blocks.0.attention.self.key.bias', 'text_model.encoder.layer.4.attention.self.key.bias', 'text_model.encoder.layer.6.output.LayerNorm.weight', 'audio_model.audio_encoder.layers.2.blocks.2.layernorm_after.bias', 'audio_model.audio_encoder.layers.2.blocks.3.output.dense.weight', 'text_model.encoder.layer.7.attention.self.key.bias', 'text_model.encoder.layer.11.attention.self.query.bias', 'audio_model.audio_encoder.layers.2.blocks.3.attention.self.key.weight', 'text_model.encoder.layer.9.attention.self.key.weight', 'audio_model.audio_encoder.layers.2.blocks.2.attention.self.relative_position_bias_table', 'text_projection.linear2.weight', 'text_model.encoder.layer.0.attention.self.key.weight', 
'text_model.encoder.layer.3.attention.self.value.weight', 'audio_model.audio_encoder.layers.2.blocks.5.output.dense.bias', 'audio_model.audio_encoder.layers.1.blocks.1.attention.output.dense.bias', 'text_model.encoder.layer.10.attention.output.LayerNorm.bias', 'text_model.encoder.layer.2.output.dense.weight', 'logit_scale_t', 'audio_model.audio_encoder.layers.2.blocks.5.layernorm_after.weight', 'audio_model.audio_encoder.layers.1.blocks.0.attention.self.query.bias', 'audio_model.audio_encoder.layers.2.blocks.3.attention.self.value.weight', 'text_model.encoder.layer.8.output.LayerNorm.weight', 'text_model.encoder.layer.9.attention.output.dense.weight', 'text_model.encoder.layer.11.output.LayerNorm.weight', 'text_model.encoder.layer.8.attention.self.query.bias', 'audio_model.audio_encoder.layers.0.blocks.1.layernorm_before.weight', 'audio_model.audio_encoder.layers.0.blocks.1.intermediate.dense.bias', 'audio_model.audio_encoder.layers.2.blocks.2.attention.self.relative_position_index', 'audio_model.audio_encoder.layers.2.blocks.3.attention.self.relative_position_index', 'text_model.embeddings.token_type_embeddings.weight', 'text_model.encoder.layer.4.attention.output.LayerNorm.weight', 'audio_model.audio_encoder.layers.2.blocks.2.intermediate.dense.weight', 'audio_model.audio_encoder.layers.0.blocks.1.attention.output.dense.weight', 'text_model.encoder.layer.7.output.dense.weight', 'audio_model.audio_encoder.layers.2.blocks.2.intermediate.dense.bias', 'audio_model.audio_encoder.layers.1.blocks.0.attention.self.key.bias', 'text_model.encoder.layer.9.output.LayerNorm.bias', 'audio_model.audio_encoder.layers.1.blocks.1.attention.self.key.weight', 'text_model.encoder.layer.9.output.dense.bias', 'audio_model.audio_encoder.layers.2.blocks.0.attention.output.dense.weight', 'audio_model.audio_encoder.layers.2.blocks.4.attention.self.key.bias', 'audio_model.audio_encoder.layers.2.blocks.0.layernorm_before.weight', 
'audio_model.audio_encoder.layers.0.blocks.1.layernorm_before.bias', 'audio_model.audio_encoder.layers.2.blocks.4.attention.self.value.weight', 'text_model.encoder.layer.2.attention.self.value.weight', 'text_model.encoder.layer.0.attention.self.query.weight', 'text_model.encoder.layer.5.attention.output.dense.bias', 'audio_model.audio_encoder.layers.3.blocks.1.layernorm_after.weight', 'audio_model.audio_encoder.layers.3.blocks.1.intermediate.dense.weight', 'text_model.encoder.layer.1.output.LayerNorm.bias', 'audio_model.audio_encoder.layers.2.blocks.5.layernorm_before.weight', 'audio_model.audio_encoder.layers.0.blocks.0.attention.self.query.weight', 'text_model.encoder.layer.2.attention.self.key.weight', 'text_model.encoder.layer.2.intermediate.dense.weight', 'text_model.encoder.layer.9.output.LayerNorm.weight', 'text_model.encoder.layer.11.attention.output.dense.weight', 'audio_model.audio_encoder.layers.1.blocks.0.layernorm_before.weight', 'text_model.encoder.layer.4.output.dense.weight', 'audio_model.audio_encoder.patch_embed.proj.weight', 'audio_model.audio_encoder.layers.1.blocks.1.layernorm_after.weight', 'audio_model.audio_encoder.layers.1.blocks.0.attention.self.value.bias', 'text_model.encoder.layer.4.attention.self.query.bias', 'text_model.encoder.layer.9.attention.output.dense.bias', 'text_model.encoder.layer.11.attention.output.dense.bias', 'text_model.encoder.layer.7.output.LayerNorm.weight', 'audio_model.audio_encoder.layers.2.blocks.5.attention.self.key.bias', 'audio_model.audio_encoder.layers.2.blocks.5.attention.self.value.weight', 'text_model.encoder.layer.5.attention.output.dense.weight', 'text_model.encoder.layer.0.attention.self.key.bias', 'audio_model.audio_encoder.layers.1.blocks.1.intermediate.dense.weight', 'text_model.encoder.layer.8.attention.output.dense.bias', 'text_model.encoder.layer.10.attention.output.LayerNorm.weight', 'audio_model.audio_encoder.layers.1.blocks.0.attention.output.dense.weight', 
'text_model.encoder.layer.5.intermediate.dense.bias', 'audio_model.audio_encoder.layers.1.blocks.0.layernorm_after.bias', 'text_model.encoder.layer.10.output.dense.bias', 'audio_model.audio_encoder.layers.2.downsample.reduction.weight', 'text_model.encoder.layer.5.output.dense.weight', 'text_model.encoder.layer.9.attention.output.LayerNorm.weight', 'audio_model.audio_encoder.layers.2.blocks.1.output.dense.bias', 'text_model.encoder.layer.8.attention.self.key.bias', 'audio_model.audio_encoder.layers.1.blocks.0.attention.self.query.weight', 'text_model.encoder.layer.3.intermediate.dense.bias', 'text_model.encoder.layer.6.output.LayerNorm.bias', 'text_model.encoder.layer.8.attention.output.dense.weight', 'text_model.encoder.layer.6.attention.self.value.bias', 'audio_model.audio_encoder.layers.0.blocks.1.layernorm_after.weight', 'text_model.encoder.layer.3.output.dense.weight', 'text_model.encoder.layer.7.attention.self.query.weight', 'audio_model.audio_encoder.layers.3.blocks.1.layernorm_after.bias', 'audio_model.audio_encoder.layers.3.blocks.0.output.dense.weight', 'audio_model.audio_encoder.layers.3.blocks.1.attention.self.relative_position_bias_table', 'audio_model.audio_encoder.layers.3.blocks.0.layernorm_after.bias', 'text_model.encoder.layer.8.attention.self.value.bias', 'text_model.encoder.layer.4.output.LayerNorm.weight', 'text_model.encoder.layer.7.output.LayerNorm.bias', 'audio_model.audio_encoder.layers.2.blocks.5.output.dense.weight', 'audio_model.audio_encoder.layers.1.blocks.0.attention.self.value.weight', 'audio_model.audio_encoder.layers.2.blocks.1.layernorm_after.bias', 'audio_model.audio_encoder.layers.2.blocks.4.layernorm_before.weight', 'text_model.encoder.layer.2.attention.self.query.weight', 'text_model.encoder.layer.5.attention.self.key.bias', 'audio_model.audio_encoder.layers.2.blocks.1.intermediate.dense.weight', 'text_model.encoder.layer.0.output.LayerNorm.weight', 'audio_model.audio_encoder.layers.3.blocks.1.attention.self.key.weight', 
'audio_model.audio_encoder.layers.0.blocks.0.attention.self.key.weight', 'audio_model.audio_encoder.layers.1.blocks.0.layernorm_before.bias', 'text_model.encoder.layer.5.output.LayerNorm.weight', 'audio_model.audio_encoder.layers.3.blocks.0.layernorm_before.bias', 'audio_model.audio_encoder.layers.2.blocks.1.layernorm_before.weight', 'audio_model.audio_encoder.layers.2.blocks.2.layernorm_before.weight', 'text_model.encoder.layer.1.output.dense.weight', 'text_model.encoder.layer.4.attention.output.LayerNorm.bias', 'text_model.encoder.layer.10.output.dense.weight', 'text_model.encoder.layer.11.attention.self.key.weight', 'audio_model.audio_encoder.layers.2.blocks.4.attention.self.value.bias', 'text_model.encoder.layer.11.attention.self.key.bias', 'text_model.encoder.layer.10.attention.self.value.weight', 'audio_model.audio_encoder.layers.2.blocks.5.intermediate.dense.bias', 'text_model.encoder.layer.5.intermediate.dense.weight', 'audio_model.audio_encoder.layers.2.blocks.3.layernorm_after.weight', 'text_model.pooler.dense.weight', 'audio_model.audio_encoder.layers.2.blocks.5.intermediate.dense.weight', 'audio_model.audio_encoder.layers.2.blocks.0.intermediate.dense.bias', 'text_model.encoder.layer.3.attention.self.query.weight', 'audio_model.audio_encoder.layers.2.blocks.1.attention.self.relative_position_bias_table', 'audio_model.audio_encoder.layers.3.blocks.0.attention.self.value.weight', 'audio_model.audio_encoder.layers.2.blocks.1.layernorm_after.weight', 'audio_model.audio_encoder.layers.1.blocks.1.attention.self.value.weight', 'audio_model.audio_encoder.layers.2.blocks.2.attention.output.dense.weight', 'text_model.encoder.layer.2.attention.output.LayerNorm.bias', 'audio_model.audio_encoder.layers.3.blocks.1.attention.self.query.weight', 'audio_model.audio_encoder.layers.3.blocks.0.attention.self.relative_position_bias_table', 'audio_model.audio_encoder.layers.2.blocks.4.intermediate.dense.weight', 'text_model.pooler.dense.bias', 
'audio_model.audio_encoder.layers.0.blocks.0.intermediate.dense.bias', 'text_model.encoder.layer.2.intermediate.dense.bias', 'text_model.encoder.layer.2.output.LayerNorm.weight', 'audio_model.audio_encoder.layers.2.blocks.0.attention.self.key.bias', 'audio_model.audio_encoder.layers.2.blocks.5.attention.self.relative_position_bias_table', 'audio_model.audio_encoder.layers.2.blocks.1.attention.self.value.bias', 'text_model.encoder.layer.6.output.dense.bias', 'text_model.encoder.layer.1.attention.self.value.bias', 'text_model.encoder.layer.8.output.LayerNorm.bias', 'audio_model.audio_encoder.layers.2.blocks.3.attention.self.key.bias', 'text_model.encoder.layer.5.attention.self.value.bias', 'audio_model.audio_encoder.layers.3.blocks.0.intermediate.dense.bias', 'audio_model.audio_encoder.layers.2.blocks.1.output.dense.weight', 'audio_model.audio_encoder.layers.1.blocks.1.attention.self.relative_position_bias_table', 'text_model.encoder.layer.6.attention.self.query.weight', 'audio_model.audio_encoder.layers.2.blocks.0.layernorm_before.bias', 'text_model.encoder.layer.6.intermediate.dense.bias', 'audio_projection.linear2.weight', 'text_model.encoder.layer.1.attention.output.dense.weight', 'text_model.encoder.layer.2.attention.output.LayerNorm.weight', 'audio_model.audio_encoder.layers.1.downsample.norm.bias', 'text_model.encoder.layer.9.attention.self.value.bias', 'text_model.embeddings.position_ids', 'audio_model.audio_encoder.layers.1.blocks.0.intermediate.dense.bias', 'audio_model.audio_encoder.layers.2.blocks.3.attention.self.value.bias', 'audio_model.audio_encoder.layers.2.blocks.0.attention.self.value.weight', 'text_model.encoder.layer.0.attention.self.value.weight', 'audio_model.audio_encoder.layers.1.blocks.1.layernorm_before.weight', 'text_model.encoder.layer.0.output.dense.weight', 'audio_model.audio_encoder.layers.2.blocks.4.intermediate.dense.bias', 'text_model.encoder.layer.10.attention.output.dense.weight', 
'audio_model.audio_encoder.layers.3.blocks.0.output.dense.bias', 'text_model.encoder.layer.10.attention.self.value.bias', 'audio_model.audio_encoder.layers.3.blocks.1.layernorm_before.bias', 'audio_model.audio_encoder.layers.1.blocks.1.attention.self.query.weight', 'audio_model.audio_encoder.layers.2.blocks.0.attention.output.dense.bias', 'audio_model.audio_encoder.batch_norm.running_var', 'audio_model.audio_encoder.layers.3.blocks.0.intermediate.dense.weight', 'text_model.encoder.layer.9.attention.self.query.weight', 'text_model.encoder.layer.4.attention.output.dense.weight', 'text_model.encoder.layer.8.attention.output.LayerNorm.bias', 'audio_model.audio_encoder.layers.2.blocks.0.attention.self.relative_position_index', 'audio_model.audio_encoder.layers.3.blocks.1.attention.output.dense.bias', 'text_model.encoder.layer.0.attention.self.value.bias', 'audio_projection.linear1.weight', 'text_model.encoder.layer.11.attention.output.LayerNorm.weight', 'text_model.encoder.layer.10.output.LayerNorm.bias', 'text_model.embeddings.LayerNorm.bias', 'text_model.encoder.layer.9.attention.output.LayerNorm.bias', 'audio_model.audio_encoder.layers.3.blocks.1.attention.self.value.bias', 'text_model.encoder.layer.9.attention.self.key.bias', 'audio_model.audio_encoder.layers.1.blocks.0.attention.self.relative_position_bias_table', 'text_model.encoder.layer.0.intermediate.dense.weight', 'audio_model.audio_encoder.layers.1.blocks.0.attention.output.dense.bias', 'audio_model.audio_encoder.layers.2.blocks.4.attention.output.dense.weight', 'audio_model.audio_encoder.layers.1.downsample.reduction.weight', 'audio_model.audio_encoder.layers.0.blocks.0.attention.self.value.bias', 'text_model.encoder.layer.7.output.dense.bias', 'text_model.encoder.layer.5.attention.output.LayerNorm.bias', 'text_model.encoder.layer.3.attention.output.dense.weight', 'text_projection.linear1.weight', 'audio_model.audio_encoder.layers.2.blocks.5.attention.self.value.bias', 
'audio_model.audio_encoder.layers.0.blocks.1.intermediate.dense.weight', 'text_model.encoder.layer.9.attention.self.value.weight', 'audio_model.audio_encoder.layers.0.blocks.0.attention.self.key.bias', 'audio_model.audio_encoder.layers.2.blocks.2.attention.self.key.bias', 'text_model.encoder.layer.11.output.dense.bias', 'audio_model.audio_encoder.layers.2.blocks.4.layernorm_after.weight', 'text_model.encoder.layer.3.attention.output.dense.bias', 'audio_model.audio_encoder.layers.2.blocks.4.attention.self.query.bias', 'text_model.encoder.layer.5.output.LayerNorm.bias', 'text_model.encoder.layer.6.attention.self.key.bias', 'text_model.encoder.layer.4.attention.self.value.weight', 'audio_model.audio_encoder.layers.0.blocks.1.attention.self.query.weight', 'audio_model.audio_encoder.layers.2.blocks.1.attention.self.relative_position_index', 'text_model.embeddings.LayerNorm.weight', 'audio_model.audio_encoder.layers.2.blocks.3.attention.output.dense.weight', 'audio_model.audio_encoder.layers.0.blocks.1.attention.self.key.bias', 'audio_model.audio_encoder.layers.3.blocks.1.attention.self.relative_position_index', 'audio_model.audio_encoder.layers.2.blocks.0.output.dense.weight', 'audio_model.audio_encoder.layers.1.blocks.1.attention.self.relative_position_index', 'text_model.encoder.layer.5.attention.self.query.bias', 'audio_model.audio_encoder.layers.2.blocks.3.output.dense.bias', 'audio_model.audio_encoder.layers.2.blocks.3.intermediate.dense.bias', 'text_model.encoder.layer.1.attention.output.LayerNorm.bias', 'text_model.encoder.layer.3.attention.self.query.bias', 'audio_model.audio_encoder.layers.2.blocks.2.attention.self.query.weight', 'audio_model.audio_encoder.layers.3.blocks.0.attention.self.query.weight', 'audio_model.audio_encoder.layers.3.blocks.1.output.dense.bias', 'text_model.encoder.layer.7.attention.output.LayerNorm.bias', 'audio_model.audio_encoder.layers.2.downsample.norm.weight', 'audio_model.audio_encoder.layers.0.blocks.0.output.dense.weight', 
'text_model.encoder.layer.10.intermediate.dense.bias', 'audio_model.audio_encoder.layers.3.blocks.1.output.dense.weight', 'text_model.encoder.layer.0.output.LayerNorm.bias', 'text_model.encoder.layer.9.output.dense.weight', 'logit_scale_a', 'text_model.encoder.layer.0.attention.output.dense.bias', 'audio_model.audio_encoder.layers.0.blocks.0.layernorm_before.weight', 'text_model.encoder.layer.7.attention.self.key.weight', 'audio_model.audio_encoder.layers.2.blocks.0.attention.self.value.bias', 'audio_model.audio_encoder.layers.2.blocks.1.layernorm_before.bias', 'text_model.encoder.layer.3.output.LayerNorm.bias', 'text_model.encoder.layer.9.attention.self.query.bias', 'text_model.encoder.layer.1.attention.self.value.weight', 'audio_model.audio_encoder.layers.1.blocks.0.attention.self.key.weight', 'audio_projection.linear2.bias', 'text_model.encoder.layer.3.attention.self.key.bias', 'audio_model.audio_encoder.patch_embed.norm.weight', 'text_model.encoder.layer.7.attention.output.dense.weight', 'audio_model.audio_encoder.norm.weight', 'text_model.encoder.layer.10.attention.output.dense.bias', 'audio_model.audio_encoder.layers.0.blocks.0.layernorm_before.bias', 'audio_model.audio_encoder.layers.0.blocks.1.attention.self.relative_position_bias_table', 'audio_model.audio_encoder.layers.2.blocks.1.attention.self.key.weight', 'audio_model.audio_encoder.layers.2.blocks.1.attention.self.key.bias', 'text_model.encoder.layer.3.attention.self.key.weight', 'audio_model.audio_encoder.layers.1.blocks.1.attention.output.dense.weight', 'text_model.encoder.layer.1.intermediate.dense.weight', 'audio_model.audio_encoder.layers.0.downsample.reduction.weight', 'text_model.encoder.layer.5.attention.self.query.weight', 'text_model.encoder.layer.0.attention.output.LayerNorm.bias', 'audio_model.audio_encoder.patch_embed.norm.bias', 'audio_model.audio_encoder.layers.0.blocks.0.intermediate.dense.weight', 'text_model.encoder.layer.4.attention.self.value.bias', 
'audio_model.audio_encoder.layers.2.blocks.1.attention.self.query.weight', 'text_model.encoder.layer.4.intermediate.dense.bias', 'text_model.encoder.layer.7.intermediate.dense.bias']\n", "- This IS expected if you are initializing ClapAudioModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", "- This IS NOT expected if you are initializing ClapAudioModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", "Some weights of ClapAudioModel were not initialized from the model checkpoint at laion/clap-htsat-unfused and are newly initialized: ['audio_encoder.layers.2.blocks.5.layernorm_before.bias', 'audio_encoder.layers.0.blocks.0.layernorm_before.weight', 'audio_encoder.layers.1.blocks.0.attention.self.query.weight', 'audio_encoder.layers.1.blocks.0.attention.self.value.bias', 'audio_encoder.layers.2.blocks.2.attention.self.relative_position_index', 'audio_encoder.layers.3.blocks.1.layernorm_after.bias', 'audio_encoder.layers.2.blocks.1.attention.output.dense.bias', 'audio_encoder.layers.2.blocks.4.attention.self.key.bias', 'audio_encoder.layers.1.blocks.1.attention.self.key.bias', 'audio_encoder.layers.2.blocks.2.attention.self.query.weight', 'audio_encoder.layers.2.blocks.2.attention.self.value.bias', 'audio_encoder.layers.1.downsample.reduction.weight', 'audio_encoder.layers.2.downsample.reduction.weight', 'audio_encoder.layers.2.blocks.2.attention.self.relative_position_bias_table', 'audio_encoder.layers.2.blocks.3.attention.self.key.bias', 'audio_encoder.layers.1.blocks.0.output.dense.weight', 'audio_encoder.layers.2.blocks.5.attention.self.value.weight', 'audio_encoder.layers.0.blocks.0.intermediate.dense.bias', 'audio_encoder.layers.1.blocks.0.attention.self.key.bias', 
'audio_encoder.layers.1.blocks.0.attention.self.relative_position_bias_table', 'audio_encoder.layers.2.blocks.3.layernorm_before.weight', 'audio_encoder.layers.1.blocks.1.attention.self.query.weight', 'audio_encoder.layers.2.blocks.3.attention.self.key.weight', 'audio_encoder.layers.2.blocks.5.intermediate.dense.bias', 'audio_encoder.layers.3.blocks.0.layernorm_before.bias', 'audio_encoder.layers.3.blocks.0.attention.self.key.weight', 'audio_encoder.layers.0.blocks.1.attention.self.key.weight', 'audio_encoder.layers.0.blocks.0.layernorm_after.weight', 'audio_encoder.layers.0.blocks.1.output.dense.weight', 'audio_encoder.layers.2.blocks.1.attention.output.dense.weight', 'audio_encoder.layers.2.blocks.3.attention.output.dense.weight', 'audio_encoder.layers.2.blocks.4.layernorm_after.bias', 'audio_encoder.layers.0.blocks.1.attention.self.value.bias', 'audio_encoder.batch_norm.weight', 'audio_encoder.layers.2.blocks.1.attention.self.key.weight', 'audio_encoder.layers.2.blocks.5.attention.self.query.bias', 'audio_encoder.layers.0.blocks.0.attention.output.dense.weight', 'audio_encoder.layers.2.blocks.0.attention.self.relative_position_index', 'audio_encoder.layers.2.downsample.norm.weight', 'audio_encoder.layers.2.blocks.5.layernorm_before.weight', 'audio_encoder.layers.0.blocks.1.layernorm_before.bias', 'audio_encoder.layers.1.blocks.1.layernorm_after.weight', 'audio_encoder.layers.2.blocks.4.layernorm_before.bias', 'audio_encoder.layers.0.blocks.1.attention.self.key.bias', 'audio_encoder.norm.weight', 'audio_encoder.layers.0.downsample.reduction.weight', 'audio_encoder.layers.2.blocks.5.attention.self.key.bias', 'audio_encoder.layers.2.blocks.1.attention.self.query.bias', 'audio_encoder.layers.2.blocks.0.attention.self.query.weight', 'audio_encoder.layers.2.blocks.5.output.dense.weight', 'audio_encoder.layers.1.blocks.1.attention.self.query.bias', 'audio_encoder.layers.2.blocks.4.layernorm_after.weight', 'audio_encoder.layers.2.blocks.2.layernorm_after.weight', 
'audio_encoder.layers.2.blocks.3.attention.self.relative_position_index', 'audio_encoder.layers.2.blocks.5.attention.output.dense.bias', 'audio_encoder.layers.3.blocks.0.attention.self.key.bias', 'audio_encoder.layers.0.blocks.0.layernorm_after.bias', 'audio_encoder.layers.2.blocks.4.attention.self.value.bias', 'audio_encoder.layers.2.blocks.5.attention.output.dense.weight', 'audio_encoder.layers.3.blocks.1.intermediate.dense.bias', 'audio_encoder.layers.2.downsample.norm.bias', 'audio_encoder.layers.2.blocks.1.layernorm_after.bias', 'audio_encoder.layers.2.blocks.4.attention.self.query.bias', 'audio_encoder.layers.2.blocks.2.output.dense.bias', 'audio_encoder.layers.2.blocks.4.attention.output.dense.weight', 'audio_encoder.layers.2.blocks.5.intermediate.dense.weight', 'audio_encoder.layers.2.blocks.0.intermediate.dense.bias', 'audio_encoder.layers.1.blocks.0.layernorm_after.bias', 'audio_encoder.layers.2.blocks.0.attention.self.relative_position_bias_table', 'audio_encoder.layers.3.blocks.0.output.dense.bias', 'audio_encoder.layers.2.blocks.0.layernorm_before.bias', 'audio_encoder.layers.2.blocks.0.attention.output.dense.weight', 'audio_encoder.layers.0.blocks.1.attention.output.dense.weight', 'audio_encoder.layers.1.blocks.0.attention.output.dense.weight', 'audio_encoder.layers.0.blocks.0.attention.self.relative_position_bias_table', 'audio_encoder.layers.3.blocks.0.attention.output.dense.weight', 'audio_encoder.layers.3.blocks.0.intermediate.dense.bias', 'audio_encoder.layers.2.blocks.0.output.dense.weight', 'audio_encoder.layers.3.blocks.1.attention.self.value.weight', 'audio_encoder.layers.2.blocks.2.layernorm_before.bias', 'audio_encoder.layers.1.blocks.0.layernorm_before.weight', 'audio_encoder.layers.2.blocks.4.intermediate.dense.weight', 'audio_encoder.layers.0.blocks.0.layernorm_before.bias', 'audio_encoder.layers.2.blocks.2.layernorm_before.weight', 'audio_encoder.batch_norm.num_batches_tracked', 'audio_encoder.layers.1.blocks.0.layernorm_after.weight', 
'audio_encoder.layers.2.blocks.4.attention.self.relative_position_index', 'audio_encoder.layers.1.blocks.0.output.dense.bias', 'audio_encoder.layers.1.blocks.1.attention.self.value.weight', 'audio_encoder.layers.2.blocks.0.layernorm_before.weight', 'audio_encoder.layers.3.blocks.0.attention.output.dense.bias', 'audio_encoder.layers.3.blocks.1.attention.output.dense.bias', 'audio_encoder.layers.0.blocks.1.output.dense.bias', 'audio_encoder.layers.2.blocks.3.attention.self.query.weight', 'audio_encoder.layers.2.blocks.3.layernorm_after.weight', 'audio_encoder.layers.3.blocks.1.output.dense.bias', 'audio_encoder.layers.1.blocks.1.layernorm_before.weight', 'audio_encoder.layers.2.blocks.3.intermediate.dense.bias', 'audio_encoder.layers.0.blocks.0.attention.self.query.weight', 'audio_encoder.layers.1.blocks.0.attention.self.relative_position_index', 'audio_encoder.layers.2.blocks.4.output.dense.weight', 'audio_encoder.layers.2.blocks.2.intermediate.dense.bias', 'audio_encoder.layers.2.blocks.3.layernorm_after.bias', 'audio_encoder.layers.2.blocks.4.attention.output.dense.bias', 'audio_encoder.layers.2.blocks.5.layernorm_after.weight', 'audio_encoder.layers.3.blocks.1.output.dense.weight', 'audio_encoder.layers.2.blocks.2.attention.self.key.bias', 'audio_encoder.layers.2.blocks.2.intermediate.dense.weight', 'audio_encoder.layers.0.downsample.norm.weight', 'audio_encoder.layers.0.blocks.0.intermediate.dense.weight', 'audio_encoder.layers.3.blocks.0.layernorm_after.weight', 'audio_encoder.layers.2.blocks.1.attention.self.relative_position_bias_table', 'audio_encoder.layers.0.blocks.0.attention.self.relative_position_index', 'audio_encoder.layers.2.blocks.0.layernorm_after.bias', 'audio_encoder.layers.0.blocks.0.attention.self.value.bias', 'audio_encoder.layers.2.blocks.3.attention.self.query.bias', 'audio_encoder.layers.2.blocks.5.attention.self.relative_position_bias_table', 'audio_encoder.layers.2.blocks.0.attention.self.key.bias', 
'audio_encoder.layers.2.blocks.1.layernorm_before.bias', 'audio_encoder.batch_norm.running_var', 'audio_encoder.layers.2.blocks.2.attention.output.dense.bias', 'audio_encoder.layers.2.blocks.4.layernorm_before.weight', 'audio_encoder.layers.3.blocks.0.attention.self.relative_position_bias_table', 'audio_encoder.layers.2.blocks.3.attention.output.dense.bias', 'audio_encoder.layers.1.blocks.1.output.dense.bias', 'audio_encoder.layers.2.blocks.3.attention.self.relative_position_bias_table', 'audio_encoder.layers.2.blocks.2.attention.self.query.bias', 'audio_encoder.layers.2.blocks.0.attention.output.dense.bias', 'audio_encoder.layers.0.blocks.0.attention.output.dense.bias', 'audio_encoder.layers.1.blocks.1.output.dense.weight', 'audio_encoder.layers.2.blocks.3.intermediate.dense.weight', 'audio_encoder.layers.0.blocks.1.attention.self.relative_position_index', 'audio_encoder.patch_embed.norm.weight', 'audio_encoder.layers.2.blocks.1.attention.self.query.weight', 'audio_encoder.layers.1.blocks.1.attention.self.key.weight', 'audio_encoder.layers.2.blocks.1.attention.self.value.bias', 'audio_encoder.layers.0.blocks.1.intermediate.dense.weight', 'audio_encoder.layers.0.blocks.1.layernorm_after.bias', 'audio_encoder.layers.1.blocks.0.attention.output.dense.bias', 'audio_encoder.layers.0.blocks.1.layernorm_after.weight', 'audio_encoder.layers.0.blocks.0.output.dense.weight', 'audio_encoder.layers.2.blocks.1.layernorm_before.weight', 'audio_encoder.layers.2.blocks.3.layernorm_before.bias', 'audio_encoder.layers.0.blocks.0.attention.self.value.weight', 'audio_encoder.layers.2.blocks.1.output.dense.bias', 'audio_encoder.layers.2.blocks.3.output.dense.weight', 'audio_encoder.layers.1.blocks.0.intermediate.dense.bias', 'audio_encoder.layers.2.blocks.3.output.dense.bias', 'audio_encoder.layers.3.blocks.0.attention.self.relative_position_index', 'audio_encoder.layers.2.blocks.5.attention.self.query.weight', 'audio_encoder.layers.2.blocks.1.intermediate.dense.bias', 
'audio_encoder.layers.0.blocks.1.attention.output.dense.bias', 'audio_encoder.layers.2.blocks.2.attention.self.key.weight', 'audio_encoder.layers.2.blocks.4.output.dense.bias', 'audio_encoder.layers.3.blocks.1.attention.self.query.weight', 'audio_encoder.patch_embed.proj.bias', 'audio_encoder.layers.0.blocks.0.attention.self.query.bias', 'audio_encoder.layers.1.blocks.1.attention.self.relative_position_bias_table', 'audio_encoder.layers.2.blocks.4.attention.self.relative_position_bias_table', 'audio_encoder.layers.2.blocks.3.attention.self.value.weight', 'audio_encoder.layers.1.blocks.0.layernorm_before.bias', 'audio_encoder.layers.0.blocks.1.attention.self.query.weight', 'audio_encoder.layers.1.blocks.1.attention.self.relative_position_index', 'audio_encoder.layers.3.blocks.1.layernorm_after.weight', 'audio_encoder.layers.3.blocks.1.attention.self.value.bias', 'audio_encoder.layers.2.blocks.5.attention.self.relative_position_index', 'audio_encoder.layers.3.blocks.1.layernorm_before.bias', 'audio_encoder.layers.3.blocks.1.attention.self.relative_position_bias_table', 'audio_encoder.layers.1.blocks.0.attention.self.key.weight', 'audio_encoder.layers.0.downsample.norm.bias', 'audio_encoder.layers.2.blocks.5.output.dense.bias', 'audio_encoder.layers.3.blocks.0.layernorm_before.weight', 'audio_encoder.layers.2.blocks.0.attention.self.query.bias', 'audio_encoder.layers.3.blocks.0.output.dense.weight', 'audio_encoder.batch_norm.bias', 'audio_encoder.layers.2.blocks.0.attention.self.value.bias', 'audio_encoder.layers.2.blocks.1.intermediate.dense.weight', 'audio_encoder.layers.1.blocks.1.attention.output.dense.bias', 'audio_encoder.layers.0.blocks.1.attention.self.relative_position_bias_table', 'audio_encoder.layers.1.blocks.1.intermediate.dense.weight', 'audio_encoder.layers.1.blocks.1.attention.output.dense.weight', 'audio_encoder.layers.1.blocks.1.intermediate.dense.bias', 'audio_encoder.layers.2.blocks.1.layernorm_after.weight', 
'audio_encoder.layers.3.blocks.1.attention.self.key.bias', 'audio_encoder.layers.1.blocks.1.layernorm_after.bias', 'audio_encoder.layers.3.blocks.1.intermediate.dense.weight', 'audio_encoder.layers.2.blocks.5.attention.self.value.bias', 'audio_encoder.layers.3.blocks.0.attention.self.query.bias', 'audio_encoder.layers.2.blocks.5.attention.self.key.weight', 'audio_encoder.layers.3.blocks.1.attention.self.key.weight', 'audio_encoder.batch_norm.running_mean', 'audio_encoder.norm.bias', 'audio_encoder.layers.3.blocks.0.attention.self.query.weight', 'audio_encoder.layers.3.blocks.1.attention.self.relative_position_index', 'audio_encoder.layers.2.blocks.1.attention.self.key.bias', 'audio_encoder.layers.2.blocks.4.attention.self.key.weight', 'audio_encoder.layers.0.blocks.1.attention.self.value.weight', 'audio_encoder.layers.3.blocks.0.layernorm_after.bias', 'audio_encoder.layers.0.blocks.1.intermediate.dense.bias', 'audio_encoder.layers.2.blocks.0.attention.self.value.weight', 'audio_encoder.layers.0.blocks.1.attention.self.query.bias', 'audio_encoder.layers.0.blocks.0.output.dense.bias', 'audio_encoder.layers.2.blocks.2.output.dense.weight', 'audio_encoder.layers.1.downsample.norm.bias', 'audio_encoder.layers.2.blocks.3.attention.self.value.bias', 'audio_encoder.patch_embed.proj.weight', 'audio_encoder.layers.2.blocks.0.attention.self.key.weight', 'audio_encoder.layers.2.blocks.1.attention.self.relative_position_index', 'audio_encoder.patch_embed.norm.bias', 'audio_encoder.layers.1.blocks.0.attention.self.query.bias', 'audio_encoder.layers.1.blocks.1.layernorm_before.bias', 'audio_encoder.layers.2.blocks.4.attention.self.value.weight', 'audio_encoder.layers.2.blocks.4.attention.self.query.weight', 'audio_encoder.layers.2.blocks.2.attention.self.value.weight', 'audio_encoder.layers.3.blocks.0.attention.self.value.bias', 'audio_encoder.layers.2.blocks.1.output.dense.weight', 'audio_encoder.layers.3.blocks.1.layernorm_before.weight', 
'audio_encoder.layers.2.blocks.1.attention.self.value.weight', 'audio_encoder.layers.1.blocks.0.attention.self.value.weight', 'audio_encoder.layers.1.blocks.1.attention.self.value.bias', 'audio_encoder.layers.1.blocks.0.intermediate.dense.weight', 'audio_encoder.layers.2.blocks.0.output.dense.bias', 'audio_encoder.layers.0.blocks.1.layernorm_before.weight', 'audio_encoder.layers.2.blocks.2.attention.output.dense.weight', 'audio_encoder.layers.0.blocks.0.attention.self.key.weight', 'audio_encoder.layers.2.blocks.2.layernorm_after.bias', 'audio_encoder.layers.2.blocks.0.layernorm_after.weight', 'audio_encoder.layers.2.blocks.5.layernorm_after.bias', 'audio_encoder.layers.2.blocks.4.intermediate.dense.bias', 'audio_encoder.layers.3.blocks.0.intermediate.dense.weight', 'audio_encoder.layers.3.blocks.0.attention.self.value.weight', 'audio_encoder.layers.2.blocks.0.intermediate.dense.weight', 'audio_encoder.layers.3.blocks.1.attention.output.dense.weight', 'audio_encoder.layers.1.downsample.norm.weight', 'audio_encoder.layers.0.blocks.0.attention.self.key.bias', 'audio_encoder.layers.3.blocks.1.attention.self.query.bias']\n", "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" ] } ], "source": [ "audio_model = ClapAudioModel.from_pretrained(\"laion/clap-htsat-unfused\")" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "It is strongly recommended to pass the `sampling_rate` argument to this function. 
Failing to do so can result in silent errors that might be hard to debug.\n" ] } ], "source": [ "audio_inputs = processor(audios=audio_sample, return_tensors='pt')" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "dict_keys(['input_features', 'is_longer'])" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "audio_inputs.keys()" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Some weights of the model checkpoint at laion/clap-htsat-unfused were not used when initializing ClapAudioModelWithProjection: ['text_model.encoder.layer.8.output.dense.bias', 'text_model.encoder.layer.2.attention.self.value.weight', 'text_model.encoder.layer.0.attention.self.query.weight', 'text_model.encoder.layer.0.attention.output.dense.weight', 'text_model.encoder.layer.7.attention.self.value.weight', 'text_model.encoder.layer.5.attention.output.dense.bias', 'text_model.encoder.layer.0.attention.output.LayerNorm.weight', 'text_model.encoder.layer.2.attention.self.value.bias', 'text_model.encoder.layer.6.attention.self.query.weight', 'text_model.encoder.layer.6.intermediate.dense.bias', 'text_model.encoder.layer.0.attention.self.query.bias', 'text_model.encoder.layer.1.attention.output.dense.weight', 'text_model.encoder.layer.2.attention.output.LayerNorm.weight', 'text_model.encoder.layer.1.output.LayerNorm.bias', 'text_model.encoder.layer.7.attention.output.dense.bias', 'text_model.encoder.layer.9.attention.self.value.bias', 'text_model.embeddings.position_ids', 'text_model.encoder.layer.0.intermediate.dense.bias', 'text_model.encoder.layer.8.attention.self.query.weight', 'text_model.encoder.layer.0.attention.self.value.weight', 'text_model.encoder.layer.0.output.dense.weight', 'text_model.encoder.layer.1.output.LayerNorm.weight', 'text_model.encoder.layer.5.attention.output.LayerNorm.weight', 
'text_model.encoder.layer.2.attention.self.key.weight', 'text_model.encoder.layer.10.attention.output.dense.weight', 'text_model.encoder.layer.10.attention.self.value.bias', 'text_model.encoder.layer.2.intermediate.dense.weight', 'text_model.encoder.layer.9.intermediate.dense.weight', 'text_model.encoder.layer.11.intermediate.dense.weight', 'text_model.encoder.layer.10.attention.self.query.bias', 'text_model.encoder.layer.9.output.LayerNorm.weight', 'text_model.encoder.layer.11.attention.output.dense.weight', 'text_model.encoder.layer.2.attention.output.dense.bias', 'text_model.encoder.layer.11.attention.self.value.bias', 'text_model.encoder.layer.4.attention.output.dense.bias', 'text_model.encoder.layer.7.attention.self.query.bias', 'text_model.encoder.layer.11.attention.output.LayerNorm.bias', 'text_model.encoder.layer.2.attention.output.dense.weight', 'text_model.encoder.layer.10.attention.self.key.bias', 'text_model.encoder.layer.4.output.dense.weight', 'text_model.encoder.layer.9.attention.self.query.weight', 'text_model.encoder.layer.8.attention.self.key.weight', 'text_model.encoder.layer.4.attention.output.dense.weight', 'text_model.encoder.layer.8.attention.output.LayerNorm.bias', 'text_model.encoder.layer.6.attention.output.LayerNorm.weight', 'text_model.embeddings.position_embeddings.weight', 'text_model.encoder.layer.1.attention.self.query.weight', 'text_model.encoder.layer.0.attention.self.value.bias', 'text_model.encoder.layer.5.attention.self.value.weight', 'text_model.encoder.layer.10.attention.self.query.weight', 'text_model.encoder.layer.4.attention.self.key.weight', 'text_model.encoder.layer.4.attention.self.query.bias', 'text_model.encoder.layer.3.output.dense.bias', 'text_model.encoder.layer.9.attention.output.dense.bias', 'text_model.encoder.layer.2.output.dense.bias', 'text_model.encoder.layer.11.attention.output.LayerNorm.weight', 'text_model.encoder.layer.10.output.LayerNorm.bias', 'text_model.encoder.layer.11.attention.output.dense.bias', 
'text_model.embeddings.LayerNorm.bias', 'text_model.encoder.layer.7.output.LayerNorm.weight', 'text_model.encoder.layer.9.attention.output.LayerNorm.bias', 'text_model.encoder.layer.5.attention.output.dense.weight', 'text_model.encoder.layer.1.attention.self.key.bias', 'text_model.encoder.layer.0.attention.self.key.bias', 'text_model.encoder.layer.9.attention.self.key.bias', 'text_model.encoder.layer.8.attention.output.dense.bias', 'text_model.encoder.layer.0.intermediate.dense.weight', 'text_model.encoder.layer.2.attention.self.query.bias', 'text_model.encoder.layer.5.attention.self.key.weight', 'text_model.encoder.layer.7.attention.self.value.bias', 'text_model.encoder.layer.11.intermediate.dense.bias', 'text_model.encoder.layer.10.attention.output.LayerNorm.weight', 'text_model.encoder.layer.5.intermediate.dense.bias', 'text_model.encoder.layer.6.attention.output.LayerNorm.bias', 'text_model.encoder.layer.10.output.dense.bias', 'text_model.encoder.layer.3.intermediate.dense.weight', 'text_model.encoder.layer.5.output.dense.weight', 'text_model.encoder.layer.9.attention.output.LayerNorm.weight', 'text_model.encoder.layer.3.output.LayerNorm.weight', 'text_model.encoder.layer.1.attention.output.dense.bias', 'text_model.encoder.layer.0.output.dense.bias', 'text_model.encoder.layer.1.intermediate.dense.bias', 'text_model.encoder.layer.8.attention.self.key.bias', 'text_model.encoder.layer.10.output.LayerNorm.weight', 'text_model.encoder.layer.2.attention.self.key.bias', 'text_model.encoder.layer.6.output.dense.weight', 'text_model.encoder.layer.8.attention.output.LayerNorm.weight', 'text_model.encoder.layer.4.output.LayerNorm.bias', 'text_model.encoder.layer.1.output.dense.bias', 'text_model.encoder.layer.3.intermediate.dense.bias', 'text_model.encoder.layer.8.output.dense.weight', 'text_model.encoder.layer.6.output.LayerNorm.bias', 'text_model.encoder.layer.8.attention.output.dense.weight', 'text_model.encoder.layer.6.attention.self.value.bias', 
'text_model.encoder.layer.7.output.dense.bias', 'text_model.encoder.layer.5.attention.output.LayerNorm.bias', 'text_model.encoder.layer.3.attention.output.dense.weight', 'text_model.encoder.layer.6.intermediate.dense.weight', 'text_projection.linear1.weight', 'text_model.encoder.layer.3.output.dense.weight', 'text_model.encoder.layer.8.intermediate.dense.weight', 'text_model.encoder.layer.7.attention.self.query.weight', 'text_model.encoder.layer.1.attention.output.LayerNorm.weight', 'text_model.encoder.layer.9.intermediate.dense.bias', 'text_model.encoder.layer.6.attention.output.dense.weight', 'text_model.encoder.layer.8.attention.self.value.bias', 'text_model.encoder.layer.9.attention.self.value.weight', 'text_model.encoder.layer.4.output.LayerNorm.weight', 'text_model.encoder.layer.7.output.LayerNorm.bias', 'text_model.encoder.layer.11.output.dense.bias', 'text_model.encoder.layer.6.attention.self.query.bias', 'text_model.encoder.layer.2.attention.self.query.weight', 'text_model.encoder.layer.5.attention.self.key.bias', 'text_model.encoder.layer.3.attention.output.dense.bias', 'text_model.encoder.layer.5.output.LayerNorm.bias', 'text_model.encoder.layer.4.attention.self.key.bias', 'text_model.encoder.layer.7.intermediate.dense.weight', 'text_model.encoder.layer.6.output.LayerNorm.weight', 'text_model.encoder.layer.6.attention.self.key.bias', 'text_model.encoder.layer.1.attention.self.key.weight', 'text_model.encoder.layer.4.attention.self.value.weight', 'text_model.encoder.layer.6.attention.self.value.weight', 'text_model.encoder.layer.7.attention.self.key.bias', 'text_model.encoder.layer.11.attention.self.query.bias', 'text_model.encoder.layer.0.output.LayerNorm.weight', 'text_model.encoder.layer.11.output.LayerNorm.bias', 'text_model.embeddings.LayerNorm.weight', 'text_model.encoder.layer.5.output.LayerNorm.weight', 'text_model.encoder.layer.3.attention.self.value.bias', 'text_model.encoder.layer.3.attention.output.LayerNorm.bias', 
'text_model.encoder.layer.9.attention.self.key.weight', 'text_projection.linear2.weight', 'text_model.encoder.layer.1.output.dense.weight', 'text_model.encoder.layer.4.attention.output.LayerNorm.bias', 'text_model.encoder.layer.5.attention.self.query.bias', 'text_model.encoder.layer.10.output.dense.weight', 'text_model.encoder.layer.1.attention.output.LayerNorm.bias', 'text_model.encoder.layer.0.attention.self.key.weight', 'text_model.encoder.layer.11.attention.self.key.weight', 'text_model.encoder.layer.11.attention.self.key.bias', 'text_model.encoder.layer.10.attention.self.value.weight', 'text_model.encoder.layer.3.attention.self.value.weight', 'text_model.encoder.layer.3.attention.self.query.bias', 'text_model.encoder.layer.5.intermediate.dense.weight', 'text_model.encoder.layer.7.attention.output.LayerNorm.bias', 'text_model.encoder.layer.4.intermediate.dense.weight', 'text_model.pooler.dense.weight', 'text_model.encoder.layer.10.attention.output.LayerNorm.bias', 'text_projection.linear2.bias', 'text_model.encoder.layer.4.attention.self.query.weight', 'text_model.encoder.layer.2.output.dense.weight', 'text_model.encoder.layer.3.attention.self.query.weight', 'logit_scale_t', 'text_model.encoder.layer.3.attention.output.LayerNorm.weight', 'text_model.encoder.layer.10.intermediate.dense.bias', 'text_model.encoder.layer.0.output.LayerNorm.bias', 'text_model.encoder.layer.7.attention.output.LayerNorm.weight', 'text_model.encoder.layer.9.output.dense.weight', 'logit_scale_a', 'text_model.encoder.layer.6.attention.self.key.weight', 'text_projection.linear1.bias', 'text_model.encoder.layer.8.output.LayerNorm.weight', 'text_model.encoder.layer.9.attention.output.dense.weight', 'text_model.encoder.layer.0.attention.output.dense.bias', 'text_model.encoder.layer.11.output.LayerNorm.weight', 'text_model.encoder.layer.7.attention.self.key.weight', 'text_model.encoder.layer.3.output.LayerNorm.bias', 'text_model.encoder.layer.9.attention.self.query.bias', 
'text_model.encoder.layer.8.attention.self.value.weight', 'text_model.encoder.layer.8.attention.self.query.bias', 'text_model.encoder.layer.1.attention.self.value.weight', 'text_model.encoder.layer.8.intermediate.dense.bias', 'text_model.embeddings.token_type_embeddings.weight', 'text_model.encoder.layer.2.attention.output.LayerNorm.bias', 'text_model.encoder.layer.4.attention.output.LayerNorm.weight', 'text_model.encoder.layer.3.attention.self.key.bias', 'text_model.encoder.layer.7.output.dense.weight', 'text_model.encoder.layer.7.attention.output.dense.weight', 'text_model.encoder.layer.10.attention.self.key.weight', 'text_model.encoder.layer.10.attention.output.dense.bias', 'text_model.pooler.dense.bias', 'text_model.encoder.layer.9.output.LayerNorm.bias', 'text_model.encoder.layer.4.output.dense.bias', 'text_model.embeddings.word_embeddings.weight', 'text_model.encoder.layer.2.intermediate.dense.bias', 'text_model.encoder.layer.2.output.LayerNorm.weight', 'text_model.encoder.layer.3.attention.self.key.weight', 'text_model.embeddings.token_type_ids', 'text_model.encoder.layer.9.output.dense.bias', 'text_model.encoder.layer.1.intermediate.dense.weight', 'text_model.encoder.layer.5.attention.self.query.weight', 'text_model.encoder.layer.6.attention.output.dense.bias', 'text_model.encoder.layer.10.intermediate.dense.weight', 'text_model.encoder.layer.5.output.dense.bias', 'text_model.encoder.layer.6.output.dense.bias', 'text_model.encoder.layer.1.attention.self.value.bias', 'text_model.encoder.layer.11.attention.self.value.weight', 'text_model.encoder.layer.1.attention.self.query.bias', 'text_model.encoder.layer.0.attention.output.LayerNorm.bias', 'text_model.encoder.layer.11.output.dense.weight', 'text_model.encoder.layer.8.output.LayerNorm.bias', 'text_model.encoder.layer.11.attention.self.query.weight', 'text_model.encoder.layer.2.output.LayerNorm.bias', 'text_model.encoder.layer.4.attention.self.value.bias', 
'text_model.encoder.layer.5.attention.self.value.bias', 'text_model.encoder.layer.4.intermediate.dense.bias', 'text_model.encoder.layer.7.intermediate.dense.bias']\n", "- This IS expected if you are initializing ClapAudioModelWithProjection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", "- This IS NOT expected if you are initializing ClapAudioModelWithProjection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n" ] } ], "source": [ "audio_prediction_model = ClapAudioModelWithProjection.from_pretrained(\"laion/clap-htsat-unfused\")" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "from transformers import CLAP_PRETRAINED_MODEL_ARCHIVE_LIST" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "pred_outputs = audio_prediction_model(**audio_inputs)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "odict_keys(['audio_embeds', 'last_hidden_state'])\n" ] } ], "source": [ "print(pred_outputs.keys())" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "tensor([[-7.9439e-02, 2.3935e-01, 3.5846e-01, 2.3282e-01, 3.2885e-02,\n", " -1.3462e-01, -3.8317e-01, -5.1633e-02, -1.8881e-03, 1.4470e-01,\n", " 1.5499e-01, -5.4301e-03, 6.2472e-02, 1.1324e-01, -1.3372e-01,\n", " -7.4772e-02, 9.4837e-02, 8.4011e-02, 1.6877e-01, 3.9500e-01,\n", " 3.7919e-01, 4.1101e-01, -2.2619e-01, 2.6106e-01, 7.4054e-02,\n", " 6.7051e-02, -4.6973e-02, 6.6229e-02, 7.9341e-02, -6.2507e-02,\n", " -3.3600e-02, -1.1131e-02, 2.9025e-01, -1.0942e-01, -6.2347e-02,\n", " -4.0657e-02, 3.6304e-02, 3.1982e-02, -4.5375e-02, -3.1386e-01,\n", " 
1.8173e-01, -1.8351e-01, -3.7267e-01, -2.3658e-01, 5.7322e-02,\n", " -2.3966e-04, -1.6086e-01, -1.8752e-01, 3.9222e-01, -2.7590e-01,\n", " 2.3425e-01, 5.2686e-02, 1.1264e-01, 1.1232e-01, -9.5137e-02,\n", " 1.2332e-01, 3.2688e-01, -3.4500e-02, 3.2825e-01, 1.3025e-01,\n", " 1.6063e-01, -2.2567e-01, -1.5062e-01, -3.4971e-01, 2.3765e-01,\n", " -1.4173e-01, 4.0352e-02, 3.6305e-02, -1.8367e-01, -4.1525e-02,\n", " -1.0561e-01, 8.5074e-02, 1.6497e-01, 7.2744e-02, 2.4250e-01,\n", " 7.6457e-02, 3.6339e-02, -2.8053e-02, 2.4540e-01, 6.7366e-02,\n", " 2.6156e-02, -5.1896e-02, -9.8726e-02, -2.8503e-02, -1.9302e-01,\n", " 2.2511e-01, -1.0908e-01, -2.2838e-01, -9.8607e-02, -1.0154e-01,\n", " -2.3456e-01, -2.7670e-01, -4.9566e-02, 2.6733e-02, 2.0004e-01,\n", " -6.8596e-02, -8.9384e-02, 7.1089e-02, -2.2231e-01, 1.7140e-01,\n", " -5.8561e-02, 3.0899e-01, -1.6953e-01, -5.2759e-02, -1.7031e-01,\n", " 2.3570e-01, 1.3576e-01, -1.1275e-01, 2.9381e-03, 3.3434e-02,\n", " -3.3410e-02, -1.3793e-01, 1.5729e-01, 2.8559e-02, -8.7023e-02,\n", " 2.4856e-02, 4.8679e-01, 6.0615e-02, 7.7202e-02, 1.0199e-02,\n", " -6.4608e-02, 3.9223e-02, -2.6339e-01, -1.8986e-01, -8.0283e-02,\n", " -1.7649e-01, -2.2097e-01, 1.2193e-01, 1.8228e-02, -2.0657e-01,\n", " 8.1092e-02, 3.3672e-01, 2.0459e-01, -8.5797e-03, -1.0478e-01,\n", " -3.8157e-01, 8.3080e-02, 7.8855e-02, -1.4378e-01, 1.5124e-01,\n", " 5.6382e-02, -4.6423e-01, 3.2397e-02, -1.1613e-01, 9.0706e-02,\n", " -3.0016e-02, 2.4319e-02, 2.0078e-01, 1.8076e-01, 2.3837e-02,\n", " -9.1573e-03, -1.0591e-01, 1.2748e-01, 1.3915e-01, -3.7778e-02,\n", " 6.4304e-02, 2.1202e-02, 2.5914e-01, 1.4441e-04, -3.0887e-01,\n", " -1.2447e-02, 3.0046e-01, 1.3040e-02, -1.6957e-01, 4.1824e-01,\n", " -1.3606e-01, -1.8628e-01, 6.8618e-02, 2.2592e-04, 1.3478e-01,\n", " 1.8527e-01, 7.3277e-02, -3.6254e-01, -1.7390e-01, -1.2030e-01,\n", " -3.6745e-01, 1.3862e-02, -8.1357e-02, 2.4806e-01, -1.0433e-01,\n", " -1.3357e-01, 9.9876e-02, 2.9834e-02, 6.2674e-02, -5.8856e-02,\n", " -2.2021e-01, 
-1.4425e-01, 9.3113e-02, 1.0902e-01, 2.2895e-01,\n", " -9.8941e-03, 2.2718e-01, -6.9984e-02, 2.3527e-02, -2.7299e-01,\n", " 2.0526e-01, -1.9043e-02, -7.9463e-03, -1.0445e-01, 2.2310e-01,\n", " 4.7663e-02, -1.6041e-01, -2.2408e-01, -4.2629e-01, 1.1160e-01,\n", " -1.9787e-01, 2.6180e-02, 1.1099e-01, 9.3960e-02, 6.4820e-02,\n", " 1.0989e-01, 3.0832e-01, 6.3034e-02, -3.0771e-01, 2.6873e-01,\n", " 4.8435e-01, 3.2067e-01, 2.1355e-01, 7.5662e-03, 7.0944e-02,\n", " -7.1933e-02, -3.1401e-01, -1.4177e-02, -3.6178e-02, 1.1808e-01,\n", " 4.2822e-01, -4.9530e-02, 4.3974e-01, -2.0961e-01, 4.0624e-02,\n", " -4.0375e-01, 2.6199e-01, -2.3909e-02, 1.2895e-01, -1.0948e-01,\n", " 2.4784e-01, 4.0905e-01, -1.3688e-01, -7.3436e-02, -2.9621e-01,\n", " 2.2015e-01, 1.7667e-01, 6.6805e-02, -2.8429e-01, -1.4649e-01,\n", " 7.8780e-02, -1.2312e-01, 6.5464e-02, -2.2593e-02, 5.2024e-03,\n", " -4.4953e-01, 1.8504e-01, 1.1099e-02, -1.9407e-01, 1.4623e-01,\n", " -8.9826e-02, -5.6768e-02, 1.7040e-01, -2.0102e-01, 1.6553e-01,\n", " 9.1688e-02, -1.9440e-01, -2.4224e-01, 1.9018e-01, 3.1085e-02,\n", " 6.3201e-02, -6.5082e-03, 3.1631e-03, 4.7110e-01, -3.8786e-01,\n", " -2.5261e-01, -9.6393e-02, -3.5396e-01, -2.1949e-01, -8.3369e-02,\n", " 3.6209e-02, -1.9848e-01, 9.4422e-04, 1.4923e-01, -1.8843e-01,\n", " -4.0047e-01, 1.5708e-01, 4.0416e-01, 3.2648e-02, 3.5443e-02,\n", " -3.0803e-01, 1.4596e-02, 2.6652e-01, 5.5408e-03, 1.7115e-01,\n", " 1.4370e-01, 7.0149e-03, 3.3037e-01, 7.7654e-03, 3.1744e-03,\n", " 3.9382e-01, 2.6399e-01, -2.5528e-02, 2.2868e-01, -7.3748e-02,\n", " -1.1027e-01, -4.3519e-01, -1.2246e-01, 8.4128e-02, -3.1929e-01,\n", " 1.8925e-01, 9.2093e-02, 2.0888e-01, 8.9843e-02, -1.1410e-01,\n", " -1.4896e-01, 2.5390e-01, -2.2502e-01, 2.4021e-01, 3.7792e-01,\n", " 1.9964e-01, -3.7881e-02, -1.1248e-01, -8.0672e-02, 1.2847e-01,\n", " -1.2800e-01, -4.9597e-02, 7.6726e-02, -4.7070e-03, 2.4474e-02,\n", " 7.4982e-02, 1.8485e-01, -9.9503e-02, -5.9090e-03, 3.3012e-01,\n", " -3.7215e-02, -1.7957e-02, 
-2.1178e-01, -4.5230e-01, 7.5678e-02,\n", " 2.3351e-02, -3.6213e-01, 8.2206e-02, -3.0158e-02, 1.1234e-01,\n", " -1.3789e-02, 3.9936e-01, -1.8714e-01, -2.4982e-01, 2.2756e-01,\n", " 3.8608e-02, -5.2066e-01, -8.4050e-02, -1.8992e-01, -2.3252e-02,\n", " -6.0313e-02, -1.5967e-01, 1.8432e-01, -7.2125e-02, -3.4616e-01,\n", " 1.9646e-01, -1.9097e-01, 1.1512e-01, -2.2584e-01, -1.6386e-01,\n", " 2.3796e-01, -2.1440e-01, -1.2167e-01, 2.0676e-01, -1.6048e-02,\n", " -1.3429e-01, 1.7426e-01, -3.7598e-01, 3.5905e-01, -4.3189e-01,\n", " -1.3992e-01, -1.1989e-01, -3.5211e-04, 1.8878e-01, -2.9083e-02,\n", " -4.9331e-02, 9.3431e-02, 9.2405e-02, 2.9913e-01, 3.1829e-01,\n", " -1.2953e-01, 2.8767e-01, 1.9968e-01, -9.2311e-03, 2.3972e-02,\n", " -7.4590e-02, 2.5610e-01, -1.4576e-01, 1.1054e-01, 3.5539e-01,\n", " -3.1111e-01, 3.5485e-01, 5.3879e-02, -3.6065e-02, -9.2444e-02,\n", " -9.3248e-02, -1.1581e-04, -3.0920e-02, -4.3172e-01, -4.4891e-02,\n", " 8.6953e-02, -5.8378e-02, -1.0187e-01, 3.2385e-02, -5.6473e-02,\n", " 2.9112e-01, 9.7588e-03, -7.0611e-02, 1.7078e-01, -2.4374e-01,\n", " 7.8363e-03, 2.5003e-01, 7.5210e-02, -1.5873e-01, -1.3510e-01,\n", " -1.0092e-01, 9.0069e-02, 3.3861e-01, 3.5654e-02, 1.0590e-02,\n", " -1.6469e-01, 3.1928e-02, -6.0115e-02, 3.4949e-02, 1.2862e-01,\n", " -7.5423e-03, 4.4245e-01, 2.2163e-01, 2.6847e-01, -1.9958e-01,\n", " -1.3675e-01, -2.0460e-02, -9.8616e-03, 2.6683e-01, 1.5905e-01,\n", " 7.5346e-02, 3.2783e-01, 4.0557e-01, 7.0211e-02, 8.3440e-02,\n", " -2.9874e-02, -5.6754e-03, -4.0466e-01, -1.0981e-02, -1.8448e-01,\n", " -2.0213e-01, 1.0912e-01, 2.2499e-01, -1.6820e-02, -2.3222e-01,\n", " -1.6001e-01, 1.9098e-01, -1.3378e-01, 1.9698e-01, 2.2365e-01,\n", " -1.8971e-02, -6.6757e-02, 1.2196e-01, -2.4815e-02, -1.4876e-03,\n", " -5.3409e-02, 4.1121e-02, -4.9759e-02, -1.6394e-01, -3.4938e-01,\n", " 2.2144e-01, 9.8808e-02, -7.8104e-02, 1.4979e-01, 1.2916e-01,\n", " -3.8191e-02, -2.3274e-01, 3.3678e-01, -1.5813e-01, -2.8666e-01,\n", " -6.5294e-02, -1.8640e-01, 
-1.8699e-01, -2.1927e-01, 3.7397e-01,\n", " -3.2194e-01, 1.0670e-01, -5.7381e-02, -6.8237e-03, -8.3549e-02,\n", " 1.1985e-01, -2.0708e-02, 1.0392e-01, 1.4207e-01, 1.2231e-01,\n", " -3.3227e-02, 7.1793e-02, 8.2369e-02, 1.8579e-02, 9.3190e-02,\n", " 4.0782e-02, 2.2149e-01, 1.7831e-01, -1.9339e-03, -2.5347e-01,\n", " 1.7894e-01, 5.1940e-02, 1.7891e-02, -2.3192e-01, 2.1578e-01,\n", " 3.0329e-01, 4.5969e-02, -2.7595e-01, 8.8878e-02, -2.3627e-01,\n", " -8.3862e-02, -8.7476e-02]], grad_fn=)" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pred_outputs.audio_embeds" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "False\n" ] } ], "source": [ "import torch\n", "audio_embeds = pred_outputs.audio_embeds\n", "print(torch.equal(outputs.audio_embeds, audio_embeds))" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Some weights of the model checkpoint at laion/clap-htsat-unfused were not used when initializing ClapTextModelWithProjection: ['audio_model.audio_encoder.layers.2.blocks.2.attention.self.key.weight', 'audio_model.audio_encoder.layers.0.blocks.0.attention.self.value.weight', 'audio_model.audio_encoder.layers.2.blocks.2.layernorm_after.weight', 'audio_model.audio_encoder.layers.0.blocks.1.layernorm_before.bias', 'audio_model.audio_encoder.layers.2.blocks.0.attention.self.key.weight', 'audio_model.audio_encoder.layers.2.blocks.4.attention.self.value.weight', 'audio_model.audio_encoder.layers.0.blocks.1.attention.self.key.weight', 'audio_model.audio_encoder.layers.2.blocks.1.output.dense.weight', 'audio_model.audio_encoder.layers.2.blocks.2.attention.self.value.bias', 'audio_model.audio_encoder.layers.1.blocks.1.attention.self.relative_position_bias_table', 'audio_model.audio_encoder.layers.0.blocks.0.layernorm_after.weight', 
'audio_model.audio_encoder.layers.0.blocks.1.layernorm_after.bias', 'audio_model.audio_encoder.layers.3.blocks.1.layernorm_after.weight', 'audio_model.audio_encoder.layers.0.blocks.0.attention.self.query.bias', 'audio_model.audio_encoder.layers.1.blocks.0.intermediate.dense.weight', 'audio_projection.linear1.bias', 'audio_model.audio_encoder.layers.2.blocks.0.layernorm_before.bias', 'audio_projection.linear2.weight', 'audio_model.audio_encoder.layers.0.downsample.norm.bias', 'audio_model.audio_encoder.layers.2.blocks.4.attention.self.relative_position_index', 'audio_model.audio_encoder.layers.2.blocks.4.attention.self.query.weight', 'audio_model.audio_encoder.layers.2.blocks.5.layernorm_after.bias', 'audio_model.audio_encoder.layers.2.blocks.0.attention.self.relative_position_bias_table', 'audio_model.audio_encoder.layers.2.blocks.3.layernorm_before.weight', 'audio_model.audio_encoder.layers.3.blocks.1.intermediate.dense.weight', 'audio_model.audio_encoder.layers.1.downsample.norm.bias', 'audio_model.audio_encoder.layers.1.blocks.1.intermediate.dense.bias', 'audio_model.audio_encoder.layers.1.blocks.0.intermediate.dense.bias', 'audio_model.audio_encoder.layers.2.blocks.3.attention.self.value.bias', 'audio_model.audio_encoder.layers.2.blocks.5.layernorm_before.weight', 'audio_model.audio_encoder.layers.0.blocks.1.attention.self.value.weight', 'audio_model.audio_encoder.layers.3.blocks.0.attention.self.value.bias', 'audio_model.audio_encoder.layers.0.blocks.0.attention.self.query.weight', 'audio_model.audio_encoder.layers.2.blocks.2.attention.self.value.weight', 'audio_model.audio_encoder.layers.2.blocks.5.attention.self.query.weight', 'audio_model.audio_encoder.layers.2.blocks.0.attention.self.value.weight', 'audio_model.audio_encoder.layers.2.blocks.4.attention.self.relative_position_bias_table', 'audio_model.audio_encoder.layers.2.blocks.5.attention.self.query.bias', 'audio_model.audio_encoder.layers.1.blocks.1.layernorm_before.weight', 
'audio_model.audio_encoder.layers.2.blocks.4.intermediate.dense.bias', 'audio_model.audio_encoder.layers.3.blocks.0.output.dense.bias', 'audio_model.audio_encoder.layers.0.blocks.1.attention.self.value.bias', 'audio_model.audio_encoder.layers.3.blocks.1.layernorm_before.bias', 'audio_model.audio_encoder.layers.2.blocks.5.attention.output.dense.bias', 'audio_model.audio_encoder.layers.0.blocks.1.output.dense.bias', 'audio_model.audio_encoder.layers.1.blocks.1.attention.self.query.weight', 'audio_model.audio_encoder.layers.1.blocks.0.layernorm_before.weight', 'audio_model.audio_encoder.layers.0.blocks.0.attention.self.relative_position_index', 'audio_model.audio_encoder.layers.2.blocks.1.attention.output.dense.bias', 'audio_model.audio_encoder.layers.2.blocks.4.attention.self.key.weight', 'audio_model.audio_encoder.layers.1.blocks.1.attention.self.key.bias', 'audio_model.audio_encoder.layers.2.blocks.3.attention.self.relative_position_bias_table', 'audio_model.audio_encoder.layers.2.blocks.0.attention.output.dense.bias', 'audio_model.audio_encoder.batch_norm.running_var', 'audio_model.audio_encoder.layers.3.blocks.1.intermediate.dense.bias', 'audio_model.audio_encoder.layers.3.blocks.0.intermediate.dense.weight', 'audio_model.audio_encoder.layers.2.blocks.1.attention.self.query.bias', 'audio_model.audio_encoder.patch_embed.proj.weight', 'audio_model.audio_encoder.layers.2.blocks.0.attention.self.relative_position_index', 'audio_model.audio_encoder.layers.3.blocks.0.attention.self.relative_position_index', 'audio_model.audio_encoder.layers.2.blocks.1.attention.self.value.weight', 'audio_model.audio_encoder.layers.2.blocks.5.layernorm_before.bias', 'audio_model.audio_encoder.layers.1.blocks.1.layernorm_after.weight', 'audio_model.audio_encoder.layers.1.blocks.0.attention.self.value.bias', 'audio_model.audio_encoder.layers.3.blocks.1.attention.output.dense.bias', 'audio_projection.linear1.weight', 'audio_model.audio_encoder.layers.2.blocks.0.layernorm_after.bias', 
'audio_model.audio_encoder.layers.0.blocks.0.attention.self.relative_position_bias_table', 'audio_model.audio_encoder.layers.3.blocks.1.attention.output.dense.weight', 'audio_model.audio_encoder.layers.0.blocks.0.output.dense.bias', 'audio_model.audio_encoder.layers.2.blocks.5.attention.self.key.bias', 'audio_model.audio_encoder.layers.2.blocks.5.attention.self.value.weight', 'audio_model.audio_encoder.layers.2.blocks.4.attention.output.dense.bias', 'audio_model.audio_encoder.layers.3.blocks.1.attention.self.value.bias', 'audio_model.audio_encoder.layers.1.blocks.0.attention.self.relative_position_bias_table', 'audio_model.audio_encoder.layers.3.blocks.1.attention.self.key.bias', 'audio_model.audio_encoder.layers.1.blocks.1.layernorm_before.bias', 'audio_model.audio_encoder.layers.2.blocks.0.layernorm_after.weight', 'audio_model.audio_encoder.layers.1.blocks.1.intermediate.dense.weight', 'audio_model.audio_encoder.norm.bias', 'audio_model.audio_encoder.layers.2.blocks.0.intermediate.dense.weight', 'audio_model.audio_encoder.layers.3.blocks.0.attention.output.dense.bias', 'audio_model.audio_encoder.layers.2.blocks.0.attention.self.query.bias', 'audio_model.audio_encoder.layers.1.blocks.0.attention.output.dense.weight', 'audio_model.audio_encoder.layers.1.blocks.0.layernorm_after.bias', 'audio_model.audio_encoder.layers.2.blocks.0.output.dense.bias', 'audio_model.audio_encoder.layers.2.blocks.1.attention.output.dense.weight', 'audio_model.audio_encoder.layers.3.blocks.0.layernorm_after.weight', 'audio_model.audio_encoder.layers.2.downsample.reduction.weight', 'audio_model.audio_encoder.layers.3.blocks.1.attention.self.value.weight', 'audio_model.audio_encoder.batch_norm.bias', 'audio_model.audio_encoder.layers.1.blocks.0.attention.output.dense.bias', 'audio_model.audio_encoder.batch_norm.weight', 'audio_model.audio_encoder.layers.0.blocks.1.attention.self.query.bias', 'audio_model.audio_encoder.layers.2.blocks.1.output.dense.bias', 
'audio_model.audio_encoder.layers.2.blocks.4.attention.output.dense.weight', 'audio_model.audio_encoder.layers.1.downsample.reduction.weight', 'audio_model.audio_encoder.layers.0.blocks.0.attention.self.value.bias', 'audio_model.audio_encoder.batch_norm.num_batches_tracked', 'audio_model.audio_encoder.layers.0.blocks.0.attention.output.dense.bias', 'audio_model.audio_encoder.layers.1.blocks.1.output.dense.weight', 'audio_model.audio_encoder.layers.3.blocks.0.attention.self.key.weight', 'audio_model.audio_encoder.layers.2.blocks.3.intermediate.dense.weight', 'audio_model.audio_encoder.layers.2.blocks.3.layernorm_after.bias', 'audio_model.audio_encoder.layers.1.blocks.0.attention.self.query.weight', 'audio_model.audio_encoder.layers.0.blocks.1.layernorm_after.weight', 'audio_model.audio_encoder.layers.1.blocks.0.output.dense.bias', 'audio_model.audio_encoder.layers.2.blocks.3.attention.output.dense.bias', 'audio_model.audio_encoder.layers.2.blocks.0.attention.self.query.weight', 'audio_model.audio_encoder.layers.2.blocks.4.output.dense.bias', 'audio_model.audio_encoder.layers.3.blocks.1.layernorm_after.bias', 'audio_model.audio_encoder.patch_embed.proj.bias', 'audio_model.audio_encoder.layers.3.blocks.0.output.dense.weight', 'audio_model.audio_encoder.layers.3.blocks.0.attention.self.query.bias', 'audio_model.audio_encoder.layers.2.blocks.5.attention.self.value.bias', 'audio_model.audio_encoder.layers.3.blocks.1.attention.self.relative_position_bias_table', 'audio_model.audio_encoder.layers.3.blocks.0.layernorm_after.bias', 'audio_model.audio_encoder.layers.2.blocks.4.layernorm_after.bias', 'audio_model.audio_encoder.layers.0.blocks.1.intermediate.dense.weight', 'audio_model.audio_encoder.layers.0.blocks.0.attention.self.key.bias', 'audio_model.audio_encoder.layers.2.blocks.5.output.dense.weight', 'audio_model.audio_encoder.layers.2.blocks.2.attention.self.key.bias', 'audio_model.audio_encoder.layers.1.blocks.0.attention.self.value.weight', 
'audio_model.audio_encoder.layers.2.blocks.1.layernorm_after.bias', 'audio_model.audio_encoder.layers.2.blocks.4.layernorm_before.weight', 'audio_model.audio_encoder.layers.2.blocks.4.layernorm_after.weight', 'audio_model.audio_encoder.layers.2.blocks.5.attention.self.key.weight', 'audio_model.audio_encoder.layers.2.blocks.1.intermediate.dense.weight', 'audio_model.audio_encoder.layers.0.blocks.1.attention.self.relative_position_index', 'audio_model.audio_encoder.layers.3.blocks.0.attention.self.key.bias', 'audio_model.audio_encoder.layers.2.blocks.4.attention.self.query.bias', 'audio_model.audio_encoder.layers.2.blocks.3.attention.self.query.weight', 'audio_model.audio_encoder.layers.3.blocks.0.attention.output.dense.weight', 'audio_model.audio_encoder.layers.3.blocks.0.layernorm_before.weight', 'audio_model.audio_encoder.layers.2.blocks.2.attention.output.dense.bias', 'audio_model.audio_encoder.layers.2.blocks.5.attention.output.dense.weight', 'audio_model.audio_encoder.layers.2.blocks.2.layernorm_after.bias', 'audio_model.audio_encoder.layers.2.blocks.3.output.dense.weight', 'audio_model.audio_encoder.layers.2.blocks.1.attention.self.relative_position_index', 'audio_model.audio_encoder.layers.2.blocks.5.attention.self.relative_position_index', 'audio_model.audio_encoder.layers.0.blocks.1.attention.self.query.weight', 'audio_model.audio_encoder.layers.1.blocks.1.attention.self.value.bias', 'audio_model.audio_encoder.layers.3.blocks.1.attention.self.key.weight', 'audio_model.audio_encoder.layers.0.blocks.0.attention.self.key.weight', 'audio_model.audio_encoder.layers.1.blocks.0.layernorm_before.bias', 'audio_model.audio_encoder.layers.2.blocks.3.attention.self.key.weight', 'audio_model.audio_encoder.layers.3.blocks.0.layernorm_before.bias', 'audio_model.audio_encoder.layers.0.downsample.norm.weight', 'audio_model.audio_encoder.layers.2.blocks.3.attention.output.dense.weight', 'audio_model.audio_encoder.layers.0.blocks.1.attention.self.key.bias', 
'audio_model.audio_encoder.layers.2.blocks.2.attention.self.relative_position_bias_table', 'audio_model.audio_encoder.layers.2.blocks.1.intermediate.dense.bias', 'audio_model.audio_encoder.layers.3.blocks.1.attention.self.relative_position_index', 'audio_model.audio_encoder.layers.2.blocks.1.layernorm_before.weight', 'audio_model.audio_encoder.layers.2.blocks.0.output.dense.weight', 'audio_model.audio_encoder.layers.1.blocks.1.attention.self.relative_position_index', 'audio_model.audio_encoder.layers.2.blocks.2.output.dense.weight', 'audio_model.audio_encoder.layers.2.blocks.2.layernorm_before.weight', 'audio_model.audio_encoder.layers.2.blocks.3.output.dense.bias', 'audio_model.audio_encoder.layers.2.blocks.3.intermediate.dense.bias', 'audio_model.audio_encoder.layers.3.blocks.1.attention.self.query.bias', 'audio_model.audio_encoder.layers.2.blocks.4.output.dense.weight', 'audio_model.audio_encoder.layers.2.blocks.4.attention.self.value.bias', 'audio_model.audio_encoder.layers.2.blocks.2.attention.self.query.weight', 'audio_model.audio_encoder.layers.2.blocks.5.intermediate.dense.bias', 'audio_model.audio_encoder.layers.3.blocks.0.attention.self.query.weight', 'audio_model.audio_encoder.layers.0.blocks.0.layernorm_after.bias', 'audio_model.audio_encoder.layers.2.blocks.3.layernorm_after.weight', 'audio_model.audio_encoder.layers.2.blocks.5.output.dense.bias', 'audio_model.audio_encoder.layers.3.blocks.1.output.dense.bias', 'audio_model.audio_encoder.layers.2.blocks.2.layernorm_before.bias', 'audio_model.audio_encoder.layers.2.blocks.4.layernorm_before.bias', 'audio_model.audio_encoder.layers.1.downsample.norm.weight', 'audio_model.audio_encoder.layers.1.blocks.1.attention.output.dense.bias', 'audio_model.audio_encoder.layers.2.blocks.5.intermediate.dense.weight', 'audio_model.audio_encoder.layers.2.downsample.norm.weight', 'audio_model.audio_encoder.layers.2.blocks.0.intermediate.dense.bias', 'audio_model.audio_encoder.layers.0.blocks.0.output.dense.weight', 
'logit_scale_t', 'audio_model.audio_encoder.layers.2.blocks.5.layernorm_after.weight', 'audio_model.audio_encoder.layers.2.downsample.norm.bias', 'audio_model.audio_encoder.layers.3.blocks.1.output.dense.weight', 'audio_model.audio_encoder.layers.1.blocks.0.attention.self.query.bias', 'audio_model.audio_encoder.layers.2.blocks.3.attention.self.value.weight', 'audio_model.audio_encoder.layers.2.blocks.1.attention.self.relative_position_bias_table', 'audio_model.audio_encoder.layers.2.blocks.3.attention.self.query.bias', 'audio_model.audio_encoder.layers.0.blocks.0.attention.output.dense.weight', 'audio_model.audio_encoder.layers.2.blocks.3.layernorm_before.bias', 'audio_model.audio_encoder.layers.3.blocks.0.attention.self.value.weight', 'logit_scale_a', 'audio_model.audio_encoder.layers.0.blocks.0.layernorm_before.weight', 'audio_model.audio_encoder.layers.2.blocks.0.attention.self.value.bias', 'audio_model.audio_encoder.layers.2.blocks.1.layernorm_before.bias', 'audio_model.audio_encoder.layers.2.blocks.1.layernorm_after.weight', 'audio_model.audio_encoder.layers.0.blocks.1.intermediate.dense.bias', 'audio_model.audio_encoder.layers.0.blocks.1.layernorm_before.weight', 'audio_model.audio_encoder.layers.1.blocks.0.attention.self.key.weight', 'audio_model.audio_encoder.layers.1.blocks.1.attention.self.value.weight', 'audio_model.audio_encoder.layers.2.blocks.2.attention.output.dense.weight', 'audio_model.audio_encoder.layers.2.blocks.2.attention.self.relative_position_index', 'audio_model.audio_encoder.layers.2.blocks.3.attention.self.relative_position_index', 'audio_projection.linear2.bias', 'audio_model.audio_encoder.layers.1.blocks.1.layernorm_after.bias', 'audio_model.audio_encoder.layers.2.blocks.2.intermediate.dense.weight', 'audio_model.audio_encoder.layers.0.blocks.1.attention.output.dense.weight', 'audio_model.audio_encoder.patch_embed.norm.weight', 'audio_model.audio_encoder.layers.3.blocks.1.attention.self.query.weight', 
'audio_model.audio_encoder.batch_norm.running_mean', 'audio_model.audio_encoder.norm.weight', 'audio_model.audio_encoder.layers.3.blocks.0.attention.self.relative_position_bias_table', 'audio_model.audio_encoder.layers.0.blocks.0.layernorm_before.bias', 'audio_model.audio_encoder.layers.2.blocks.4.intermediate.dense.weight', 'audio_model.audio_encoder.layers.1.blocks.1.attention.self.query.bias', 'audio_model.audio_encoder.layers.2.blocks.2.output.dense.bias', 'audio_model.audio_encoder.layers.2.blocks.2.intermediate.dense.bias', 'audio_model.audio_encoder.layers.0.blocks.0.intermediate.dense.bias', 'audio_model.audio_encoder.layers.1.blocks.0.attention.self.key.bias', 'audio_model.audio_encoder.layers.0.blocks.1.attention.self.relative_position_bias_table', 'audio_model.audio_encoder.layers.1.blocks.1.attention.self.key.weight', 'audio_model.audio_encoder.layers.2.blocks.1.attention.self.key.weight', 'audio_model.audio_encoder.layers.3.blocks.1.layernorm_before.weight', 'audio_model.audio_encoder.layers.2.blocks.1.attention.self.key.bias', 'audio_model.audio_encoder.layers.1.blocks.0.output.dense.weight', 'audio_model.audio_encoder.layers.1.blocks.1.attention.output.dense.weight', 'audio_model.audio_encoder.layers.2.blocks.0.attention.self.key.bias', 'audio_model.audio_encoder.layers.2.blocks.5.attention.self.relative_position_bias_table', 'audio_model.audio_encoder.layers.2.blocks.2.attention.self.query.bias', 'audio_model.audio_encoder.layers.0.downsample.reduction.weight', 'audio_model.audio_encoder.layers.2.blocks.1.attention.self.value.bias', 'audio_model.audio_encoder.patch_embed.norm.bias', 'audio_model.audio_encoder.layers.1.blocks.0.layernorm_after.weight', 'audio_model.audio_encoder.layers.0.blocks.0.intermediate.dense.weight', 'audio_model.audio_encoder.layers.2.blocks.3.attention.self.key.bias', 'audio_model.audio_encoder.layers.2.blocks.0.attention.output.dense.weight', 'audio_model.audio_encoder.layers.2.blocks.4.attention.self.key.bias', 
'audio_model.audio_encoder.layers.0.blocks.1.attention.output.dense.bias', 'audio_model.audio_encoder.layers.0.blocks.1.output.dense.weight', 'audio_model.audio_encoder.layers.2.blocks.0.layernorm_before.weight', 'audio_model.audio_encoder.layers.2.blocks.1.attention.self.query.weight', 'audio_model.audio_encoder.layers.1.blocks.1.output.dense.bias', 'audio_model.audio_encoder.layers.1.blocks.0.attention.self.relative_position_index', 'audio_model.audio_encoder.layers.3.blocks.0.intermediate.dense.bias']\n", "- This IS expected if you are initializing ClapTextModelWithProjection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", "- This IS NOT expected if you are initializing ClapTextModelWithProjection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n" ] } ], "source": [ "from transformers import ClapTextModelWithProjection\n", "text_model = ClapTextModelWithProjection.from_pretrained(\"laion/clap-htsat-unfused\")" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "text_inputs = processor(text=input_text, return_tensors='pt', padding=True)" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "text_embeds = text_model(**text_inputs).text_embeds" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "audio_embeds = audio_embeds / audio_embeds.norm(p=2, dim=-1, keepdim=True)\n", "text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)\n" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": [ "logits_per_text = torch.matmul(text_embeds, audio_embeds.t()) * model.logit_scale_t.exp()\n", "logits_per_audio = torch.matmul(audio_embeds, 
text_embeds.t()) * model.logit_scale_a.exp()\n" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "tensor([[0.0008, 0.1800, 0.8192]], grad_fn=)\n" ] } ], "source": [ "logits_per_audio\n", "probs = logits_per_audio.softmax(dim=-1)\n", "print(probs)" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['Hospital elevator', 'Water Drop', 'Sound of water dropping']\n" ] } ], "source": [ "print(input_text)" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "tensor([[-1.8583e-02, 5.5992e-02, 8.3854e-02, 5.4465e-02, 7.6927e-03,\n", " -3.1491e-02, -8.9635e-02, -1.2078e-02, -4.4169e-04, 3.3850e-02,\n", " 3.6257e-02, -1.2702e-03, 1.4614e-02, 2.6491e-02, -3.1281e-02,\n", " -1.7491e-02, 2.2185e-02, 1.9653e-02, 3.9481e-02, 9.2402e-02,\n", " 8.8703e-02, 9.6147e-02, -5.2913e-02, 6.1069e-02, 1.7323e-02,\n", " 1.5685e-02, -1.0988e-02, 1.5493e-02, 1.8560e-02, -1.4622e-02,\n", " -7.8600e-03, -2.6038e-03, 6.7897e-02, -2.5596e-02, -1.4585e-02,\n", " -9.5110e-03, 8.4926e-03, 7.4816e-03, -1.0615e-02, -7.3421e-02,\n", " 4.2511e-02, -4.2928e-02, -8.7179e-02, -5.5344e-02, 1.3409e-02,\n", " -5.6063e-05, -3.7629e-02, -4.3867e-02, 9.1751e-02, -6.4542e-02,\n", " 5.4798e-02, 1.2325e-02, 2.6350e-02, 2.6276e-02, -2.2255e-02,\n", " 2.8848e-02, 7.6467e-02, -8.0706e-03, 7.6788e-02, 3.0469e-02,\n", " 3.7577e-02, -5.2791e-02, -3.5234e-02, -8.1808e-02, 5.5592e-02,\n", " -3.3154e-02, 9.4394e-03, 8.4927e-03, -4.2966e-02, -9.7140e-03,\n", " -2.4706e-02, 1.9901e-02, 3.8590e-02, 1.7017e-02, 5.6728e-02,\n", " 1.7885e-02, 8.5008e-03, -6.5625e-03, 5.7406e-02, 1.5759e-02,\n", " 6.1186e-03, -1.2140e-02, -2.3095e-02, -6.6678e-03, -4.5153e-02,\n", " 5.2661e-02, -2.5516e-02, -5.3425e-02, -2.3067e-02, -2.3754e-02,\n", " -5.4871e-02, -6.4729e-02, -1.1595e-02, 
6.2536e-03, 4.6794e-02,\n", " -1.6047e-02, -2.0909e-02, 1.6630e-02, -5.2005e-02, 4.0094e-02,\n", " -1.3699e-02, 7.2281e-02, -3.9658e-02, -1.2342e-02, -3.9841e-02,\n", " 5.5138e-02, 3.1759e-02, -2.6376e-02, 6.8731e-04, 7.8212e-03,\n", " -7.8155e-03, -3.2266e-02, 3.6796e-02, 6.6807e-03, -2.0357e-02,\n", " 5.8145e-03, 1.1388e-01, 1.4180e-02, 1.8060e-02, 2.3859e-03,\n", " -1.5114e-02, 9.1754e-03, -6.1615e-02, -4.4413e-02, -1.8781e-02,\n", " -4.1286e-02, -5.1692e-02, 2.8523e-02, 4.2640e-03, -4.8323e-02,\n", " 1.8970e-02, 7.8768e-02, 4.7860e-02, -2.0071e-03, -2.4511e-02,\n", " -8.9261e-02, 1.9435e-02, 1.8447e-02, -3.3635e-02, 3.5379e-02,\n", " 1.3189e-02, -1.0860e-01, 7.5787e-03, -2.7167e-02, 2.1219e-02,\n", " -7.0217e-03, 5.6890e-03, 4.6970e-02, 4.2286e-02, 5.5763e-03,\n", " -2.1422e-03, -2.4775e-02, 2.9822e-02, 3.2551e-02, -8.8373e-03,\n", " 1.5043e-02, 4.9598e-03, 6.0621e-02, 3.3781e-05, -7.2254e-02,\n", " -2.9117e-03, 7.0287e-02, 3.0504e-03, -3.9667e-02, 9.7839e-02,\n", " -3.1829e-02, -4.3577e-02, 1.6052e-02, 5.2850e-05, 3.1529e-02,\n", " 4.3340e-02, 1.7142e-02, -8.4809e-02, -4.0681e-02, -2.8142e-02,\n", " -8.5958e-02, 3.2428e-03, -1.9032e-02, 5.8029e-02, -2.4406e-02,\n", " -3.1245e-02, 2.3364e-02, 6.9791e-03, 1.4661e-02, -1.3768e-02,\n", " -5.1513e-02, -3.3744e-02, 2.1782e-02, 2.5502e-02, 5.3557e-02,\n", " -2.3145e-03, 5.3144e-02, -1.6371e-02, 5.5036e-03, -6.3860e-02,\n", " 4.8016e-02, -4.4548e-03, -1.8589e-03, -2.4433e-02, 5.2190e-02,\n", " 1.1150e-02, -3.7524e-02, -5.2418e-02, -9.9722e-02, 2.6107e-02,\n", " -4.6289e-02, 6.1242e-03, 2.5964e-02, 2.1980e-02, 1.5163e-02,\n", " 2.5706e-02, 7.2124e-02, 1.4745e-02, -7.1983e-02, 6.2864e-02,\n", " 1.1330e-01, 7.5015e-02, 4.9956e-02, 1.7700e-03, 1.6596e-02,\n", " -1.6827e-02, -7.3457e-02, -3.3165e-03, -8.4632e-03, 2.7623e-02,\n", " 1.0017e-01, -1.1587e-02, 1.0287e-01, -4.9033e-02, 9.5033e-03,\n", " -9.4450e-02, 6.1287e-02, -5.5930e-03, 3.0166e-02, -2.5611e-02,\n", " 5.7977e-02, 9.5688e-02, -3.2020e-02, -1.7179e-02, 
-6.9293e-02,\n", " 5.1499e-02, 4.1328e-02, 1.5628e-02, -6.6505e-02, -3.4269e-02,\n", " 1.8429e-02, -2.8803e-02, 1.5314e-02, -5.2851e-03, 1.2170e-03,\n", " -1.0516e-01, 4.3287e-02, 2.5964e-03, -4.5398e-02, 3.4208e-02,\n", " -2.1013e-02, -1.3280e-02, 3.9861e-02, -4.7025e-02, 3.8722e-02,\n", " 2.1449e-02, -4.5475e-02, -5.6666e-02, 4.4490e-02, 7.2717e-03,\n", " 1.4785e-02, -1.5225e-03, 7.3994e-04, 1.1020e-01, -9.0732e-02,\n", " -5.9094e-02, -2.2549e-02, -8.2801e-02, -5.1345e-02, -1.9502e-02,\n", " 8.4704e-03, -4.6431e-02, 2.2088e-04, 3.4909e-02, -4.4079e-02,\n", " -9.3681e-02, 3.6747e-02, 9.4546e-02, 7.6373e-03, 8.2912e-03,\n", " -7.2058e-02, 3.4145e-03, 6.2346e-02, 1.2962e-03, 4.0038e-02,\n", " 3.3616e-02, 1.6410e-03, 7.7283e-02, 1.8166e-03, 7.4259e-04,\n", " 9.2126e-02, 6.1755e-02, -5.9717e-03, 5.3494e-02, -1.7252e-02,\n", " -2.5797e-02, -1.0180e-01, -2.8646e-02, 1.9680e-02, -7.4691e-02,\n", " 4.4272e-02, 2.1543e-02, 4.8863e-02, 2.1017e-02, -2.6691e-02,\n", " -3.4846e-02, 5.9396e-02, -5.2638e-02, 5.6193e-02, 8.8406e-02,\n", " 4.6701e-02, -8.8615e-03, -2.6312e-02, -1.8872e-02, 3.0053e-02,\n", " -2.9942e-02, -1.1602e-02, 1.7948e-02, -1.1011e-03, 5.7253e-03,\n", " 1.7541e-02, 4.3241e-02, -2.3277e-02, -1.3823e-03, 7.7224e-02,\n", " -8.7057e-03, -4.2007e-03, -4.9542e-02, -1.0581e-01, 1.7703e-02,\n", " 5.4625e-03, -8.4712e-02, 1.9230e-02, -7.0548e-03, 2.6280e-02,\n", " -3.2257e-03, 9.3422e-02, -4.3778e-02, -5.8439e-02, 5.3234e-02,\n", " 9.0316e-03, -1.2180e-01, -1.9662e-02, -4.4428e-02, -5.4394e-03,\n", " -1.4109e-02, -3.7352e-02, 4.3117e-02, -1.6872e-02, -8.0977e-02,\n", " 4.5958e-02, -4.4673e-02, 2.6929e-02, -5.2830e-02, -3.8332e-02,\n", " 5.5667e-02, -5.0154e-02, -2.8463e-02, 4.8368e-02, -3.7541e-03,\n", " -3.1415e-02, 4.0765e-02, -8.7953e-02, 8.3991e-02, -1.0103e-01,\n", " -3.2731e-02, -2.8046e-02, -8.2369e-05, 4.4161e-02, -6.8035e-03,\n", " -1.1540e-02, 2.1856e-02, 2.1616e-02, 6.9975e-02, 7.4457e-02,\n", " -3.0301e-02, 6.7294e-02, 4.6710e-02, -2.1594e-03, 
5.6077e-03,\n", " -1.7449e-02, 5.9909e-02, -3.4097e-02, 2.5859e-02, 8.3136e-02,\n", " -7.2779e-02, 8.3011e-02, 1.2604e-02, -8.4367e-03, -2.1625e-02,\n", " -2.1813e-02, -2.7091e-05, -7.2330e-03, -1.0099e-01, -1.0501e-02,\n", " 2.0341e-02, -1.3656e-02, -2.3832e-02, 7.5759e-03, -1.3211e-02,\n", " 6.8102e-02, 2.2829e-03, -1.6518e-02, 3.9951e-02, -5.7018e-02,\n", " 1.8331e-03, 5.8490e-02, 1.7594e-02, -3.7133e-02, -3.1605e-02,\n", " -2.3609e-02, 2.1070e-02, 7.9210e-02, 8.3406e-03, 2.4773e-03,\n", " -3.8526e-02, 7.4689e-03, -1.4063e-02, 8.1757e-03, 3.0088e-02,\n", " -1.7644e-03, 1.0350e-01, 5.1846e-02, 6.2803e-02, -4.6687e-02,\n", " -3.1990e-02, -4.7862e-03, -2.3069e-03, 6.2420e-02, 3.7207e-02,\n", " 1.7626e-02, 7.6689e-02, 9.4874e-02, 1.6424e-02, 1.9519e-02,\n", " -6.9884e-03, -1.3276e-03, -9.4661e-02, -2.5687e-03, -4.3154e-02,\n", " -4.7284e-02, 2.5526e-02, 5.2632e-02, -3.9346e-03, -5.4324e-02,\n", " -3.7432e-02, 4.4676e-02, -3.1294e-02, 4.6080e-02, 5.2318e-02,\n", " -4.4378e-03, -1.5616e-02, 2.8531e-02, -5.8050e-03, -3.4799e-04,\n", " -1.2494e-02, 9.6195e-03, -1.1640e-02, -3.8351e-02, -8.1731e-02,\n", " 5.1802e-02, 2.3114e-02, -1.8271e-02, 3.5041e-02, 3.0215e-02,\n", " -8.9341e-03, -5.4446e-02, 7.8782e-02, -3.6992e-02, -6.7059e-02,\n", " -1.5274e-02, -4.3605e-02, -4.3741e-02, -5.1293e-02, 8.7484e-02,\n", " -7.5312e-02, 2.4961e-02, -1.3423e-02, -1.5963e-03, -1.9545e-02,\n", " 2.8036e-02, -4.8443e-03, 2.4310e-02, 3.3234e-02, 2.8612e-02,\n", " -7.7728e-03, 1.6794e-02, 1.9268e-02, 4.3462e-03, 2.1800e-02,\n", " 9.5402e-03, 5.1813e-02, 4.1713e-02, -4.5239e-04, -5.9295e-02,\n", " 4.1860e-02, 1.2150e-02, 4.1853e-03, -5.4254e-02, 5.0478e-02,\n", " 7.0949e-02, 1.0754e-02, -6.4552e-02, 2.0791e-02, -5.5270e-02,\n", " -1.9618e-02, -2.0463e-02]], grad_fn=)\n" ] } ], "source": [ "print(audio_embeds)" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [], "source": [ "state_dict = audio_prediction_model.audio_projection.state_dict()" ] }, { "cell_type": 
"code", "execution_count": 44, "metadata": {}, "outputs": [], "source": [ "state_dict2 = model.audio_projection.state_dict()" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "True\n", "True\n", "True\n", "True\n" ] } ], "source": [ "for key in state_dict.keys():\n", " tensor1 = state_dict[key]\n", " tensor2 = state_dict2[key]\n", "\n", " print(torch.equal(tensor1, tensor2))" ] } ], "metadata": { "kernelspec": { "display_name": "base", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.12" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }