Grounded-Segment-Anything / transformers_4_35_0 /models /audio_spectrogram_transformer /convert_audio_spectrogram_transformer_original_to_pytorch.py
liuyizhang
add transformers_4_35_0
1ce5e18
# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert Audio Spectrogram Transformer checkpoints from the original repository. URL: https://github.com/YuanGongND/ast"""
import argparse
import json
from pathlib import Path
import torch
import torchaudio
from datasets import load_dataset
from huggingface_hub import hf_hub_download
from transformers import ASTConfig, ASTFeatureExtractor, ASTForAudioClassification
from transformers.utils import logging
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
def get_audio_spectrogram_transformer_config(model_name):
config = ASTConfig()
if "10-10" in model_name:
pass
elif "speech-commands" in model_name:
config.max_length = 128
elif "12-12" in model_name:
config.time_stride = 12
config.frequency_stride = 12
elif "14-14" in model_name:
config.time_stride = 14
config.frequency_stride = 14
elif "16-16" in model_name:
config.time_stride = 16
config.frequency_stride = 16
else:
raise ValueError("Model not supported")
repo_id = "huggingface/label-files"
if "speech-commands" in model_name:
config.num_labels = 35
filename = "speech-commands-v2-id2label.json"
else:
config.num_labels = 527
filename = "audioset-id2label.json"
id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r"))
id2label = {int(k): v for k, v in id2label.items()}
config.id2label = id2label
config.label2id = {v: k for k, v in id2label.items()}
return config
def rename_key(name):
if "module.v" in name:
name = name.replace("module.v", "audio_spectrogram_transformer")
if "cls_token" in name:
name = name.replace("cls_token", "embeddings.cls_token")
if "dist_token" in name:
name = name.replace("dist_token", "embeddings.distillation_token")
if "pos_embed" in name:
name = name.replace("pos_embed", "embeddings.position_embeddings")
if "patch_embed.proj" in name:
name = name.replace("patch_embed.proj", "embeddings.patch_embeddings.projection")
# transformer blocks
if "blocks" in name:
name = name.replace("blocks", "encoder.layer")
if "attn.proj" in name:
name = name.replace("attn.proj", "attention.output.dense")
if "attn" in name:
name = name.replace("attn", "attention.self")
if "norm1" in name:
name = name.replace("norm1", "layernorm_before")
if "norm2" in name:
name = name.replace("norm2", "layernorm_after")
if "mlp.fc1" in name:
name = name.replace("mlp.fc1", "intermediate.dense")
if "mlp.fc2" in name:
name = name.replace("mlp.fc2", "output.dense")
# final layernorm
if "audio_spectrogram_transformer.norm" in name:
name = name.replace("audio_spectrogram_transformer.norm", "audio_spectrogram_transformer.layernorm")
# classifier head
if "module.mlp_head.0" in name:
name = name.replace("module.mlp_head.0", "classifier.layernorm")
if "module.mlp_head.1" in name:
name = name.replace("module.mlp_head.1", "classifier.dense")
return name
def convert_state_dict(orig_state_dict, config):
for key in orig_state_dict.copy().keys():
val = orig_state_dict.pop(key)
if "qkv" in key:
key_split = key.split(".")
layer_num = int(key_split[3])
dim = config.hidden_size
if "weight" in key:
orig_state_dict[
f"audio_spectrogram_transformer.encoder.layer.{layer_num}.attention.attention.query.weight"
] = val[:dim, :]
orig_state_dict[
f"audio_spectrogram_transformer.encoder.layer.{layer_num}.attention.attention.key.weight"
] = val[dim : dim * 2, :]
orig_state_dict[
f"audio_spectrogram_transformer.encoder.layer.{layer_num}.attention.attention.value.weight"
] = val[-dim:, :]
else:
orig_state_dict[
f"audio_spectrogram_transformer.encoder.layer.{layer_num}.attention.attention.query.bias"
] = val[:dim]
orig_state_dict[
f"audio_spectrogram_transformer.encoder.layer.{layer_num}.attention.attention.key.bias"
] = val[dim : dim * 2]
orig_state_dict[
f"audio_spectrogram_transformer.encoder.layer.{layer_num}.attention.attention.value.bias"
] = val[-dim:]
else:
orig_state_dict[rename_key(key)] = val
return orig_state_dict
def remove_keys(state_dict):
ignore_keys = [
"module.v.head.weight",
"module.v.head.bias",
"module.v.head_dist.weight",
"module.v.head_dist.bias",
]
for k in ignore_keys:
state_dict.pop(k, None)
@torch.no_grad()
def convert_audio_spectrogram_transformer_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub=False):
"""
Copy/paste/tweak model's weights to our Audio Spectrogram Transformer structure.
"""
config = get_audio_spectrogram_transformer_config(model_name)
model_name_to_url = {
"ast-finetuned-audioset-10-10-0.4593": (
"https://www.dropbox.com/s/ca0b1v2nlxzyeb4/audioset_10_10_0.4593.pth?dl=1"
),
"ast-finetuned-audioset-10-10-0.450": (
"https://www.dropbox.com/s/1tv0hovue1bxupk/audioset_10_10_0.4495.pth?dl=1"
),
"ast-finetuned-audioset-10-10-0.448": (
"https://www.dropbox.com/s/6u5sikl4b9wo4u5/audioset_10_10_0.4483.pth?dl=1"
),
"ast-finetuned-audioset-10-10-0.448-v2": (
"https://www.dropbox.com/s/kt6i0v9fvfm1mbq/audioset_10_10_0.4475.pth?dl=1"
),
"ast-finetuned-audioset-12-12-0.447": (
"https://www.dropbox.com/s/snfhx3tizr4nuc8/audioset_12_12_0.4467.pth?dl=1"
),
"ast-finetuned-audioset-14-14-0.443": (
"https://www.dropbox.com/s/z18s6pemtnxm4k7/audioset_14_14_0.4431.pth?dl=1"
),
"ast-finetuned-audioset-16-16-0.442": (
"https://www.dropbox.com/s/mdsa4t1xmcimia6/audioset_16_16_0.4422.pth?dl=1"
),
"ast-finetuned-speech-commands-v2": (
"https://www.dropbox.com/s/q0tbqpwv44pquwy/speechcommands_10_10_0.9812.pth?dl=1"
),
}
# load original state_dict
checkpoint_url = model_name_to_url[model_name]
state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu")
# remove some keys
remove_keys(state_dict)
# rename some keys
new_state_dict = convert_state_dict(state_dict, config)
# load πŸ€— model
model = ASTForAudioClassification(config)
model.eval()
model.load_state_dict(new_state_dict)
# verify outputs on dummy input
# source: https://github.com/YuanGongND/ast/blob/79e873b8a54d0a3b330dd522584ff2b9926cd581/src/run.py#L62
mean = -4.2677393 if "speech-commands" not in model_name else -6.845978
std = 4.5689974 if "speech-commands" not in model_name else 5.5654526
max_length = 1024 if "speech-commands" not in model_name else 128
feature_extractor = ASTFeatureExtractor(mean=mean, std=std, max_length=max_length)
if "speech-commands" in model_name:
dataset = load_dataset("speech_commands", "v0.02", split="validation")
waveform = dataset[0]["audio"]["array"]
else:
filepath = hf_hub_download(
repo_id="nielsr/audio-spectogram-transformer-checkpoint",
filename="sample_audio.flac",
repo_type="dataset",
)
waveform, _ = torchaudio.load(filepath)
waveform = waveform.squeeze().numpy()
inputs = feature_extractor(waveform, sampling_rate=16000, return_tensors="pt")
# forward pass
outputs = model(**inputs)
logits = outputs.logits
if model_name == "ast-finetuned-audioset-10-10-0.4593":
expected_slice = torch.tensor([-0.8760, -7.0042, -8.6602])
elif model_name == "ast-finetuned-audioset-10-10-0.450":
expected_slice = torch.tensor([-1.1986, -7.0903, -8.2718])
elif model_name == "ast-finetuned-audioset-10-10-0.448":
expected_slice = torch.tensor([-2.6128, -8.0080, -9.4344])
elif model_name == "ast-finetuned-audioset-10-10-0.448-v2":
expected_slice = torch.tensor([-1.5080, -7.4534, -8.8917])
elif model_name == "ast-finetuned-audioset-12-12-0.447":
expected_slice = torch.tensor([-0.5050, -6.5833, -8.0843])
elif model_name == "ast-finetuned-audioset-14-14-0.443":
expected_slice = torch.tensor([-0.3826, -7.0336, -8.2413])
elif model_name == "ast-finetuned-audioset-16-16-0.442":
expected_slice = torch.tensor([-1.2113, -6.9101, -8.3470])
elif model_name == "ast-finetuned-speech-commands-v2":
expected_slice = torch.tensor([6.1589, -8.0566, -8.7984])
else:
raise ValueError("Unknown model name")
if not torch.allclose(logits[0, :3], expected_slice, atol=1e-4):
raise ValueError("Logits don't match")
print("Looks ok!")
if pytorch_dump_folder_path is not None:
Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
print(f"Saving model {model_name} to {pytorch_dump_folder_path}")
model.save_pretrained(pytorch_dump_folder_path)
print(f"Saving feature extractor to {pytorch_dump_folder_path}")
feature_extractor.save_pretrained(pytorch_dump_folder_path)
if push_to_hub:
print("Pushing model and feature extractor to the hub...")
model.push_to_hub(f"MIT/{model_name}")
feature_extractor.push_to_hub(f"MIT/{model_name}")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
# Required parameters
parser.add_argument(
"--model_name",
default="ast-finetuned-audioset-10-10-0.4593",
type=str,
help="Name of the Audio Spectrogram Transformer model you'd like to convert.",
)
parser.add_argument(
"--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory."
)
parser.add_argument(
"--push_to_hub", action="store_true", help="Whether or not to push the converted model to the πŸ€— hub."
)
args = parser.parse_args()
convert_audio_spectrogram_transformer_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub)