Spaces:

yizhangliu
/

Grounded-Segment-Anything

Runtime error

App Files Files Community

Grounded-Segment-Anything / transformers_4_35_0 /models /audio_spectrogram_transformer /convert_audio_spectrogram_transformer_original_to_pytorch.py

liuyizhang

add transformers_4_35_0

1ce5e18 over 1 year ago

raw

history blame contribute delete

11.1 kB

	# coding=utf-8
	# Copyright 2022 The HuggingFace Inc. team.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	"""Convert Audio Spectrogram Transformer checkpoints from the original repository. URL: https://github.com/YuanGongND/ast"""


	import argparse
	import json
	from pathlib import Path

	import torch
	import torchaudio
	from datasets import load_dataset
	from huggingface_hub import hf_hub_download

	from transformers import ASTConfig, ASTFeatureExtractor, ASTForAudioClassification
	from transformers.utils import logging


	logging.set_verbosity_info()
	logger = logging.get_logger(__name__)


	def get_audio_spectrogram_transformer_config(model_name):
	config = ASTConfig()

	if "10-10" in model_name:
	pass
	elif "speech-commands" in model_name:
	config.max_length = 128
	elif "12-12" in model_name:
	config.time_stride = 12
	config.frequency_stride = 12
	elif "14-14" in model_name:
	config.time_stride = 14
	config.frequency_stride = 14
	elif "16-16" in model_name:
	config.time_stride = 16
	config.frequency_stride = 16
	else:
	raise ValueError("Model not supported")

	repo_id = "huggingface/label-files"
	if "speech-commands" in model_name:
	config.num_labels = 35
	filename = "speech-commands-v2-id2label.json"
	else:
	config.num_labels = 527
	filename = "audioset-id2label.json"

	id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r"))
	id2label = {int(k): v for k, v in id2label.items()}
	config.id2label = id2label
	config.label2id = {v: k for k, v in id2label.items()}

	return config


	def rename_key(name):
	if "module.v" in name:
	name = name.replace("module.v", "audio_spectrogram_transformer")
	if "cls_token" in name:
	name = name.replace("cls_token", "embeddings.cls_token")
	if "dist_token" in name:
	name = name.replace("dist_token", "embeddings.distillation_token")
	if "pos_embed" in name:
	name = name.replace("pos_embed", "embeddings.position_embeddings")
	if "patch_embed.proj" in name:
	name = name.replace("patch_embed.proj", "embeddings.patch_embeddings.projection")
	# transformer blocks
	if "blocks" in name:
	name = name.replace("blocks", "encoder.layer")
	if "attn.proj" in name:
	name = name.replace("attn.proj", "attention.output.dense")
	if "attn" in name:
	name = name.replace("attn", "attention.self")
	if "norm1" in name:
	name = name.replace("norm1", "layernorm_before")
	if "norm2" in name:
	name = name.replace("norm2", "layernorm_after")
	if "mlp.fc1" in name:
	name = name.replace("mlp.fc1", "intermediate.dense")
	if "mlp.fc2" in name:
	name = name.replace("mlp.fc2", "output.dense")
	# final layernorm
	if "audio_spectrogram_transformer.norm" in name:
	name = name.replace("audio_spectrogram_transformer.norm", "audio_spectrogram_transformer.layernorm")
	# classifier head
	if "module.mlp_head.0" in name:
	name = name.replace("module.mlp_head.0", "classifier.layernorm")
	if "module.mlp_head.1" in name:
	name = name.replace("module.mlp_head.1", "classifier.dense")

	return name


	def convert_state_dict(orig_state_dict, config):
	for key in orig_state_dict.copy().keys():
	val = orig_state_dict.pop(key)

	if "qkv" in key:
	key_split = key.split(".")
	layer_num = int(key_split[3])
	dim = config.hidden_size
	if "weight" in key:
	orig_state_dict[
	f"audio_spectrogram_transformer.encoder.layer.{layer_num}.attention.attention.query.weight"
	] = val[:dim, :]
	orig_state_dict[
	f"audio_spectrogram_transformer.encoder.layer.{layer_num}.attention.attention.key.weight"
	] = val[dim : dim * 2, :]
	orig_state_dict[
	f"audio_spectrogram_transformer.encoder.layer.{layer_num}.attention.attention.value.weight"
	] = val[-dim:, :]
	else:
	orig_state_dict[
	f"audio_spectrogram_transformer.encoder.layer.{layer_num}.attention.attention.query.bias"
	] = val[:dim]
	orig_state_dict[
	f"audio_spectrogram_transformer.encoder.layer.{layer_num}.attention.attention.key.bias"
	] = val[dim : dim * 2]
	orig_state_dict[
	f"audio_spectrogram_transformer.encoder.layer.{layer_num}.attention.attention.value.bias"
	] = val[-dim:]
	else:
	orig_state_dict[rename_key(key)] = val

	return orig_state_dict


	def remove_keys(state_dict):
	ignore_keys = [
	"module.v.head.weight",
	"module.v.head.bias",
	"module.v.head_dist.weight",
	"module.v.head_dist.bias",
	]
	for k in ignore_keys:
	state_dict.pop(k, None)


	@torch.no_grad()
	def convert_audio_spectrogram_transformer_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub=False):
	"""
	Copy/paste/tweak model's weights to our Audio Spectrogram Transformer structure.
	"""
	config = get_audio_spectrogram_transformer_config(model_name)

	model_name_to_url = {
	"ast-finetuned-audioset-10-10-0.4593": (
	"https://www.dropbox.com/s/ca0b1v2nlxzyeb4/audioset_10_10_0.4593.pth?dl=1"
	),
	"ast-finetuned-audioset-10-10-0.450": (
	"https://www.dropbox.com/s/1tv0hovue1bxupk/audioset_10_10_0.4495.pth?dl=1"
	),
	"ast-finetuned-audioset-10-10-0.448": (
	"https://www.dropbox.com/s/6u5sikl4b9wo4u5/audioset_10_10_0.4483.pth?dl=1"
	),
	"ast-finetuned-audioset-10-10-0.448-v2": (
	"https://www.dropbox.com/s/kt6i0v9fvfm1mbq/audioset_10_10_0.4475.pth?dl=1"
	),
	"ast-finetuned-audioset-12-12-0.447": (
	"https://www.dropbox.com/s/snfhx3tizr4nuc8/audioset_12_12_0.4467.pth?dl=1"
	),
	"ast-finetuned-audioset-14-14-0.443": (
	"https://www.dropbox.com/s/z18s6pemtnxm4k7/audioset_14_14_0.4431.pth?dl=1"
	),
	"ast-finetuned-audioset-16-16-0.442": (
	"https://www.dropbox.com/s/mdsa4t1xmcimia6/audioset_16_16_0.4422.pth?dl=1"
	),
	"ast-finetuned-speech-commands-v2": (
	"https://www.dropbox.com/s/q0tbqpwv44pquwy/speechcommands_10_10_0.9812.pth?dl=1"
	),
	}

	# load original state_dict
	checkpoint_url = model_name_to_url[model_name]
	state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu")
	# remove some keys
	remove_keys(state_dict)
	# rename some keys
	new_state_dict = convert_state_dict(state_dict, config)

	# load 🤗 model
	model = ASTForAudioClassification(config)
	model.eval()

	model.load_state_dict(new_state_dict)

	# verify outputs on dummy input
	# source: https://github.com/YuanGongND/ast/blob/79e873b8a54d0a3b330dd522584ff2b9926cd581/src/run.py#L62
	mean = -4.2677393 if "speech-commands" not in model_name else -6.845978
	std = 4.5689974 if "speech-commands" not in model_name else 5.5654526
	max_length = 1024 if "speech-commands" not in model_name else 128
	feature_extractor = ASTFeatureExtractor(mean=mean, std=std, max_length=max_length)

	if "speech-commands" in model_name:
	dataset = load_dataset("speech_commands", "v0.02", split="validation")
	waveform = dataset[0]["audio"]["array"]
	else:
	filepath = hf_hub_download(
	repo_id="nielsr/audio-spectogram-transformer-checkpoint",
	filename="sample_audio.flac",
	repo_type="dataset",
	)

	waveform, _ = torchaudio.load(filepath)
	waveform = waveform.squeeze().numpy()

	inputs = feature_extractor(waveform, sampling_rate=16000, return_tensors="pt")

	# forward pass
	outputs = model(**inputs)
	logits = outputs.logits

	if model_name == "ast-finetuned-audioset-10-10-0.4593":
	expected_slice = torch.tensor([-0.8760, -7.0042, -8.6602])
	elif model_name == "ast-finetuned-audioset-10-10-0.450":
	expected_slice = torch.tensor([-1.1986, -7.0903, -8.2718])
	elif model_name == "ast-finetuned-audioset-10-10-0.448":
	expected_slice = torch.tensor([-2.6128, -8.0080, -9.4344])
	elif model_name == "ast-finetuned-audioset-10-10-0.448-v2":
	expected_slice = torch.tensor([-1.5080, -7.4534, -8.8917])
	elif model_name == "ast-finetuned-audioset-12-12-0.447":
	expected_slice = torch.tensor([-0.5050, -6.5833, -8.0843])
	elif model_name == "ast-finetuned-audioset-14-14-0.443":
	expected_slice = torch.tensor([-0.3826, -7.0336, -8.2413])
	elif model_name == "ast-finetuned-audioset-16-16-0.442":
	expected_slice = torch.tensor([-1.2113, -6.9101, -8.3470])
	elif model_name == "ast-finetuned-speech-commands-v2":
	expected_slice = torch.tensor([6.1589, -8.0566, -8.7984])
	else:
	raise ValueError("Unknown model name")
	if not torch.allclose(logits[0, :3], expected_slice, atol=1e-4):
	raise ValueError("Logits don't match")
	print("Looks ok!")

	if pytorch_dump_folder_path is not None:
	Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
	print(f"Saving model {model_name} to {pytorch_dump_folder_path}")
	model.save_pretrained(pytorch_dump_folder_path)
	print(f"Saving feature extractor to {pytorch_dump_folder_path}")
	feature_extractor.save_pretrained(pytorch_dump_folder_path)

	if push_to_hub:
	print("Pushing model and feature extractor to the hub...")
	model.push_to_hub(f"MIT/{model_name}")
	feature_extractor.push_to_hub(f"MIT/{model_name}")


	if __name__ == "__main__":
	parser = argparse.ArgumentParser()
	# Required parameters
	parser.add_argument(
	"--model_name",
	default="ast-finetuned-audioset-10-10-0.4593",
	type=str,
	help="Name of the Audio Spectrogram Transformer model you'd like to convert.",
	)
	parser.add_argument(
	"--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory."
	)
	parser.add_argument(
	"--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
	)

	args = parser.parse_args()
	convert_audio_spectrogram_transformer_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub)