# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
This script generates a NeMo-Megatron compatible `.nemo` file for a Huggingface T5-v1_1 model.

List of Huggingface models that this script can convert:
1. google/t5-v1_1-small
2. google/t5-v1_1-base
3. google/t5-v1_1-large
4. google/t5-v1_1-xl
5. google/t5-v1_1-xxl
6. google/mt5-small
7. google/mt5-base
8. google/mt5-large
9. google/mt5-xl
10. google/mt5-xxl
11. google/ul2
12. bigscience/T0pp
13. google/t5-small-lm-adapt
14. google/t5-base-lm-adapt
15. google/t5-large-lm-adapt
16. google/t5-xl-lm-adapt
17. google/t5-xxl-lm-adapt
18. google/flan-t5-small
19. google/flan-t5-base
20. google/flan-t5-large
21. google/flan-t5-xl
22. google/flan-t5-xxl

Usage:
python hf_t5-v1_1_to_nemo.py \
    --hf_model_name bigscience/T0pp \
    --nemo_state_dict_path /path/to/nemo_state_dict.pt \
    --nemo_file_path /path/to/nemo_file.nemo
"""

import collections
import os
import tempfile
from argparse import ArgumentParser

import torch
from omegaconf.omegaconf import OmegaConf, open_dict
from pytorch_lightning import Trainer
from transformers import AutoTokenizer, T5ForConditionalGeneration

from nemo.collections.nlp.models.language_modeling.megatron_t5_model import MegatronT5Model
from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy, NLPSaveRestoreConnector

try:
    import accelerate
except ImportError:
    raise ImportError("Please install the accelerate package via `pip install accelerate` to use this script.")


def convert_weights(hf_model, nemo_state_dict_path):
    if hf_model == "google/ul2":
        torch_dtype = torch.bfloat16
    else:
        torch_dtype = torch.float32
    hf_model = T5ForConditionalGeneration.from_pretrained(hf_model, low_cpu_mem_usage=True, torch_dtype=torch_dtype)
    hf_model_config = hf_model.config
    with tempfile.TemporaryDirectory() as tmp:
        torch.save(hf_model.state_dict(), os.path.join(tmp, "model.pt"))
        hf_weights = torch.load(os.path.join(tmp, "model.pt"))

    nemo_weights = collections.OrderedDict()

    print(f"Found {len(hf_weights.keys())} keys in the checkpoint")

    def _get_model_type_block_layer(k):
        # Keys look like `{encoder,decoder}.block.<block>.layer.<layer>.<module>...`.
        if k.startswith("encoder"):
            model_type = "encoder"
        elif k.startswith("decoder"):
            model_type = "decoder"
        else:
            raise ValueError(f"Unknown model type for {k}")
        return model_type, int(k.split(".")[2]), int(k.split(".")[4])

    for k, v in hf_weights.items():
        #################################################
        ###### Enc-Dec Embeddings and Output Layer ######
        #################################################
        # Tied decoder embedding and decoder output layer.
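        # HF T5 checkpoints store the shared token-embedding matrix under `shared.weight`
        # as well as `encoder.embed_tokens.weight` and `decoder.embed_tokens.weight`; the
        # latter two are mapped below, so `shared.weight` can safely be skipped here.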
if k == "shared.weight": pass elif k == "lm_head.weight": nemo_weights["enc_dec_model.tokens_head.weight"] = v print( f"Mapped {k} to enc_dec_model.decoder_embedding.word_embeddings.weight and enc_dec_model.tokens_head.weight" ) # Decoder embeddings elif k == "decoder.embed_tokens.weight": nemo_weights["enc_dec_model.decoder_embedding.word_embeddings.weight"] = v elif k == "encoder.embed_tokens.weight": nemo_weights["enc_dec_model.encoder_embedding.word_embeddings.weight"] = v print(f"Mapped {k} to enc_dec_model.encoder_embedding.word_embeddings.weight") ################################################# ################# RPE Weights ################### ################################################# elif k == "encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight": nemo_weights["enc_dec_model.encoder_relative_position_embedding.relative_position_embedding.weight"] = v print( f"Mapped {k} to enc_dec_model.encoder_relative_position_embedding.relative_position_embedding.weight" ) elif k == "decoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight": nemo_weights["enc_dec_model.decoder_relative_position_embedding.relative_position_embedding.weight"] = v print( f"Mapped {k} to enc_dec_model.decoder_relative_position_embedding.relative_position_embedding.weight" ) # Block in HF corresponds to layer in NeMo. # Layer in HF does not correspond to anything in NeMo. Layer 0 is self attn, layer 1 is cross-attn. ################################################# ############### Attention Layers ################ ################################################# # Self-Attention # Q, k, V in NeMo-Megatron is bundled into a single matrix. elif "SelfAttention.q.weight" in k: model_type, block_number, layer_number = _get_model_type_block_layer(k) k_weight = hf_weights[k.replace("q.weight", "k.weight")] v_weight = hf_weights[k.replace("q.weight", "v.weight")] concat_weights = torch.cat([v, k_weight, v_weight], dim=0) nemo_weights[ f"enc_dec_model.enc_dec_model.{model_type}.model.layers.{block_number}.self_attention.query_key_value.weight" ] = concat_weights print( f"Mapped {k} to enc_dec_model.enc_dec_model.{model_type}.model.layers.{block_number}.self_attention.query_key_value.weight" ) # We can skip processing of k, v weights since we already concat them into qkv above. elif "SelfAttention.k.weight" in k or "SelfAttention.v.weight" in k: pass # Output self-attn matrix. elif "SelfAttention.o.weight" in k: model_type, block_number, layer_number = _get_model_type_block_layer(k) block_number = int(k.split(".")[2]) # Block in HF corresponds to layer in NeMo. layer_number = int( k.split(".")[4] ) # Layer in HF does not correspond to anything in NeMo. Layer 0 is self attn, layer 1 is cross-attn. 
            nemo_weights[
                f"enc_dec_model.enc_dec_model.{model_type}.model.layers.{block_number}.self_attention.dense.weight"
            ] = v
            print(
                f"Mapped {k} to enc_dec_model.enc_dec_model.{model_type}.model.layers.{block_number}.self_attention.dense.weight"
            )

        # Cross-Attention projection matrices are merged into K, V matrices in NeMo-Megatron.
        elif "EncDecAttention.k.weight" in k:
            model_type, block_number, layer_number = _get_model_type_block_layer(k)
            v_weight = hf_weights[k.replace("k.weight", "v.weight")]
            concat_weights = torch.cat([v, v_weight], dim=0)
            nemo_weights[
                f"enc_dec_model.enc_dec_model.decoder.model.layers.{block_number}.inter_attention.key_value.weight"
            ] = concat_weights
            print(
                f"Mapped {k} to enc_dec_model.enc_dec_model.decoder.model.layers.{block_number}.inter_attention.key_value.weight"
            )

        # We can skip processing of the V weights since we already concatenated them with K above.
        elif "EncDecAttention.v.weight" in k:
            pass

        # The cross-attention Q matrix is separate in NeMo-Megatron.
        elif "EncDecAttention.q.weight" in k:
            model_type, block_number, layer_number = _get_model_type_block_layer(k)
            nemo_weights[
                f"enc_dec_model.enc_dec_model.decoder.model.layers.{block_number}.inter_attention.query.weight"
            ] = v
            print(
                f"Mapped {k} to enc_dec_model.enc_dec_model.decoder.model.layers.{block_number}.inter_attention.query.weight"
            )

        # The cross-attention output matrix is separate in NeMo-Megatron.
        elif "EncDecAttention.o.weight" in k:
            model_type, block_number, layer_number = _get_model_type_block_layer(k)
            nemo_weights[
                f"enc_dec_model.enc_dec_model.decoder.model.layers.{block_number}.inter_attention.dense.weight"
            ] = v
            print(
                f"Mapped {k} to enc_dec_model.enc_dec_model.decoder.model.layers.{block_number}.inter_attention.dense.weight"
            )

        #################################################
        ################## FFN Layers ###################
        #################################################
        elif "DenseReluDense.wi_0.weight" in k:
            model_type, block_number, layer_number = _get_model_type_block_layer(k)
            nemo_weights[
                f"enc_dec_model.enc_dec_model.{model_type}.model.layers.{block_number}.mlp.dense_h_to_4h.weight"
            ] = v
            print(
                f"Mapped {k} to enc_dec_model.enc_dec_model.{model_type}.model.layers.{block_number}.mlp.dense_h_to_4h.weight"
            )

        elif "DenseReluDense.wi_1.weight" in k:
            model_type, block_number, layer_number = _get_model_type_block_layer(k)
            nemo_weights[
                f"enc_dec_model.enc_dec_model.{model_type}.model.layers.{block_number}.mlp.dense_h_to_4h_2.weight"
            ] = v
            print(
                f"Mapped {k} to enc_dec_model.enc_dec_model.{model_type}.model.layers.{block_number}.mlp.dense_h_to_4h_2.weight"
            )

        elif "DenseReluDense.wo.weight" in k:
            model_type, block_number, layer_number = _get_model_type_block_layer(k)
            nemo_weights[
                f"enc_dec_model.enc_dec_model.{model_type}.model.layers.{block_number}.mlp.dense_4h_to_h.weight"
            ] = v
            print(
                f"Mapped {k} to enc_dec_model.enc_dec_model.{model_type}.model.layers.{block_number}.mlp.dense_4h_to_h.weight"
            )

        #################################################
        ################## LayerNorm ####################
        #################################################
        elif "layer_norm" in k:
            if "final" in k:
                model_type = "encoder" if k.startswith("encoder") else "decoder"
                nemo_weights[f"enc_dec_model.enc_dec_model.{model_type}.model.final_layernorm.weight"] = v
                print(f"Mapped {k} to enc_dec_model.enc_dec_model.{model_type}.model.final_layernorm.weight")
            else:
                model_type, block_number, layer_number = _get_model_type_block_layer(k)
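                # HF stores one pre-sub-layer layernorm per index: layer 0 is self-attention
                # in both stacks, layer 1 is the FFN in the encoder but cross-attention in
                # the decoder, and layer 2 is the decoder FFN.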
f"enc_dec_model.enc_dec_model.{model_type}.model.layers.{block_number}.input_layernorm.weight" ] = v print( f"Mapped {k} to enc_dec_model.enc_dec_model.{model_type}.model.layers.{block_number}.input_layernorm.weight" ) elif layer_number == 1 and model_type == "encoder": nemo_weights[ f"enc_dec_model.enc_dec_model.{model_type}.model.layers.{block_number}.post_attention_layernorm.weight" ] = v print( f"Mapped {k} to enc_dec_model.enc_dec_model.{model_type}.model.layers.{block_number}.post_attention_layernorm.weight" ) elif layer_number == 0 and model_type == "decoder": nemo_weights[ f"enc_dec_model.enc_dec_model.{model_type}.model.layers.{block_number}.input_layernorm.weight" ] = v print( f"Mapped {k} to enc_dec_model.enc_dec_model.{model_type}.model.layers.{block_number}.input_layernorm.weight" ) elif layer_number == 1 and model_type == "decoder": nemo_weights[ f"enc_dec_model.enc_dec_model.{model_type}.model.layers.{block_number}.post_attention_layernorm.weight" ] = v print( f"Mapped {k} to enc_dec_model.enc_dec_model.{model_type}.model.layers.{block_number}.post_attention_layernorm.weight" ) elif layer_number == 2 and model_type == "decoder": nemo_weights[ f"enc_dec_model.enc_dec_model.{model_type}.model.layers.{block_number}.post_inter_attention_layernorm.weight" ] = v print( f"Mapped {k} to enc_dec_model.enc_dec_model.{model_type}.model.layers.{block_number}.post_inter_attention_layernorm.weight" ) else: raise ValueError("Unknown layer_norm key: {}".format(k)) else: raise ValueError(f"Unknown key: {k}") torch.save(nemo_weights, nemo_state_dict_path) print("Saved weights to {}".format(nemo_state_dict_path)) return hf_model_config def package_into_nemo_file( state_dict_path, base_yaml_config, hf_model_config, nemo_file_path, hf_model_name, megatron_amp_O2 ): """ Packages the state dict, config file and tokenizer into a `.nemo` file. """ trainer = Trainer(devices=1, strategy=NLPDDPStrategy(), accelerator="cpu", precision=32) base_cfg = OmegaConf.load(base_yaml_config) if hf_model_config.dense_act_fn == "silu": act_fn = "swiglu" elif hf_model_config.dense_act_fn == "gelu_new": act_fn = "geglu" # FLAN-T5 models have things configured this way. 
    elif hf_model_config.dense_act_fn == "gelu" and hf_model_config.is_gated_act:
        act_fn = "geglu"
    else:
        raise ValueError(f"Unknown dense_act_fn: {hf_model_config.dense_act_fn}")

    with open_dict(base_cfg):
        base_cfg.encoder.num_layers = hf_model_config.num_layers
        base_cfg.encoder.hidden_size = hf_model_config.d_model
        base_cfg.encoder.ffn_hidden_size = hf_model_config.d_ff
        base_cfg.encoder.kv_channels = hf_model_config.d_kv
        base_cfg.encoder.num_attention_heads = hf_model_config.num_heads
        base_cfg.encoder.activation = act_fn
        base_cfg.encoder.relative_attention_num_buckets = hf_model_config.relative_attention_num_buckets

        base_cfg.decoder.num_layers = hf_model_config.num_decoder_layers
        base_cfg.decoder.hidden_size = hf_model_config.d_model
        base_cfg.decoder.ffn_hidden_size = hf_model_config.d_ff
        base_cfg.decoder.kv_channels = hf_model_config.d_kv
        base_cfg.decoder.num_attention_heads = hf_model_config.num_heads
        base_cfg.decoder.activation = act_fn
        base_cfg.decoder.relative_attention_num_buckets = hf_model_config.relative_attention_num_buckets

        base_cfg.megatron_amp_O2 = megatron_amp_O2

    with tempfile.TemporaryDirectory() as tmp:
        tokenizer = AutoTokenizer.from_pretrained(hf_model_name)
        tokenizer_path = tokenizer.save_vocabulary(tmp)[0]
        base_cfg.tokenizer.model = tokenizer_path
        model = MegatronT5Model(base_cfg, trainer).to("cpu")
        model._save_restore_connector = NLPSaveRestoreConnector()
        state_dict = torch.load(state_dict_path)
        if megatron_amp_O2:
            # With megatron_amp_O2, NeMo wraps the module so state dict keys gain a
            # `module.` prefix (enc_dec_model.module. ...); rename the keys to match.
            new_state_dict = {}
            for key in state_dict.keys():
                new_key = key.replace("model.", "model.module.", 1)
                new_state_dict[new_key] = state_dict[key]
            state_dict = new_state_dict
        model.load_state_dict(state_dict)
        model.save_to(nemo_file_path)


if __name__ == "__main__":
    parser = ArgumentParser()
    parser.add_argument(
        "--hf_model_name",
        type=str,
        required=True,
        help="Valid Huggingface T5-v1_1 model name, e.g. google/t5-v1_1-large or google/ul2, i.e. anything that can be loaded with T5ForConditionalGeneration.from_pretrained().",
    )
    parser.add_argument(
        "--nemo_state_dict_path",
        type=str,
        required=True,
        help="Path to write the intermediate NeMo state dict file, e.g. /path/to/nemo_state_dict.pt",
    )
    parser.add_argument(
        "--nemo_file_path",
        type=str,
        required=True,
        help="Path to write the converted .nemo file, e.g. /path/to/t5_base_converted_to_nemo.nemo",
    )
    parser.add_argument(
        "--base_yaml_config",
        type=str,
        default="hf_t5v1_1_base_config.yaml",
        help="Path to a base yaml config that we edit based on the provided model.",
    )
    parser.add_argument(
        "--megatron_amp_O2",
        action="store_true",
        help="Whether to store O2 weights. This may be useful for models like ul2 where only pre-trained half-precision weights were released.",
    )
    args = parser.parse_args()
    if not os.path.exists(args.base_yaml_config):
        raise FileNotFoundError(f"Base yaml config file {args.base_yaml_config} does not exist.")
    hf_model_config = convert_weights(args.hf_model_name, args.nemo_state_dict_path)
    package_into_nemo_file(
        state_dict_path=args.nemo_state_dict_path,
        base_yaml_config=args.base_yaml_config,
        hf_model_config=hf_model_config,
        nemo_file_path=args.nemo_file_path,
        hf_model_name=args.hf_model_name,
        megatron_amp_O2=args.megatron_amp_O2,
    )
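# A quick sanity check for the converted file (a sketch; the path is hypothetical and
# this assumes NeMo's standard `restore_from` API):
#
#   from nemo.collections.nlp.models.language_modeling.megatron_t5_model import MegatronT5Model
#   model = MegatronT5Model.restore_from("/path/to/nemo_file.nemo")
#   print(model.cfg)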