diff --git a/README.md b/README.md
index 2c03bbb76bc3c1caa49572d33b1eaef37265e29f..2e065a861053a770177e2cb3b1b17bac2e6c8cdf 100644
--- a/README.md
+++ b/README.md
@@ -8,19 +8,3 @@ sdk_version: 3.38.0
 app_file: app_dialogue.py
 pinned: false
 ---
-
-# M4 Visualization
-
-For visualizations, we have a main [app](https://huggingface.co/spaces/HuggingFaceM4/m4-demo) which calls multiple child apps to retrieve generations via [Gradio API](https://gradio.app/using-blocks-like-functions/). This allows us to parallelize calls to multiple models at the same time instead of running them sequentially.
-
-
-## How to?
-
-The process of adding a model to the main space:
-
-- Use `huggingface-cli login` to login with an auth token that has a read/write access to the `HuggingFaceM4` org on the hub.
-- Use `./upload_checkpoint_to_hub_gcs.sh` script to upload a checkpoint from GCP store to the hub. An example command to upload checkpoint for step 3000 from `tr_121ter` to the hub: `./m4/visualization/upload_checkpoint_to_hub_gcs.sh gs://hf-science-m4-cold/local_experiment_dir/tr_121ter/opt_step-3000`. This will create model repo under the `HuggingFaceM4` repo on the hub. If you are on the cluster, use `./upload_checkpoint_to_hub_s3.sh` instead. I recommend being on a compute node to avoid disk space issues (uploading to the hub consists in downloading locally the checkpoint, creating a repo on the hub, copying it locally, filling it with the weights and commiting the weights to the hub repo).
-- [MANUAL] Go to the hub, create a repo of type `space` with the same name as the model. In the space's settings, add a secret `HF_AUTH_TOKEN` with a token which has read access to the `HuggingFaceM4` repo. This step can be potentially automated in the future.
-- [MANUAL] Edit `m4/visualization/app_dialogue.py`'s three dictionary to include your model in the existing formats of those dictionaries.
-- Run `m4/visualization/sync-repo.sh <name_of_the_space_on_the_hub>` to sync the repo with the local setting. This will automatically update the space to have the latest code as in the `m4/visualization/app_dialogue.py`.
-- Run `m4/visualization/sync-repo.sh main` to update the main repo as well with the new model.
diff --git a/app.py b/app.py
deleted file mode 100644
index d005c19b6144604da6b0a63bd5d2185710269abe..0000000000000000000000000000000000000000
--- a/app.py
+++ /dev/null
@@ -1,793 +0,0 @@
-import logging
-import os
-import re
-import time
-from io import BytesIO
-
-import gradio as gr
-import requests
-import torch
-import transformers
-from accelerate.utils import get_max_memory
-from joblib import Parallel, delayed
-from PIL import Image
-from transformers import AutoTokenizer
-
-from m4.models.vbloom import configuration_vbloom, modeling_vbloom
-from m4.models.vgpt2 import configuration_vgpt2, modeling_vgpt2
-from m4.models.vgpt_neo import configuration_vgpt_neo, modeling_vgpt_neo
-from m4.models.vllama import configuration_vllama, modeling_vllama
-from m4.models.vopt import configuration_vopt, modeling_vopt
-from m4.training.packing import image_attention_mask_for_packed_input_ids, incremental_to_binary_attention_mask
-from m4.training.utils import build_image_transform
-
-
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger()
-
-CURRENT_MODEL = "<replace_model>"
-# CURRENT_MODEL = "tr_177_4datasets_alpha_baseline_opt_step-2000"
-
-MAX_TRIES = 3
-TOKENIZER_FAST = True
-MAX_SEQ_LEN = 1024
-model, tokenizer = None, None
-
-
-MODEL_TO_DISPLAY_NAME = {
-    "tr_199_w_xattn_opt_step-65000": "VLlama - tr_199_w_xattn_opt_step-65000",
-    # "tr_201_sft_on_lrv_opt_step-15000": "VLlama - tr_201_sft_on_lrv_opt_step-15000",
-    # "tr_202bis_ift_llava_all_unfrozen_opt_step-14128": "VLlama - tr_202bis_ift_llava_all_unfrozen_opt_step-14128",
-    # "tr_203_ift_m3it_opt_step-50000": "VLlama - tr_203_ift_m3it_opt_step-50000",
-    # "tr_205_sft_ultrachat_opt_step-20000": "VLlama - tr_205_sft_ultrachat_opt_step-20000",
-    # "tr_207_ift_svit_opt_step-14627": "VLlama - tr_207_ift_svit_opt_step-14627",
-    "tr_209_ift_mixture_opt_step-14000": "VLlama - tr_209_ift_mixture_opt_step-14000",
-}
-MODEL_TO_MODEL_CLASS = {
-    "tr_199_w_xattn_opt_step-65000": "VLlamaForCausalLM",
-    # "tr_201_sft_on_lrv_opt_step-15000": "VLlamaForCausalLM",
-    # "tr_202bis_ift_llava_all_unfrozen_opt_step-14128": "VLlamaForCausalLM",
-    # "tr_203_ift_m3it_opt_step-50000": "VLlamaForCausalLM",
-    # "tr_205_sft_ultrachat_opt_step-20000": "VLlamaForCausalLM",
-    # "tr_207_ift_svit_opt_step-14627": "VLlamaForCausalLM",
-    "tr_209_ift_mixture_opt_step-14000": "VLlamaForCausalLM",
-}
-
-MODEL_TO_CONFIG_CLASS = {
-    "tr_199_w_xattn_opt_step-65000": "VLlamaConfig",
-    # "tr_201_sft_on_lrv_opt_step-15000": "VLlamaConfig",
-    # "tr_202bis_ift_llava_all_unfrozen_opt_step-14128": "VLlamaConfig",
-    # "tr_203_ift_m3it_opt_step-50000": "VLlamaConfig",
-    # "tr_205_sft_ultrachat_opt_step-20000": "VLlamaConfig",
-    # "tr_207_ift_svit_opt_step-14627": "VLlamaConfig",
-    "tr_209_ift_mixture_opt_step-14000": "VLlamaConfig",
-}
-
-
-def load_tokenizer_model(model_name, model_class):
-    tokenizer = AutoTokenizer.from_pretrained(
-        model_name,
-        use_fast=TOKENIZER_FAST,
-        use_auth_token=os.getenv("HF_AUTH_TOKEN", True),  # `use_fast=False` for 1B3 OPT, True for all the other models
-    )
-    tokenizer.padding_side = "left"
-    config_class = MODEL_TO_CONFIG_CLASS[model_name.split("/")[-1]]
-
-    # assert tokenizer.is_fast
-
-    supported_custom_modules = {
-        "vgpt2": modeling_vgpt2,
-        "vbloom": modeling_vbloom,
-        "vgptneo": modeling_vgpt_neo,
-        "vopt": modeling_vopt,
-        "vllama": modeling_vllama,
-    }
-    supported_custom_configs = {
-        "vgpt2": configuration_vgpt2,
-        "vbloom": configuration_vbloom,
-        "vgptneo": configuration_vgpt_neo,
-        "vopt": configuration_vopt,
-        "vllama": configuration_vllama,
-    }
-    parent_config_class = (
-        [v for k, v in supported_custom_configs.items() if k in model_class.lower()] + [transformers]
-    )[0]
-    parent_model_class = (
-        [v for k, v in supported_custom_modules.items() if k in model_class.lower()] + [transformers]
-    )[0]
-    config_class = getattr(parent_config_class, config_class)
-    model_class = getattr(parent_model_class, model_class)
-    config = config_class.from_pretrained(model_name, use_auth_token=os.getenv("HF_AUTH_TOKEN", True))
-    max_memory_map = get_max_memory()
-    for key in max_memory_map.keys():
-        if key != "cpu":
-            # Get this in GB
-            max_memory_map[key] = max_memory_map[key] // (1024 * 1024 * 1024)
-            # Decrease 2 for Pytorch overhead and 2 for the forward to be safe
-            max_memory_map[key] = f"{max_memory_map[key] - 4} GiB"
-    model = model_class.from_pretrained(
-        model_name,
-        use_auth_token=os.getenv("HF_AUTH_TOKEN", True),
-        device_map="auto",
-        offload_folder="./offload",
-        torch_dtype=config.torch_dtype,
-        max_memory=max_memory_map,
-    )
-    model.eval()
-    print("Current device map:", model.hf_device_map)
-    print("Model default generation config:", model.generation_config)
-    # TODO: the device_map looks very inefficien right now. that could be improved
-    # it typically looks like that
-    # {
-    #     'model.embed_tokens': 0,
-    #     'model.vision_model': 0,
-    #     'model.layers.0': 0,
-    #     'model.layers.1': 0,
-    #     'model.layers.2': 0,
-    #     'model.layers.3': 0,
-    #     'model.layers.4': 0,
-    #     'model.layers.5': 0,
-    #     'model.layers.6': 1,
-    #     'model.layers.7': 1,
-    #     'model.layers.8': 1,
-    #     'model.layers.9': 1,
-    #     'model.layers.10': 1,
-    #     'model.layers.11': 1,
-    #     'model.layers.12': 1,
-    #     'model.layers.13': 1,
-    #     'model.layers.14': 1,
-    #     'model.layers.15': 1,
-    #     'model.layers.16': 1,
-    #     'model.layers.17': 2,
-    #     'model.layers.18': 2,
-    #     'model.layers.19': 2,
-    #     'model.layers.20': 2,
-    #     'model.layers.21': 2,
-    #     'model.layers.22': 2,
-    #     'model.layers.23': 2,
-    #     'model.layers.24': 2,
-    #     'model.layers.25': 2,
-    #     'model.layers.26': 2,
-    #     'model.layers.27': 2,
-    #     'model.layers.28': 3,
-    #     'model.layers.29': 3,
-    #     'model.layers.30': 3,
-    #     'model.layers.31': 3,
-    #     'model.gated_cross_attn_layers.0': 3,
-    #     'model.gated_cross_attn_layers.1': 3,
-    #     'model.gated_cross_attn_layers.2': 3,
-    #     'model.gated_cross_attn_layers.3': 3,
-    #     'model.gated_cross_attn_layers.4': 3,
-    #     'model.gated_cross_attn_layers.5': 3,
-    #     'model.gated_cross_attn_layers.6': 3,
-    #     'model.gated_cross_attn_layers.7': 3,
-    #     'model.gated_cross_attn_layers.8': 4,
-    #     'model.gated_cross_attn_layers.9': 4,
-    #     'model.gated_cross_attn_layers.10': 4,
-    #     'model.gated_cross_attn_layers.11': 4,
-    #     'model.gated_cross_attn_layers.12': 4,
-    #     'model.gated_cross_attn_layers.13': 4,
-    #     'model.gated_cross_attn_layers.14': 4,
-    #     'model.gated_cross_attn_layers.15': 4,
-    #     'model.norm': 4,
-    #     'lm_head': 4
-    # }    which means there is a lot of things going around between the gated cross attention layers and the LM layers...
-    return tokenizer, model
-
-
-MODEL_TO_SPACE_MAPPING = {}
-IS_MAIN_SPACE = CURRENT_MODEL not in MODEL_TO_MODEL_CLASS
-if IS_MAIN_SPACE:
-    for model in MODEL_TO_MODEL_CLASS:
-        MODEL_TO_SPACE_MAPPING[model] = gr.Blocks.load(
-            name=f"spaces/HuggingFaceM4/{model}", api_key=os.getenv("HF_AUTH_TOKEN", True)
-        )
-else:
-    model_path = f"HuggingFaceM4/{CURRENT_MODEL}"
-    tokenizer, model = load_tokenizer_model(model_path, MODEL_TO_MODEL_CLASS[CURRENT_MODEL])
-
-
-def fetch_images(url_images):
-    images = []
-    for url in url_images:
-        if isinstance(url, str):
-            images.append(Image.open(BytesIO(requests.get(url, stream=True).content)))
-        else:
-            images.append(url)
-    return images
-
-
-def model_generation(
-    prompt,
-    images,
-    tokenizer,
-    model,
-    temperature,
-    no_repeat_ngram_size,
-    max_new_tokens,
-    min_length,
-    ban_tokens,
-    eos_tokens,
-    force_words,
-    repetition_penalty,
-    hide_special_tokens,
-    decoding_strategy,
-    num_beams,
-    length_penalty,
-    top_k,
-    top_p,
-    penalty_alpha,
-):
-    # Preparing inputs
-    tokens = tokenizer(
-        [prompt],
-        truncation=True,
-        max_length=MAX_SEQ_LEN,
-        padding=True,
-        add_special_tokens=False,
-    )
-
-    input_ids = torch.tensor([[tokenizer.bos_token_id] + tokens.input_ids[0]])
-    attention_mask = torch.tensor([[1] + tokens.attention_mask[0]])
-
-    image_attention_mask = [
-        incremental_to_binary_attention_mask(
-            image_attention_mask_for_packed_input_ids(input_ids[0].unsqueeze(0), tokenizer)[0], num_classes=len(images)
-        )
-    ]
-
-    image_transform = build_image_transform(eval=True)
-    pixel_values = [torch.stack([image_transform(img) for img in images])]
-
-    input_ids = input_ids.to(0)
-    attention_mask = attention_mask.to(0)
-    pixel_values = torch.stack(pixel_values).to(0)
-    image_attention_mask = torch.cat(image_attention_mask, 0).to(0)
-
-    # Excluding some words from the generation
-    bad_words_ids = None
-    ban_tokens = ban_tokens.replace("\\n", "\n")
-    bad_words = ban_tokens.split(";")
-    if len(bad_words) > 0:
-        bad_words_ids = tokenizer(bad_words, add_special_tokens=False).input_ids
-
-    # Forcing some words in the generation
-    force_words_ids = None
-    if force_words != "":
-        force_words = force_words.replace("\\n", "\n")
-        force_words = force_words.split(";")
-        if len(force_words) > 0:
-            force_words_ids = tokenizer(force_words, add_special_tokens=False).input_ids
-
-    eos_token_ids = None
-    if eos_tokens != "":
-        eos_tokens = eos_tokens.replace("\\n", "\n")
-        eos_tokens = eos_tokens.split(";")
-        if len(eos_tokens) > 0:
-            eos_token_ids = []
-            for eos_token in eos_tokens:
-                tokenized_eos_token = tokenizer(eos_token, add_special_tokens=False).input_ids
-                if len(tokenized_eos_token) > 1:
-                    raise ValueError(
-                        f"eos_tokens should be one token, here {eos_token} is {len(tokenized_eos_token)} tokens:"
-                        f" {tokenized_eos_token}"
-                    )
-                eos_token_ids += tokenized_eos_token
-
-    # Inputs
-    input_args = {
-        "input_ids": input_ids,
-        "attention_mask": attention_mask,
-        "pixel_values": pixel_values,
-        "image_attention_mask": image_attention_mask,
-    }
-    # Common parameters to all decoding strategies
-    # This documentation is useful to read: https://huggingface.co/docs/transformers/main/en/generation_strategies
-    generation_args = {
-        "temperature": temperature,
-        "no_repeat_ngram_size": no_repeat_ngram_size,
-        "max_new_tokens": max_new_tokens,
-        "min_length": min_length,
-        "bad_words_ids": bad_words_ids,
-        "force_words_ids": force_words_ids,
-        "repetition_penalty": repetition_penalty,
-        "eos_token_id": eos_token_ids,
-    }
-
-    assert decoding_strategy in [
-        "greedy",
-        "beam_search",
-        "beam_sampling",
-        "sampling_top_k",
-        "sampling_top_p",
-        "contrastive_sampling",
-    ]
-    if decoding_strategy == "greedy":
-        pass
-    elif decoding_strategy == "beam_search":
-        generation_args["num_beams"] = num_beams
-        generation_args["length_penalty"] = length_penalty
-        assert generation_args["num_beams"] > 1
-    elif decoding_strategy == "beam_sampling":
-        generation_args["num_beams"] = num_beams
-        generation_args["length_penalty"] = length_penalty
-        generation_args["do_sample"] = True
-        assert generation_args["num_beams"] > 1
-    elif decoding_strategy == "sampling_top_k":
-        generation_args["do_sample"] = True
-        generation_args["top_k"] = top_k
-    elif decoding_strategy == "sampling_top_p":
-        generation_args["do_sample"] = True
-        generation_args["top_p"] = top_p
-    elif decoding_strategy == "contrastive_sampling":
-        generation_args["do_sample"] = True
-        generation_args["penalty_alpha"] = penalty_alpha
-        generation_args["top_k"] = top_k
-
-    generated_tokens = model.generate(
-        **input_args,
-        **generation_args,
-    )
-    tokens = tokenizer.convert_ids_to_tokens(generated_tokens[0])
-    decoded_skip_special_tokens = repr(
-        tokenizer.batch_decode(generated_tokens, skip_special_tokens=hide_special_tokens)[0]
-    )
-    decoded = repr(tokenizer.batch_decode(generated_tokens)[0])
-    logger.info(
-        "Result: \n"
-        f"Prompt: `{prompt}`\n"
-        f"Tokens ids from prompt + generation: `{generated_tokens[0].tolist()}`\n"
-        f"Tokens (converted) from prompt + generation: `{tokens}`\n"
-        f"String decoded with skipped special tokens: `{decoded_skip_special_tokens}`\n"
-        f"String decoded: `{decoded}`\n"
-        f"Generation mode: `{decoding_strategy}`\n"
-        f"Generation parameters: `{generation_args}`\n"
-    )
-
-    original_prompt = generated_tokens[:, : input_ids.shape[-1]]
-    actual_generated_tokens = generated_tokens[:, input_ids.shape[-1] :]
-
-    first_end_token = len(actual_generated_tokens[0])
-    actual_generated_tokens = actual_generated_tokens[:, :first_end_token]
-    displayed_tokens = torch.cat([original_prompt, actual_generated_tokens], dim=-1)
-    generated_text = tokenizer.batch_decode(displayed_tokens, skip_special_tokens=hide_special_tokens)[0]
-    return generated_text
-
-
-def model_inference(
-    files,
-    prompt,
-    temperature,
-    no_repeat_ngram_size,
-    max_new_tokens,
-    min_length,
-    ban_tokens,
-    eos_tokens,
-    force_words,
-    repetition_penalty,
-    hide_special_tokens,
-    decoding_strategy,
-    num_beams,
-    length_penalty,
-    top_k,
-    top_p,
-    penalty_alpha,
-):
-    if isinstance(files, str) and len(files) == 0:
-        files = None
-
-    prompt = prompt.strip()
-    prompt = prompt.replace("\\n", "\n")
-    file_idx = 0
-    url_images = re.findall(r"<image(.*?)>", prompt)
-    for idx, url_image in enumerate(url_images):
-        if len(url_image) == 0:
-            url_images[idx] = Image.open(files[file_idx].name if hasattr(files[file_idx], "name") else files[file_idx])
-            file_idx += 1
-        else:
-            prompt = prompt.replace(url_image, "")
-            url_images[idx] = url_images[idx][1:]
-    images = fetch_images(url_images)
-
-    global model, tokenizer
-
-    generated_text = model_generation(
-        prompt=prompt,
-        images=images,
-        tokenizer=tokenizer,
-        model=model,
-        temperature=temperature,
-        no_repeat_ngram_size=no_repeat_ngram_size,
-        max_new_tokens=max_new_tokens,
-        min_length=min_length,
-        ban_tokens=ban_tokens,
-        eos_tokens=eos_tokens,
-        force_words=force_words,
-        repetition_penalty=repetition_penalty,
-        hide_special_tokens=hide_special_tokens,
-        decoding_strategy=decoding_strategy,
-        num_beams=num_beams,
-        length_penalty=length_penalty,
-        top_k=top_k,
-        top_p=top_p,
-        penalty_alpha=penalty_alpha,
-    )
-    return generated_text.strip()
-
-
-def try_model_inference(
-    model,
-    files,
-    prompt,
-    temperature,
-    no_repeat_ngram_size,
-    max_new_tokens,
-    min_length,
-    ban_tokens,
-    eos_tokens,
-    force_words,
-    repetition_penalty,
-    hide_special_tokens,
-    decoding_strategy,
-    num_beams,
-    length_penalty,
-    top_k,
-    top_p,
-    penalty_alpha,
-):
-    count = 0
-    while count < MAX_TRIES:
-        try:
-            return MODEL_TO_SPACE_MAPPING[model](
-                files,
-                prompt,
-                temperature,
-                no_repeat_ngram_size,
-                max_new_tokens,
-                min_length,
-                ban_tokens,
-                eos_tokens,
-                force_words,
-                repetition_penalty,
-                hide_special_tokens,
-                decoding_strategy,
-                num_beams,
-                length_penalty,
-                top_k,
-                top_p,
-                penalty_alpha,
-                api_name="model_inference",
-            )
-        except KeyError:
-            # Gradio return {'error': None} some times.
-            time.sleep(3)
-            count += 1
-            pass
-
-
-def all_model_inference(
-    prompt,
-    temperature,
-    no_repeat_ngram_size,
-    max_new_tokens,
-    min_length,
-    ban_tokens,
-    eos_tokens,
-    force_words,
-    repetition_penalty,
-    hide_special_tokens,
-    decoding_strategy,
-    num_beams,
-    length_penalty,
-    top_k,
-    top_p,
-    penalty_alpha,
-):
-    outputs = []
-    print(
-        prompt,
-        temperature,
-        no_repeat_ngram_size,
-        max_new_tokens,
-        min_length,
-        ban_tokens,
-        eos_tokens,
-        force_words,
-        repetition_penalty,
-        hide_special_tokens,
-        decoding_strategy,
-        num_beams,
-        length_penalty,
-        top_k,
-        top_p,
-        penalty_alpha,
-    )
-    outputs = Parallel(n_jobs=len(MODEL_TO_SPACE_MAPPING), backend="threading")(
-        delayed(try_model_inference)(
-            model,
-            os.path.join(os.path.dirname(__file__), "images", "bear.jpg"),
-            prompt,
-            temperature,
-            no_repeat_ngram_size,
-            max_new_tokens,
-            min_length,
-            ban_tokens,
-            eos_tokens,
-            force_words,
-            repetition_penalty,
-            hide_special_tokens,
-            decoding_strategy,
-            num_beams,
-            length_penalty,
-            top_k,
-            top_p,
-            penalty_alpha,
-        )
-        for model in MODEL_TO_SPACE_MAPPING
-    )
-    if len(outputs) == 1:
-        outputs = outputs[0]
-    return outputs
-
-
-examples = [
-    [
-        None,
-        """The following is a conversation between a highly knowledgeable and intelligent AI assistant, called Assistant, and a human user, called User. In the following interactions, User and Assistant will converse in natural language, and Assistant will do its best to answer User’s questions. Assistant was built to be respectful, polite and inclusive. It knows a lot, and always tells the truth. When prompted with an image, it does not make up facts.
-
-The conversation begins:
-User:<fake_token_around_image><image:https://i1.pickpik.com/photos/515/906/355/kittens-cats-pet-cute-preview.jpg><fake_token_around_image>Describe this image.</s>
-Assistant: Five kittens are standing together in the center of the photograph. From the left to right, there is one orange kitte, two kittens white and black stripes, and two brown kittens with an orange head. They are in bright green grass and it looks like they are moving forward.</s>
-User:How about this image?<fake_token_around_image><image:https://cdn.pixabay.com/photo/2017/09/25/13/12/puppy-2785074_1280.jpg><fake_token_around_image>Can you describe it too?</s>
-Assistant: A dog is lying on the floor, looking at the camera. It is looking directly at you. The dog has a white body and brown patches on its face and ears. Its eyes are dark. Its nose is black, and it has long, floppy ears, white paws, long fur, big eyes.</s>
-User: What kind of breed is it?</s>
-Assistant: It looks like a Cavalier King Charles Spaniel. These breeds are known for their gentle, affectionate nature. Their long ears and big, brown eyes give them a sweet, expressive face. Cavaliers are small dogs weighing between 13 and 20 pounds. They have a silky, medium-length coat that comes in various colors, including black and tan, tricolor, red and white, or black.</s>
-User: What can you tell me about this breed of dogs?</s>
-Assistant: One specific characteristic of the Cavalier King Charles Spaniel is its friendly and affectionate nature. This breed is known for being extremely sociable and forming strong bonds with their owners. They are often described as "velcro dogs" because they love to be close to their human companions, whether it's sitting on their laps, cuddling on the couch, or simply following them around the house.</s>
-User: ghjkhjabnufs</s>Assistant: That doesn’t seem to be a word. Could you ask me another way?</s>
-User: Do you like Cavalier King Charles Spaniel?</s>
-Assistant: I do not have personal opinions as I’m just a computer program. However, cavaliers are known for being excellent family pets due to their gentle and patient demeanor, making them great with children and other pets. Their affectionate nature also makes them ideal therapy dogs, as they can provide comfort and emotional support to those in need.</s>
-User: How many dogs do you see in this image?<fake_token_around_image><image:https://i.dailymail.co.uk/i/pix/2011/07/01/article-2010308-0CD22A8300000578-496_634x414.jpg><fake_token_around_image></s>
-Assistant: There is no dogs in this image. The picture shows a tennis player jumping to volley the ball.</s>
-User:<fake_token_around_image><image:https://m.media-amazon.com/images/M/MV5BMjE4MTcwMTM1Nl5BMl5BanBnXkFtZTcwMTIwMzMzMw@@._V1_.jpg><fake_token_around_image>Describe all of the parts of this image.</s>
-Assistant:""",
-        1.0,
-        0,
-        256,
-        10,
-        "<image>;<fake_token_around_image>",
-        "</s>;User",
-        "",
-        1.0,
-        False,
-        "greedy",
-        1,
-        1.0,
-        50,
-        0.5,
-        0.95,
-    ],
-    #     [
-    #         None,
-    #         """This is a conversation between a human, User, and an intelligent visual AI, Bot. User sends images, and Bot answer the questions from the user.
-    # User: <fake_token_around_image><image:https://m.media-amazon.com/images/M/MV5BMjE4MTcwMTM1Nl5BMl5BanBnXkFtZTcwMTIwMzMzMw@@._V1_.jpg><fake_token_around_image>
-    # Describe this image.
-    # Bot:""",
-    #         1,
-    #         2,
-    #         64,
-    #         10,
-    #         "<image>;<fake_token_around_image>;User;user;Bot;bot;Question;question;Answer;answer;\n",
-    #         False,
-    #         False,
-    #         True,
-    #     ],
-    #     [
-    #         None,
-    #         """This is a conversation between a human, User, and an intelligent visual AI, Bot. User sends images, and Bot answer the questions from the user.
-    # User: <fake_token_around_image><image:https://i.redd.it/hsktcp4nv1g01.jpg><fake_token_around_image>
-    # Why do people find this image funny?
-    # Bot:""",
-    #         1,
-    #         2,
-    #         64,
-    #         10,
-    #         "<image>;<fake_token_around_image>;User;user;Bot;bot;Question;question;Answer;answer;\n",
-    #         False,
-    #         False,
-    #         True,
-    #     ],
-    #     [
-    #         None,
-    #         """This is a conversation between a human, User, and an intelligent visual AI, Bot. User sends images, and Bot answer the questions from the user.
-    # User: <fake_token_around_image><image:https://pbs.twimg.com/media/FooD7oyakAIU5_Q?format=jpg&name=large><fake_token_around_image>
-    # Describe what's in this image.
-    # Bot:""",
-    #         1,
-    #         2,
-    #         64,
-    #         10,
-    #         "<image>;<fake_token_around_image>;User;user;Bot;bot;Question;question;Answer;answer;\n",
-    #         False,
-    #         False,
-    #         True,
-    #     ],
-    #     [
-    #         None,
-    #         """This is a conversation between a human, User, and an intelligent visual AI, Bot. User sends images, and Bot answer the questions from the user.
-    # User: <fake_token_around_image><image:https://www.tutorialride.com/images/non-verbal-analogy-questions/non-verbal-analogy-logical-reasoning-1.jpg><fake_token_around_image>
-    # What's the correct answer? A, B, C or D?
-    # Bot:""",
-    #         1,
-    #         2,
-    #         64,
-    #         10,
-    #         "<image>;<fake_token_around_image>;User;user;Bot;bot;Question;question;Answer;answer;\n",
-    #         False,
-    #         False,
-    #         True,
-    #     ],
-]
-
-
-MSG_MAIN = """
-# Text generation with Vllama models
-
-### Help to write prompts:
-
-Put the urls to the images inside the image tokens, it will be converted into the real image tokens. Put <fake_token_around_image> before and after each image token WITHOUT space. The texts \\n will be converted into real newline characters. See examples and additional details below.
-"""
-# MSG_DETAILS = """
-# ### Additional details
-# - if the model was trained with the template 1 (`\\n\\n<image>\\n\\n`), then `<fake_token_around_image>` will be replaced with `\\n\\n`. This is particularly useful if you are comparing the performance of different models trained with different templates.
-# - special tokens are not automatically added to the prompt, so add them manually.
-# - with the first template `\\n\\n<image>\\n\\n` , the sequence isn't necessary tokenized as `["\\n\\n", "<image>", "\\n\\n"]` to enforce this behavior, you can use the "Integrate image sequence as ids" parameter.
-# """
-# if ~IS_MAIN_SPACE:
-#     MSG_DETAILS += (
-#         "- alternatively, you can upload images and then directly specify them via \<image\> tag in the prompt."
-#     )
-
-with gr.Blocks() as demo:
-    gr.Markdown(MSG_MAIN)
-    with gr.Row():
-        with gr.Column():
-            gr.Markdown("## Input")
-            if not IS_MAIN_SPACE:
-                images = gr.File(label="Images", file_count="multiple")
-            prompt = gr.Textbox(label="Prompt", placeholder="Enter the prompt here")
-
-            gr.Markdown("## Common parameters to all decoding strategy")
-            temperature = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, value=1.0, label="Softmax temperature")
-            no_repeat_ngram_size = gr.Slider(
-                minimum=0,
-                maximum=10,
-                step=1,
-                value=0,
-                label="The size of an n-gram that cannot occur more than once (0=infinity)",
-            )
-            max_new_tokens = gr.Slider(
-                minimum=0, maximum=512, step=1, value=256, label="Maximum number of new tokens to generate"
-            )
-            min_length = gr.Slider(
-                minimum=0, maximum=512, step=1, value=16, label="Minimum length of the sequence to be generated"
-            )
-            ban_tokens = gr.Textbox(
-                label='Tokens to prevent from being generated (separated by ";")',
-                value="<image>;<fake_token_around_image>",
-            )
-            eos_tokens = gr.Textbox(label="EOS tokens", value="</s>")
-            force_words = gr.Textbox(label='Force words to be generated (separated by ";")', value="")
-            repetition_penalty = gr.Slider(
-                minimum=0, maximum=10, step=0.01, value=1, label="repetition_penalty. CTRL paper suggests 1.2."
-            )
-            hide_special_tokens = gr.Checkbox(label="Hide special tokens in the text", value=False)
-
-            gr.Markdown("## Decoding strategy and its specific parameters")
-            decoding_strategy = gr.Dropdown(
-                ["greedy", "beam_search", "beam_sampling", "sampling_top_k", "sampling_top_p", "contrastive_sampling"],
-                label="Decoding strategy",
-                value="greedy",
-            )
-            num_beams = gr.Slider(
-                minimum=0,
-                maximum=10,
-                step=1,
-                value=3,
-                label="Beam size",
-                info="Only used if `decoding_strategy` is `beam_search` or `beam_sampling`",
-            )
-            length_penalty = gr.Slider(
-                minimum=-1000,
-                maximum=1000,
-                step=0.1,
-                value=1,
-                label=(
-                    "length_penalty > 0.0 promotes longer sequences, while length_penalty < 0.0 encourages shorter"
-                    " sequences. Only used if `decoding_strategy` is `beam_search` or `beam_sampling`"
-                ),
-            )
-            top_k = gr.Slider(
-                minimum=0,
-                maximum=500,
-                step=1,
-                value=50,
-                label="Top k",
-                info="Only used if `decoding_strategy` is `sampling_top_k` or `contrastive_sampling`",
-            )
-            top_p = gr.Slider(
-                minimum=0,
-                maximum=1,
-                step=0.01,
-                value=0.95,
-                label="Top p",
-                info="Only used if `decoding_strategy` is `sampling_top_p`",
-            )
-            penalty_alpha = gr.Slider(
-                minimum=0,
-                maximum=1,
-                step=0.01,
-                value=0.95,
-                label="Penalty alpha",
-                info="Only used if `decoding_strategy` is `contrastive_sampling`",
-            )
-
-            submit = gr.Button(label="Generate")
-
-        with gr.Column():
-            if IS_MAIN_SPACE:
-                outputs = [
-                    gr.Textbox(label=MODEL_TO_DISPLAY_NAME[model], multiline=True, readonly=True)
-                    for model in MODEL_TO_MODEL_CLASS
-                ]
-                inference_func = all_model_inference
-                inputs = [
-                    prompt,
-                    temperature,
-                    no_repeat_ngram_size,
-                    max_new_tokens,
-                    min_length,
-                    ban_tokens,
-                    eos_tokens,
-                    force_words,
-                    repetition_penalty,
-                    hide_special_tokens,
-                    decoding_strategy,
-                    num_beams,
-                    length_penalty,
-                    top_k,
-                    top_p,
-                    penalty_alpha,
-                ]
-
-                examples = [example[1:] for example in examples]
-            else:
-                outputs = gr.Textbox(label="Generated text", interactive=False)
-                inference_func = model_inference
-                inputs = [
-                    images,
-                    prompt,
-                    temperature,
-                    no_repeat_ngram_size,
-                    max_new_tokens,
-                    min_length,
-                    ban_tokens,
-                    eos_tokens,
-                    force_words,
-                    repetition_penalty,
-                    hide_special_tokens,
-                    decoding_strategy,
-                    num_beams,
-                    length_penalty,
-                    top_k,
-                    top_p,
-                    penalty_alpha,
-                ]
-    with gr.Row():
-        gr.Examples(inputs=inputs, examples=examples)
-        # gr.Markdown(MSG_DETAILS)
-
-        submit.click(inference_func, inputs=inputs, outputs=outputs, api_name="model_inference")
-
-demo.queue()
-demo.launch()
diff --git a/m4/__init__.py b/m4/__init__.py
deleted file mode 100644
index 1fc6df20b20f1e897203857d934c3915165c19c3..0000000000000000000000000000000000000000
--- a/m4/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-from m4.utils import logging
diff --git a/m4/models/__init__.py b/m4/models/__init__.py
deleted file mode 100644
index cb2277c14721e21c31fc850834931f43eda390c1..0000000000000000000000000000000000000000
--- a/m4/models/__init__.py
+++ /dev/null
@@ -1,28 +0,0 @@
-from m4.models.custom_modules import DecoupledEmbedding, DecoupledLinear
-from m4.models.vbloom.configuration_vbloom import VBloomConfig
-from m4.models.vbloom.modeling_vbloom import VBloomForCausalLM
-from m4.models.vgpt2.configuration_vgpt2 import VGPT2Config
-from m4.models.vgpt2.modeling_vgpt2 import VGPT2LMHeadModel
-from m4.models.vllama.configuration_vllama import VLlamaConfig
-from m4.models.vllama.modeling_vllama import VLlamaForCausalLM
-from m4.models.vopt.configuration_vopt import VOPTConfig
-from m4.models.vopt.modeling_vopt import VOPTForCausalLM
-from m4.models.vt5.configuration_vt5 import VT5Config
-from m4.models.vt5.modeling_vt5 import VT5ForConditionalGeneration
-
-
-_SUPPORTED_MODELS = {
-    "vgpt2": VGPT2Config,
-    "vt5": VT5Config,
-    "vbloom": VBloomConfig,
-    "vopt": VOPTConfig,
-    "vllama": VLlamaConfig,
-}
-
-model_type_to_modeling_class = {
-    "vgpt2": VGPT2LMHeadModel,
-    "vt5": VT5ForConditionalGeneration,
-    "vbloom": VBloomForCausalLM,
-    "vopt": VOPTForCausalLM,
-    "vllama": VLlamaForCausalLM,
-}
diff --git a/m4/models/common.py b/m4/models/common.py
deleted file mode 100644
index d2dd894bdc9845ec49abcf325d85517e48b480e7..0000000000000000000000000000000000000000
--- a/m4/models/common.py
+++ /dev/null
@@ -1,104 +0,0 @@
-import torch
-
-
-def expand_inputs_for_generation(
-    input_ids,
-    expand_size=1,
-    is_encoder_decoder=False,
-    attention_mask=None,
-    encoder_outputs=None,
-    **model_kwargs,
-):
-    expanded_return_idx = (
-        torch.arange(input_ids.shape[0]).view(-1, 1).repeat(1, expand_size).view(-1).to(input_ids.device)
-    )
-    input_ids = input_ids.index_select(0, expanded_return_idx)
-
-    if "token_type_ids" in model_kwargs:
-        token_type_ids = model_kwargs["token_type_ids"]
-        model_kwargs["token_type_ids"] = token_type_ids.index_select(0, expanded_return_idx)
-
-    if attention_mask is not None:
-        model_kwargs["attention_mask"] = attention_mask.index_select(0, expanded_return_idx)
-        model_kwargs["image_attention_mask"] = model_kwargs["image_attention_mask"].index_select(
-            0, expanded_return_idx
-        )
-        model_kwargs["pixel_values"] = model_kwargs["pixel_values"].index_select(0, expanded_return_idx)
-
-    if is_encoder_decoder:
-        if encoder_outputs is None:
-            raise ValueError("If `is_encoder_decoder` is True, make sure that `encoder_outputs` is defined.")
-        encoder_outputs["last_hidden_state"] = encoder_outputs.last_hidden_state.index_select(
-            0, expanded_return_idx.to(encoder_outputs.last_hidden_state.device)
-        )
-        model_kwargs["encoder_outputs"] = encoder_outputs
-    return input_ids, model_kwargs
-
-
-def update_model_kwargs_for_generation(outputs, model_kwargs, is_encoder_decoder=False):
-    # must have this key set to at least None
-    model_kwargs["past_key_values"] = model_kwargs.get("past_key_values", None)
-
-    # update past
-    if "past_key_values" in outputs:
-        model_kwargs["past"] = outputs.past_key_values
-    elif "mems" in outputs:
-        model_kwargs["past"] = outputs.mems
-    elif "past_buckets_states" in outputs:
-        model_kwargs["past"] = outputs.past_buckets_states
-    else:
-        model_kwargs["past"] = None
-
-    # update token_type_ids with last value
-    if "token_type_ids" in model_kwargs:
-        token_type_ids = model_kwargs["token_type_ids"]
-        model_kwargs["token_type_ids"] = torch.cat([token_type_ids, token_type_ids[:, -1].unsqueeze(-1)], dim=-1)
-
-    # update attention masks
-    if not is_encoder_decoder:
-        if "attention_mask" in model_kwargs:
-            attention_mask = model_kwargs["attention_mask"]
-            model_kwargs["attention_mask"] = torch.cat(
-                [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1
-            )
-        if "image_attention_mask" in model_kwargs:
-            image_attention_mask = model_kwargs["image_attention_mask"]
-            last_mask = image_attention_mask[:, -1, :].unsqueeze(1)
-            model_kwargs["image_attention_mask"] = last_mask
-
-    return model_kwargs
-
-
-def prepare_inputs_for_generation(input_ids, past=None, **kwargs):
-    token_type_ids = kwargs.get("token_type_ids", None)
-    # only last token for inputs_ids if past is defined in kwargs
-    if past:
-        input_ids = input_ids[:, -1].unsqueeze(-1)
-        if token_type_ids is not None:
-            token_type_ids = token_type_ids[:, -1].unsqueeze(-1)
-
-    attention_mask = kwargs.get("attention_mask", None)
-    position_ids = kwargs.get("position_ids", None)
-
-    if attention_mask is not None and position_ids is None:
-        # create position_ids on the fly for batch generation
-        position_ids = attention_mask.long().cumsum(-1) - 1
-        position_ids.masked_fill_(attention_mask == 0, 1)
-        if past:
-            position_ids = position_ids[:, -1].unsqueeze(-1)
-
-    pixel_values = kwargs.get("pixel_values", None)
-    image_attention_mask = kwargs.get("image_attention_mask", None)
-    if pixel_values is None or image_attention_mask is None:
-        raise ValueError("pixel values and image attention mask cannot be None")
-
-    return {
-        "input_ids": input_ids,
-        "past_key_values": past,
-        "use_cache": kwargs.get("use_cache"),
-        "position_ids": position_ids,
-        "attention_mask": attention_mask,
-        "token_type_ids": token_type_ids,
-        "pixel_values": pixel_values,
-        "image_attention_mask": image_attention_mask,
-    }
diff --git a/m4/models/custom_modules.py b/m4/models/custom_modules.py
deleted file mode 100644
index dd93d389014903a9b646c2669fd2de35f4a29a18..0000000000000000000000000000000000000000
--- a/m4/models/custom_modules.py
+++ /dev/null
@@ -1,337 +0,0 @@
-import os
-
-import torch
-import torch.nn.functional as F
-from torch import nn
-from transformers import AutoConfig, AutoModel, PretrainedConfig, PreTrainedModel
-from transformers.utils import ContextManagers
-
-from m4.training.setup_vision_model import vision_model_name_to_model
-from m4.training.utils import (
-    deepspeed_zero_init_disabled_context_manager,
-    is_deepspeed_zero_init_enabled,
-    load_state_dict_into_model,
-)
-
-
-# from pathlib import Path
-
-
-class VLOOMPreTrainedModelBase(PreTrainedModel):
-    # The problem we are trying to solve is 2 nested zero.Init thanks to fetching from_pretrained(vision_model_name)
-    # and then one more zero.Init to override from_pretrained(vision_model_name) once again as it was done in the original - this breaks deepspeed zero3 w/ zero.Init
-    # So one solution is this:
-    # a. replace  from_pretrained(vision_model_name) with from_config(vision_model_name) while hacking to disable zero.Init context
-    # b. instead of straight replacement of model.vision_model = from_pretrained(vision_model_name) when it gets updated, we first do from_pretrained(vision_model_name) and then update the existing model with weights using the already zero.Init'ed pre-sharded weights
-    #
-    # there are a few variations to get_vision_model_from_config - all need to bypass zero.Init under zero3
-    # 1. one variant is to hack into accelerate's deepspeed_plugin and turn off zero.Init while loading the vision model
-    # 2. the other variant is to override _from_config method with our version that doesn't do zero.Init
-
-    @classmethod
-    def override_vision_model(cls, model, vision_model_name, vision_model_params, torch_dtype):
-        # 1. fetch the pretrained vision model w/o zero.Init
-        with ContextManagers(deepspeed_zero_init_disabled_context_manager()):
-            vision_model = AutoModel.from_pretrained(vision_model_name, **vision_model_params, torch_dtype=torch_dtype)
-
-        # this extracts the desired submodule if the part we want is nested (e.g. as in clip)
-        real_vision_model = vision_model_name_to_model(vision_model_name, vision_model)
-
-        # 2. now override the weights already sharded by zero.Init with the weights from the real_vision_model
-        # by gradually gathering sharded weights and replacing with new weights
-        if is_deepspeed_zero_init_enabled():
-            state_dict = real_vision_model.state_dict()
-            load_state_dict_into_model(model.vision_model, state_dict, start_prefix="")
-        else:
-            model.vision_model = real_vision_model
-
-    @classmethod
-    def from_config(cls, config, **kwargs):
-        # torch_dtype is crucial for using the minimal amount of memory at load time
-        torch_dtype = kwargs.get("torch_dtype", None)
-
-        vision_model_name = config.vision_model_name
-        vision_model_params = eval(config.vision_model_params)
-
-        # 1. create an uninitialized vision_model to insert into the main model.
-        # It has to be created outside lm's `from_pretrained` and w/o zero.Init so that zero3+zero.Init works
-        with ContextManagers(deepspeed_zero_init_disabled_context_manager()):
-            vision_model_config = AutoConfig.from_pretrained(vision_model_name, **vision_model_params)
-            vision_model_from_config = AutoModel.from_config(vision_model_config, torch_dtype=torch_dtype)
-        # this extracts the desired submodule if the part we want is nested (e.g. as in clip)
-        kwargs["vision_model"] = vision_model_name_to_model(vision_model_name, vision_model_from_config)
-
-        # 2. create the main class's model, passing the uninitialized vision_model to it
-        model = cls(config, **kwargs)
-
-        return model
-
-    @classmethod
-    def from_pretrained_models(cls, *args, **kwargs):
-        """
-        Use this method when creating a new vloom model that hasn't been yet trained and it'll be
-        composed of 2 pre-trained models - hence `pretrained_models`.
-        """
-
-        return cls.from_pretrained(*args, **kwargs, new_model=True)
-
-    @classmethod
-    def from_pretrained(cls, *model_args, is_resume=False, new_model=False, **kwargs):
-        """
-        Use this method when loading an already pretrained vloom model - either from a checkpoint or from hub.
-        For creating an untrained model use `pretrained_models` instead.
-        """
-
-        is_untrained_vloom_model = False
-        is_pretrained_vloom_model_resumed = False
-        is_pretrained_vloom_model_from_hub_or_path = False
-
-        # we have 3 use cases:
-        # 1. is_untrained_vloom_model - a totally new vloom model
-        # 2. is_pretrained_vloom_model_resumed - a pretrained vloom model being resumed from a
-        #    checkpoint (instantiate a random empty model in this case)
-        # 3. is_pretrained_vloom_model_from_hub_or_path - a pretrained vloom model loaded from hub or local path
-        if new_model:
-            is_untrained_vloom_model = True
-        elif is_resume:
-            is_pretrained_vloom_model_resumed = True
-        else:
-            is_pretrained_vloom_model_from_hub_or_path = True
-
-        # torch_dtype is crucial for using the minimal amount of memory at load time
-        torch_dtype = kwargs.get("torch_dtype", None)
-
-        # config is:
-        # 1. either not passed and then we use the model's default config (used by tests)
-        # 2. passed and in which case it's one of:
-        #   2a. `PretrainedConfig` (a new m4 model)
-        #   2b. path to a json config (an already pretrained m4 model, usually resumed training)
-        config = kwargs.get("config", None)
-        if config is None:
-            config = cls.config_class.from_pretrained(*model_args, **kwargs, return_unused_kwargs=False)
-        elif not isinstance(config, PretrainedConfig):
-            # adapted from https://github.com/huggingface/transformers/blob/d0acc9537829e7d067edbb791473bbceb2ecf056/src/transformers/modeling_utils.py#L1920
-            assert isinstance(config, os.PathLike)
-            config_path = str(config)
-            config = cls.config_class.from_pretrained(
-                config_path,
-                return_unused_kwargs=False,
-                **kwargs,
-            )
-
-        vision_model_name = config.vision_model_name
-        vision_model_params = eval(config.vision_model_params)
-
-        # 1. create an uninitialized vision_model to insert into the main model.
-        # It has to be created outside lm's `from_pretrained` and w/o zero.Init so that zero3+zero.Init works
-        with ContextManagers(deepspeed_zero_init_disabled_context_manager()):
-            vision_model_config = AutoConfig.from_pretrained(vision_model_name, **vision_model_params)
-            vision_model_from_config = AutoModel.from_config(vision_model_config, torch_dtype=torch_dtype)
-        # this extracts the desired submodule if the part we want is nested (e.g. as in clip)
-        kwargs["vision_model"] = vision_model_name_to_model(vision_model_name, vision_model_from_config)
-
-        # 2. create the vloom model
-        if is_untrained_vloom_model or is_pretrained_vloom_model_from_hub_or_path:
-            model = super().from_pretrained(*model_args, **kwargs)
-        elif is_pretrained_vloom_model_resumed:
-            # in the case of resume under deepspeed we create an empty model, and get deepspeed
-            # to load the weights from the checkpoint
-            # but not all models have these keys so handle the case they don't have them
-            _ = kwargs.pop("config", None)
-            model = super().from_pretrained(None, config=config, state_dict={}, **kwargs)
-
-        # 3. if is_untrained_vloom_model, now override the uninitialized vision_model with one with pretrained weights
-        if is_untrained_vloom_model:
-            cls.override_vision_model_wrapper(model, config, vision_model_name, vision_model_params, torch_dtype)
-
-        return model
-
-
-class DecoupledEmbedding(nn.Embedding):
-    # Derived from https://pytorch.org/docs/stable/_modules/torch/nn/modules/sparse.html#Embedding
-    """
-    Implements a decoupling of parameters to allow freezing (or not) a subset of the embeddings.
-    In practise, the regular `weight` can be trained or frozen (i.e. `partially_freeze=True`), and if `num_additional_embeddings` > 0, then it will create `num_additional_embeddings` additional parameters that are always trained.
-    If `num_additional_embeddings=0`, then the module defaults back to the regular behavior of `nn.Embedding`.
-    """
-
-    def __init__(
-        self,
-        num_embeddings,
-        num_additional_embeddings,
-        embedding_dim,
-        partially_freeze=False,
-        device=None,
-        dtype=None,
-        padding_idx=None,
-        **kwargs,
-    ) -> None:
-        """
-        num_additional_embeddings: int. Number of additional embeddings. Only useful when you `partially_freeze=True`.
-        partially_freeze: bool. If True, the regular `weight` will be frozen. `additional_weight` is never frozen.
-
-        Note: there are a lot of other parameters to initialize a standard `nn.Embedding` such as `padding_idx`, `max_norm` or `norm_type`. We are not supporting these.
-        """
-        if padding_idx is not None and padding_idx > num_embeddings:
-            raise ValueError(f"padding_idx must be within num_embeddings. Got {padding_idx} and {num_embeddings}")
-        super().__init__(
-            num_embeddings=num_embeddings,
-            embedding_dim=embedding_dim,
-            device=device,
-            dtype=dtype,
-            padding_idx=padding_idx,
-            **kwargs,
-        )
-        self.num_embeddings = num_embeddings
-        self.padding_idx = padding_idx
-        self.num_additional_embeddings = num_additional_embeddings
-        self.partially_freeze = partially_freeze
-
-        if partially_freeze:
-            self.weight.requires_grad_(False)
-
-        if self.num_additional_embeddings > 0:
-            self.additional_embedding = nn.Embedding(
-                num_embeddings=self.num_additional_embeddings,
-                embedding_dim=embedding_dim,
-                device=device,
-                dtype=dtype,
-            )
-
-    def forward(self, input_ids):
-        """
-        we have 2 embeddings, with different indices - one pretrained self.weight and another
-        self.additional_embedding.weight that is being trained.
-
-        in order to make a lookup of the input ids, we:
-        1. find out the indices of the entries belonging to the 2nd embedding
-        2. extract those values while subtracting the size of the first embedding (num_embeddings),
-           since the 2nd embedding starts from 0 and not num_embeddings
-        3. perform the 2nd embedding lookup
-        4. now we handle the 1st embedding, we overwrite indices belonging to the 2nd embedding with a padding index
-        5. perform the 1st embedding lookup
-        6. now we overwrite the values in the 1st embedding lookup with the values of the 2nd embedding lookup
-
-        note: for the 1st embedding lookup we could have looked up only the low indices and not do
-        the padding, but then we have to create a new tensor and populate it with 2 tensors that are
-        spread out across various indices - i.e. not a simple concat - I haven't benchmarked the
-        complex case if it's any faster, given that seqlens are usually relatively short it's
-        probably not faster or if faster not by much - but might be a good idea to measure.
-
-        """
-        if self.num_additional_embeddings == 0:
-            return F.embedding(input_ids, self.weight)
-
-        # Clone so that we don't modify the original input_ids later on
-        input_ids = input_ids.clone()
-        additional_vocab_indices = torch.where(input_ids >= self.num_embeddings)
-        input_ids_additional_vocab = input_ids[additional_vocab_indices]
-        additional_embeddings = self.additional_embedding(input_ids_additional_vocab - self.num_embeddings)
-
-        # for successful lookup replace input_ids with 0, the results of these will be discarded anyway
-        input_ids[additional_vocab_indices] = 0
-        full_vector = F.embedding(input_ids, self.weight)
-
-        # overwrite the records with high indices
-        full_vector[additional_vocab_indices] = additional_embeddings
-
-        return full_vector
-
-    def extra_repr(self) -> str:
-        return "num_embeddings={}, num_additional_embeddings={}, embedding_dim={}, partially_freeze={}".format(
-            self.num_embeddings,
-            self.num_additional_embeddings,
-            self.embedding_dim,
-            self.partially_freeze,
-        )
-
-    @classmethod
-    def from_pretrained(cls, embeddings, freeze=True, **kwargs):
-        raise NotImplementedError
-
-
-class DecoupledLinear(nn.Linear):
-    # Derived from https://pytorch.org/docs/stable/_modules/torch/nn/modules/linear.html#Linear
-    """
-    Implements a decoupling of parameters to allow freezing (or not) a subset of the parameters.
-    In practise, the regular `weight` can be trained or frozen (i.e. `partially_freeze=True`), and if `out_additional_features` > 0, then it will create `out_additional_features * in_features` additional parameters that are always trained.
-    If `out_additional_features=0`, then the module defaults back to the regular behavior of `nn.Linear`.
-    """
-
-    def __init__(
-        self,
-        in_features: int,
-        out_features: int,
-        out_additional_features: int = 0,
-        bias: bool = True,
-        partially_freeze: bool = True,
-        device=None,
-        dtype=None,
-    ) -> None:
-        """
-        out_additional_features: int. Number of additional trainable dimensions. Only makes sense when `partially_freeze=True`.
-        partially_freeze: bool. If True, the regular `weight` will be frozen and extra parameters (if any) will be trainable. If False, default to the regular behavior of nn.Linear.
-        """
-        super().__init__(in_features, out_features, bias, device, dtype)
-        self.out_additional_features = out_additional_features
-        self.partially_freeze = partially_freeze
-
-        self.in_features = in_features
-        self.out_features = out_features
-
-        if partially_freeze:
-            self.weight.requires_grad_(False)
-            if bias:
-                self.bias.requires_grad_(False)
-
-        if out_additional_features > 0:
-            self.additional_fc = nn.Linear(
-                in_features=in_features,
-                out_features=out_additional_features,
-                bias=bias,
-                device=device,
-                dtype=dtype,
-            )
-
-    def forward(self, input: torch.Tensor) -> torch.Tensor:
-        output = F.linear(input, self.weight, self.bias)
-
-        if self.out_additional_features > 0:
-            additional_features = F.linear(input, self.additional_fc.weight, self.additional_fc.bias)
-            output = torch.cat((output, additional_features), -1)
-
-        return output
-
-    def extra_repr(self) -> str:
-        """Overwriting `nn.Linear.extra_repr` to include new parameters."""
-        return "in_features={}, out_features={}, out_additional_features={}, bias={}, partially_freeze={}".format(
-            self.in_features,
-            self.out_features,
-            self.out_additional_features,
-            self.bias is not None,
-            self.partially_freeze,
-        )
-
-
-if __name__ == "__main__":
-    emb = DecoupledEmbedding(num_embeddings=10, num_additional_embeddings=3, embedding_dim=5, partially_freeze=True)
-    for n, p in emb.named_parameters():
-        print(n, p.requires_grad)
-    idx = torch.tensor([[11, 1, 3]])
-    y = emb(idx)
-    loss = y.sum()
-    loss.backward()
-    print(emb.weight, emb.weight.grad)
-    print(emb.additional_embedding, emb.additional_embedding.grad)
-
-    lin = DecoupledLinear(in_features=3, out_features=4, out_additional_features=2, bias=True, partially_freeze=True)
-    for n, p in lin.named_parameters():
-        print(n, p.requires_grad)
-    x = torch.randn(12, 3)
-    y = lin(x)
-    loss = y.sum()
-    loss.backward()
-    print("Weight w and grad:", lin.weight, lin.weight.grad)
-    print("bias w and grad:", lin.bias, lin.bias.grad)
-    print("additional_fc.weight w and grad:", lin.additional_fc.weight, lin.additional_fc.weight.grad)
-    print("additional_bias w and grad:", lin.additional_fc.bias, lin.additional_fc.bias.grad)
diff --git a/m4/models/perceiver/perceiver.py b/m4/models/perceiver/perceiver.py
deleted file mode 100644
index 35238c266765e0545dd13a49412b55b2b4ca4323..0000000000000000000000000000000000000000
--- a/m4/models/perceiver/perceiver.py
+++ /dev/null
@@ -1,141 +0,0 @@
-"""
-perceiver.py
-Generic interface to various configurations of the Perceiver Resampler, that simply takes in a series of (potentially
-time-indexed) contextual embeddings, and "resamples" (compresses) them down to a pre-specified number of latents!
-Note that the Perceiver in general resamples based solely off the *long-range* context; there's a nice opportunity here
-to prime the Perceiver Resampler with say a single layer's worth of language embeddings (the target domain), and use
-that to softly "retrieve & compress" what we need --> this would be a novel contribution we should explore.
-References:
-    - DeepMind's Flamingo: https://www.deepmind.com/blog/tackling-multiple-tasks-with-a-single-visual-language-model
-    - Code borrowed w/ love from: https://github.com/lucidrains/flamingo-pytorch
-"""
-from typing import Optional, Tuple
-
-import torch
-import torch.nn as nn
-from einops import rearrange, repeat
-
-
-class PerceiverResampler(nn.Module):
-    def __init__(self, config, embed_dim: int, depth: int, n_heads: int, head_dim: int, n_latents: int) -> None:
-        """
-        Instantiates a Perceiver Resampler that operates over a sequence of embeddings (say from a ResNet or ViT or
-        MAE) of a given dimension, performs `depth` blocks of cross-attention with a fixed `n_latents` inputs, then
-        returns a Tensor of shape [bsz, n_latents, embed_dim].
-        :param embed_dim: Dimensionality of embeddings being fed to the Perceiver Resampler (also dimensionality of
-                          latent embeddings *returned* by the Perceiver Resampler. Could be e.g., VIT embed_dim, ResNet
-                          pool dim, and so on.
-        :param depth: Depth of the Perceiver Resampler (Transformer w/ cross attention). Should be shallow (< 3).
-        :param n_heads: Number of heads in each Transformer block (for multi-headed self-attention).
-        :param head_dim: Dimensionality of each head projection in the Transformer block.
-        :param n_latents: Number of latent embeddings to resample ("compress") the input sequence to (usually < 128).
-        """
-        super().__init__()
-        self.embed_dim, self.n_heads, self.head_dim, self.n_latents = embed_dim, n_heads, head_dim, n_latents
-        self.qk_layer_norms = config.qk_layer_norms_perceiver
-
-        # Create Latents for Perceiver
-        self.latents = nn.Parameter(torch.randn(self.n_latents, self.embed_dim), requires_grad=True)
-
-        self.intermediate_dim = (
-            self.embed_dim * 4 if not hasattr(config, "vision_embed_dim") else config.vision_embed_dim * 4
-        )
-        # Create Transformer Blocks
-        self.blocks = nn.ModuleList(
-            [
-                nn.ModuleList(
-                    [
-                        PerceiverAttention(self.embed_dim, self.n_heads, self.head_dim, self.qk_layer_norms),
-                        MLP(self.intermediate_dim, config),
-                    ]
-                )
-                for _ in range(depth)
-            ]
-        )
-        self.layer_norm = nn.LayerNorm(self.embed_dim)
-
-    def forward(self, context: torch.Tensor) -> torch.Tensor:
-        """Resample arbitrary length context & *compress* down to self.n_latents latent embeddings"""
-        latents = repeat(self.latents, "seq embed -> bsz seq embed", bsz=context.shape[0])
-
-        # Feed through Perceiver Attention blocks...
-        for attn, ff in self.blocks:
-            latents = attn(context, latents) + latents
-            latents = ff(latents) + latents
-
-        return self.layer_norm(latents)
-
-
-class PerceiverAttention(nn.Module):
-    def __init__(self, embed_dim: int, n_heads: int, head_dim: int, qk_layer_norms: bool) -> None:
-        """Perceiver Cross-Attention Module --> let long-form inputs be `context`, resampled embeddings be `latents`"""
-        super().__init__()
-        self.embed_dim, self.n_heads, self.head_dim = embed_dim, n_heads, head_dim
-        self.qk_layer_norms = qk_layer_norms
-        # Normalization & Scaling
-        self.context_layer_norm = nn.LayerNorm(self.embed_dim)
-        self.latents_layer_norm = nn.LayerNorm(self.embed_dim)
-        if self.qk_layer_norms:
-            self.q_layer_norm = nn.LayerNorm(self.head_dim)
-            self.k_layer_norm = nn.LayerNorm(self.head_dim)
-
-        self.qk_scale = self.head_dim**-0.5
-
-        # Q, K, V Projection (no bias -- detail from Perceiver/Flamingo Papers).
-        self.q_proj = nn.Linear(self.embed_dim, self.n_heads * self.head_dim, bias=False)
-        self.k_proj = nn.Linear(self.embed_dim, self.n_heads * self.head_dim, bias=False)
-        self.v_proj = nn.Linear(self.embed_dim, self.n_heads * self.head_dim, bias=False)
-
-        self.output_proj = nn.Linear(self.n_heads * self.head_dim, embed_dim, bias=False)
-
-    def forward(self, context: torch.Tensor, latents: torch.Tensor) -> torch.Tensor:
-        """
-        Runs Perceiver Self-Attention, with special (context, latents) appended along the `seq` dimension!
-        :param context: Tensor of shape [bsz, seq, embed_dim] representing long-form context to resample.
-        :param latents: Tensor of shape [bsz, n_latents, embed_dim] representing fixed length latents to compress to.
-        :return: Tensor of shape [bsz, n_latents, embed_dim] representing attention over latents w/ cross from context.
-        """
-        context = self.context_layer_norm(context)
-        latents = self.latents_layer_norm(latents)
-
-        # Query, Key, Value Projections --> Note that in Flamingo, latents are *concatenated* with context prior to attn!
-        #   Note: This results in queries w/ `seq = n_latents`, and keys, values with `seq = len(context) + n_latents`
-        q = self.q_proj(latents)
-        k = self.k_proj(torch.cat([context, latents], dim=-2))
-        v = self.v_proj(torch.cat([context, latents], dim=-2))
-
-        # Multiheaded Self-Attention w/ stable softmax (subtract per-row max -- `amax` -- before softmax call)
-        #   =>> `attn` should be a 2D matrix of shape [n_latents x (context + n_latents)]
-        q, k, v = [rearrange(x, "bsz seq (heads embed) -> bsz heads seq embed", heads=self.n_heads) for x in (q, k, v)]
-        if self.qk_layer_norms:
-            q = self.q_layer_norm(q)
-            k = self.k_layer_norm(k)
-
-        scores = torch.einsum("... i d, ... j d -> ... i j", q * self.qk_scale, k)
-        stabilized_scores = scores - (scores.amax(dim=-1, keepdim=True).detach())
-        attn = stabilized_scores.softmax(dim=-1)
-
-        # Attend & project back to output...
-        resampled = torch.einsum("... i j, ... j d -> ... i d", attn, v)
-        return self.output_proj(
-            rearrange(resampled, "bsz heads seq embed -> bsz seq (heads embed)", heads=self.n_heads)
-        )
-
-
-class MLP(nn.Module):
-    def __init__(self, intermediate_size, config):
-        """Simple MLP block with intermediate_size and embedding size"""
-        super().__init__()
-        self.embed_dim = config.vision_embed_dim
-        self.ln = nn.LayerNorm(self.embed_dim)
-        self.fc = nn.Linear(self.embed_dim, intermediate_size, bias=False)
-        self.act = nn.ReLU()
-        self.c_proj = nn.Linear(intermediate_size, self.embed_dim, bias=False)
-
-    def forward(self, hidden_states: Optional[Tuple[torch.FloatTensor]]) -> torch.FloatTensor:
-        hidden_states = self.ln(hidden_states)
-        hidden_states = self.fc(hidden_states)
-        hidden_states = self.act(hidden_states)
-        hidden_states = self.c_proj(hidden_states)
-
-        return hidden_states
diff --git a/m4/models/vbloom/__init__.py b/m4/models/vbloom/__init__.py
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/m4/models/vbloom/configuration_vbloom.py b/m4/models/vbloom/configuration_vbloom.py
deleted file mode 100644
index 24e038f2be7ce2f8521f140098b1f5415b708100..0000000000000000000000000000000000000000
--- a/m4/models/vbloom/configuration_vbloom.py
+++ /dev/null
@@ -1,235 +0,0 @@
-# coding=utf-8
-# Copyright 2022 the Big Science Workshop and HuggingFace Inc. team.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" VBloom configuration"""
-import os
-from typing import Tuple, Union
-
-from transformers import AutoConfig
-from transformers.configuration_utils import PretrainedConfig
-from transformers.utils import logging
-
-
-logger = logging.get_logger(__name__)
-
-BLOOM_PRETRAINED_CONFIG_ARCHIVE_MAP = {
-    "bigscience/bloom": "https://huggingface.co/bigscience/bloom/resolve/main/config.json",
-    "bigscience/bloom-560m": "https://huggingface.co/bigscience/bloom-560m/blob/main/config.json",
-    "bigscience/bloom-1b1": "https://huggingface.co/bigscience/bloom-1b1/blob/main/config.json",
-    "bigscience/bloom-1b7": "https://huggingface.co/bigscience/bloom-1b7/blob/main/config.json",
-    "bigscience/bloom-3b": "https://huggingface.co/bigscience/bloom-3b/blob/main/config.json",
-    "bigscience/bloom-7b1": "https://huggingface.co/bigscience/bloom-7b1/blob/main/config.json",
-}
-
-
-class VBloomConfig(PretrainedConfig):
-    """
-    This is the configuration class to store the configuration of a [`BloomModel`]. It is used to instantiate a Bloom
-    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
-    defaults will yield a similar configuration to the Bloom architecture
-    [bigscience/bloom](https://huggingface.co/bigscience/bloom).
-
-    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
-    documentation from [`PretrainedConfig`] for more information.
-
-    TODO: this doc is completely out of sync with the actual args
-
-    Args:
-        vocab_size (`int`, *optional*, defaults to 50257):
-            Vocabulary size of the Bloom model. Defines the number of different tokens that can be represented by the
-            `inputs_ids` passed when calling [`BloomModel`].
-        additional_vocab_size (`int`, *optional`, defaults to 0):
-            Additional vocabulary size of the model, typically for the special "<img>" token. Additional vocab tokens
-            are always trainable whereas regular vocab tokens can be frozen or not.
-        hidden_size (`int`, *optional*, defaults to 768):
-            Dimensionality of the embeddings and hidden states.
-        n_layer (`int`, *optional*, defaults to 12):
-            Number of hidden layers in the Transformer encoder.
-        n_head (`int`, *optional*, defaults to 12):
-            Number of attention heads for each attention layer in the Transformer encoder.
-        attn_pdrop (`float`, *optional*, defaults to 0.1):
-            The dropout ratio for the attention.
-        layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
-            The epsilon to use in the layer normalization layers.
-        initializer_range (`float`, *optional*, defaults to 0.02):
-            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        alpha_initializer (`str`, *optional*, defaults to `"ones"`):
-            Initialization type for the alphas.
-        alphas_initializer_range (`float`, *optional*, defaults to 0.0):
-            The standard deviation of the truncated_normal_initializer for initializing the alphas in the Gated Cross Attention.
-        alpha_type (`str`, *optional*, defaults to `"vector"`):
-            Whether the gating alphas should be vectors or single floats.
-        apply_residual_connection_post_layernorm (`bool`, *optional*, defaults to `False`):
-            If enabled, use the layer norm of the hidden states as the residual in the transformer blocks
-        skip_bias_add (`bool`, *optional*, defaults to `True`):
-            If set to `True`, it will skip bias add for each linear layer in the transformer blocks
-        skip_bias_add_qkv (`bool`, *optional*, defaults to `False`):
-            If set to `True`, it will skip bias add for the first linear layer in the transformer blocks
-        hidden_dropout (`float`, *optional*, defaults to 0.1):
-            Dropout rate of the dropout function on the bias dropout.
-        attention_dropout (`float`, *optional*, defaults to 0.1):
-            Dropout rate applied to the attention probs
-        use_cache (`bool`, *optional*, defaults to `True`):
-            Whether or not the model should return the last key/values attentions (not used by all models).
-        pretraining_tp (`int`, *optional*, defaults to `1`):
-            Experimental feature. Tensor parallelism rank used during pretraining with Megatron. Please refer to [this
-            document](https://huggingface.co/docs/transformers/parallelism) to understand more about it. This value is
-            necessary to ensure exact reproducibility of the pretraining results. Please refer to [this
-            issue](https://github.com/pytorch/pytorch/issues/76232). Note also that this is enabled only when
-            `slow_but_exact=True`.
-        slow_but_exact (`bool`, *optional*, defaults to `False`):
-            Experimental feature. Whether to use slow but exact implementation of the attention mechanism. While
-            merging the TP rank tensors, due to slicing operations the results may be slightly different between the
-            model trained on Megatron and our model. Please refer to [this
-            issue](https://github.com/pytorch/pytorch/issues/76232). A solution to obtain more accurate results is to
-            enable this feature. Enabling this will hurt the computational time of the inference. Will be probably
-            resolved in the future once the main model has been fine-tuned with TP_rank=1.
-
-    Example:
-
-    ```python
-    >>> from transformers import BloomModel, BloomConfig
-
-    >>> # Initializing a Bloom configuration
-    >>> configuration = BloomConfig()
-
-    >>> # Initializing a model from the configuration
-    >>> model = BloomModel(configuration)
-
-    >>> # Accessing the model configuration
-    >>> configuration = model.config
-    ```"""
-
-    model_type = "vbloom"
-    keys_to_ignore_at_inference = ["past_key_values"]
-    attribute_map = {
-        "num_hidden_layers": "n_layer",
-        "num_attention_heads": "n_head",
-    }
-
-    def __init__(
-        self,
-        vocab_size=250880,
-        additional_vocab_size=0,
-        hidden_size=64,
-        n_layer=2,
-        n_head=8,
-        layer_norm_epsilon=1e-5,
-        initializer_range=0.02,
-        alpha_initializer="ones",
-        alphas_initializer_range=0.0,
-        alpha_type="vector",
-        use_cache=False,
-        bos_token_id=1,
-        eos_token_id=2,
-        apply_residual_connection_post_layernorm=False,
-        hidden_dropout=0.0,
-        attention_dropout=0.0,
-        pretraining_tp=1,  # TP rank used when training with megatron
-        slow_but_exact=False,
-        cross_layer_interval=1,
-        tie_word_embeddings=False,
-        freeze_text_layers=True,
-        freeze_lm_head=False,
-        freeze_vision_layers=True,
-        vision_model_name="google/vit-base-patch16-224",
-        vision_model_params="{}",
-        vision_embed_dim=768,
-        image_token_index=250880,
-        use_resampler=False,
-        resampler_n_latents=64,
-        resampler_depth=6,
-        resampler_n_heads=16,
-        resampler_head_dim=96,
-        **kwargs,
-    ):
-        self.vocab_size = vocab_size
-        self.additional_vocab_size = additional_vocab_size
-        # Backward compatibility with n_embed kwarg
-        n_embed = kwargs.pop("n_embed", None)
-        self.hidden_size = hidden_size if n_embed is None else n_embed
-        self.n_layer = n_layer
-        self.n_head = n_head
-        self.layer_norm_epsilon = layer_norm_epsilon
-        self.initializer_range = initializer_range
-        self.alpha_initializer = alpha_initializer
-        self.alphas_initializer_range = alphas_initializer_range
-        self.alpha_type = alpha_type
-        self.use_cache = use_cache
-        self.pretraining_tp = pretraining_tp
-        self.apply_residual_connection_post_layernorm = apply_residual_connection_post_layernorm
-        self.hidden_dropout = hidden_dropout
-        self.attention_dropout = attention_dropout
-
-        self.bos_token_id = bos_token_id
-        self.eos_token_id = eos_token_id
-        self.slow_but_exact = slow_but_exact
-
-        self.cross_layer_interval = cross_layer_interval
-        self.freeze_vision_layers = freeze_vision_layers
-        self.vision_model_name = vision_model_name
-        self.vision_model_params = vision_model_params
-
-        self.tie_word_embeddings = tie_word_embeddings
-        self.freeze_text_layers = freeze_text_layers
-        self.freeze_lm_head = freeze_lm_head
-        self.image_token_index = image_token_index
-
-        self.vision_embed_dim = vision_embed_dim
-
-        # Resampler params
-        self.use_resampler = use_resampler
-        self.resampler_n_latents = resampler_n_latents
-        self.resampler_depth = resampler_depth
-        self.resampler_n_heads = resampler_n_heads
-        self.resampler_head_dim = resampler_head_dim
-
-        # IMPORTANT: Do not do any __init__ args-based checks in the constructor, since
-        # PretrainedConfig.from_dict first instantiates the class with the config dict and only then
-        # updates the config object with `kwargs` from from_pretrained, so during the instantiation
-        # of this object many attributes have default values and haven't yet been overridden.
-        # Do any required checks inside `from_pretrained` once the superclass' `from_pretrained` was run.
-
-        super().__init__(
-            bos_token_id=bos_token_id, eos_token_id=eos_token_id, tie_word_embeddings=tie_word_embeddings, **kwargs
-        )
-
-    def check_compatibilities(self):
-        if self.tie_word_embeddings and (self.freeze_text_layers != self.freeze_lm_head):
-            raise ValueError(
-                "if `tie_word_embeddings` is True, then `freeze_lm_head` and `freeze_text_layers` must be equal."
-            )
-
-        vision_model_params = eval(self.vision_model_params)
-        config = AutoConfig.from_pretrained(self.vision_model_name, **vision_model_params)
-        if hasattr(config, "vision_config"):
-            vison_config = config.vision_config
-        else:
-            vison_config = config
-        vision_embed_dim = vison_config.hidden_size
-        if self.vision_embed_dim != vision_embed_dim:
-            raise ValueError(
-                f"vision_embed_dim ({self.vision_embed_dim}) must match the hidden size of the vision model"
-                f" ({vision_embed_dim})"
-            )
-
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
-        outputs = super(VBloomConfig, cls).from_pretrained(pretrained_model_name_or_path, **kwargs)
-        if isinstance(outputs, Tuple):
-            # When called with return_unused_kwargs=True, the first item will be the config
-            outputs[0].check_compatibilities()
-        else:
-            outputs.check_compatibilities()
-        return outputs
diff --git a/m4/models/vbloom/modeling_vbloom.py b/m4/models/vbloom/modeling_vbloom.py
deleted file mode 100644
index 9a8efd49696168c61f7ae1bbf188b75a6277c2fe..0000000000000000000000000000000000000000
--- a/m4/models/vbloom/modeling_vbloom.py
+++ /dev/null
@@ -1,1396 +0,0 @@
-# coding=utf-8
-# Copyright 2022 HuggingFace Inc. team and BigScience workshop.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""PyTorch BLOOM model."""
-
-import math
-import warnings
-from typing import Optional, Tuple, Union
-
-import torch
-import torch.utils.checkpoint
-from torch import nn
-from torch.nn import CrossEntropyLoss, LayerNorm
-from torch.nn import functional as F
-from transformers.file_utils import (
-    add_code_sample_docstrings,
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-)
-from transformers.modeling_outputs import BaseModelOutputWithPastAndCrossAttentions, CausalLMOutputWithCrossAttentions
-
-from m4.models import DecoupledEmbedding, DecoupledLinear
-from m4.models.common import (
-    expand_inputs_for_generation,
-    prepare_inputs_for_generation,
-    update_model_kwargs_for_generation,
-)
-from m4.models.custom_modules import VLOOMPreTrainedModelBase
-from m4.models.perceiver.perceiver import PerceiverResampler
-from m4.models.vbloom.configuration_vbloom import VBloomConfig
-from m4.training.utils import (
-    compute_perceiver_tflops_per_batch_per_gpu,
-    compute_tflops_per_batch_per_gpu,
-    freeze_model,
-)
-from m4.utils import logging
-
-
-logger = logging.get_logger(__name__)
-
-_CHECKPOINT_FOR_DOC = "bigscience/bloom-560m"
-_CONFIG_FOR_DOC = "VBloomConfig"
-_TOKENIZER_FOR_DOC = "BloomTokenizerFast"
-
-BLOOM_PRETRAINED_MODEL_ARCHIVE_LIST = [
-    "bigscience/bigscience-small-testing",
-    "bigscience/bloom-560m",
-    "bigscience/bloom-1b1",
-    "bigscience/bloom-1b7",
-    "bigscience/bloom-3b",
-    "bigscience/bloom-7b1",
-    "bigscience/bloom",
-]
-
-
-def _make_causal_mask(
-    input_ids_shape: torch.Size, device: torch.device, past_key_values_length: int
-) -> torch.BoolTensor:
-    """
-    Make causal mask used for self-attention.
-    """
-    batch_size, target_length = input_ids_shape
-    mask = torch.empty((target_length, target_length + past_key_values_length), dtype=torch.bool, device=device)
-    # ONNX doesn't support `torch.Tensor.triu` properly, thus we use this workaround
-    seq_ids = torch.arange(target_length, device=device)
-    mask[:, past_key_values_length:] = seq_ids[:, None] < seq_ids[None, :]
-
-    if past_key_values_length > 0:
-        mask[:, :past_key_values_length] = False
-
-    expanded_mask = mask[None, None, :, :].expand(batch_size, 1, target_length, target_length + past_key_values_length)
-    return expanded_mask
-
-
-def _expand_mask(mask: torch.Tensor, tgt_length: int) -> torch.BoolTensor:
-    """
-    Expands attention_mask from `[batch_size, src_length]` to `[batch_size, 1, tgt_length, src_length]`.
-    """
-    batch_size, src_length = mask.shape
-    tgt_length = tgt_length if tgt_length is not None else src_length
-
-    expanded_mask = ~(mask[:, None, None, :].to(torch.bool))
-    return expanded_mask.expand(batch_size, 1, tgt_length, src_length)
-
-
-def build_alibi_tensor(attention_mask: torch.Tensor, num_heads: int, dtype: torch.dtype) -> torch.Tensor:
-    """
-    Link to paper: https://arxiv.org/abs/2108.12409 Alibi tensor is not causal as the original paper mentions, it
-    relies on a translation invariance of softmax for quick implementation: with l being a tensor, and a fixed value
-    `softmax(l+a) = softmax(l)`. Based on
-    https://github.com/ofirpress/attention_with_linear_biases/blob/a35aaca144e0eb6b789dfcb46784c4b8e31b7983/fairseq/models/transformer.py#L742
-    TODO @thomasw21 this doesn't work as nicely due to the masking strategy, and so masking varies slightly.
-
-    Args:
-    Returns tensor shaped (batch_size * num_heads, 1, max_seq_len)
-        attention_mask (`torch.Tensor`):
-            Token-wise attention mask, this should be of shape (batch_size, max_seq_len).
-        num_heads (`int`, *required*):
-            number of heads
-        dtype (`torch.dtype`, *optional*, default=`torch.bfloat16`):
-            dtype of the output tensor
-    """
-    batch_size, seq_length = attention_mask.shape
-    closest_power_of_2 = 2 ** math.floor(math.log2(num_heads))
-    base = torch.tensor(
-        2 ** (-(2 ** -(math.log2(closest_power_of_2) - 3))), device=attention_mask.device, dtype=torch.float32
-    )
-    powers = torch.arange(1, 1 + closest_power_of_2, device=attention_mask.device, dtype=torch.int32)
-    slopes = torch.pow(base, powers)
-
-    if closest_power_of_2 != num_heads:
-        extra_base = torch.tensor(
-            2 ** (-(2 ** -(math.log2(2 * closest_power_of_2) - 3))), device=attention_mask.device, dtype=torch.float32
-        )
-        num_remaining_heads = min(closest_power_of_2, num_heads - closest_power_of_2)
-        extra_powers = torch.arange(1, 1 + 2 * num_remaining_heads, 2, device=attention_mask.device, dtype=torch.int32)
-        slopes = torch.cat([slopes, torch.pow(extra_base, extra_powers)], dim=0)
-
-    # Note: alibi will added to the attention bias that will be applied to the query, key product of attention
-    # => therefore alibi will have to be of shape (batch_size, num_heads, query_length, key_length)
-    # => here we set (batch_size=1, num_heads=num_heads, query_length=1, key_length=max_length)
-    # => the query_length dimension will then be broadcasted correctly
-    # This is more or less identical to T5's relative position bias:
-    # https://github.com/huggingface/transformers/blob/f681437203baa7671de3174b0fa583c349d9d5e1/src/transformers/models/t5/modeling_t5.py#L527
-    arange_tensor = ((attention_mask.cumsum(dim=-1) - 1) * attention_mask)[:, None, :]
-    alibi = slopes[..., None] * arange_tensor
-    return alibi.reshape(batch_size * num_heads, 1, seq_length).to(dtype)
-
-
-def dropout_add(x: torch.Tensor, residual: torch.Tensor, prob: float, training: bool) -> torch.Tensor:
-    """
-    Dropout add function
-
-    Args:
-        x (`torch.tensor`, *required*):
-            input tensor
-        residual (`torch.tensor`, *required*):
-            esidual tensor
-        prob (`float`, *required*):
-            dropout probability
-        training (`bool`, *required*):
-            training mode
-    """
-    out = F.dropout(x, p=prob, training=training)
-    out = residual + out
-    return out
-
-
-def bloom_gelu_forward(x: torch.Tensor) -> torch.Tensor:
-    """
-    Custom bias GELU function. Adapted from Megatron-DeepSpeed code. Here we use a simple implementation (inference) to
-    make the model jitable.
-
-    Args:
-        x (`torch.tensor`, *required*):
-            input hidden states
-    """
-    return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)))
-
-
-def bloom_gelu_back(g: torch.Tensor, x: torch.Tensor) -> torch.Tensor:
-    """
-    gradient of tanh approximation of gelu gradient of actual gelu is: 0.5 * (1. + torch.erf(x * 0.70710678)) +
-    0.3989423 * x * torch.exp(-0.5 * x * x)
-
-    Args:
-        g (`torch.tensor`, *required*):
-            gradient output tensor
-        x (`torch.tensor`, *required*):
-            input tensor
-    """
-    x = x[0]  # x is a tuple of 1 element, needs to unpack it first
-    tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))
-    # sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243
-    ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out)
-    return ff * g
-
-
-class GeLUFunction(torch.autograd.Function):
-    @staticmethod
-    def forward(ctx, input: torch.Tensor) -> torch.Tensor:
-        ctx.save_for_backward(input)
-        return bloom_gelu_forward(input)
-
-    @staticmethod
-    def backward(ctx, grad_output: torch.Tensor) -> torch.Tensor:
-        input = ctx.saved_tensors
-        tmp = bloom_gelu_back(grad_output, input)
-        return tmp
-
-
-class BloomGelu(nn.Module):
-    """
-    BloomBiasGelu wrapper function that make use of the simple function on inference mode to make the model
-    torchscriptable and use the autograd function in training mode to get the accurate results of the gradients Partly
-    copied from Megatron-DeepSpeed code and adapted for our needs
-
-    See here why autograd functions are not torchscriptable: https://github.com/pytorch/pytorch/issues/22329
-    """
-
-    def __init__(self):
-        super().__init__()
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        if self.training:
-            return GeLUFunction.apply(x)
-        else:
-            return bloom_gelu_forward(x)
-
-
-class BloomAttention(nn.Module):
-    def __init__(self, config: VBloomConfig, is_cross_attention=False):
-        super().__init__()
-
-        self.pretraining_tp = config.pretraining_tp
-        self.slow_but_exact = config.slow_but_exact
-
-        self.hidden_size = config.hidden_size
-        self.num_heads = config.n_head
-        self.head_dim = self.hidden_size // self.num_heads
-        self.split_size = self.hidden_size
-        self.hidden_dropout = config.hidden_dropout
-
-        if self.head_dim * self.num_heads != self.hidden_size:
-            raise ValueError(
-                f"`hidden_size` must be divisible by num_heads (got `hidden_size`: {self.hidden_size} and `num_heads`:"
-                f" {self.num_heads})."
-            )
-
-        # Layer-wise attention scaling
-        self.inv_norm_factor = 1.0 / math.sqrt(self.head_dim)
-        self.beta = 1.0
-
-        self.is_cross_attention = is_cross_attention
-
-        if self.is_cross_attention:
-            self.query = nn.Linear(self.hidden_size, 1 * self.hidden_size, bias=True)
-            kv_input_dim = self.hidden_size if not hasattr(config, "vision_embed_dim") else config.vision_embed_dim
-            self.key_value = nn.Linear(kv_input_dim, 2 * self.hidden_size, bias=True)
-        else:
-            self.query_key_value = nn.Linear(self.hidden_size, 3 * self.hidden_size, bias=True)
-
-        self.dense = nn.Linear(self.hidden_size, self.hidden_size)
-        self.attention_dropout = nn.Dropout(config.attention_dropout)
-
-        if self.is_cross_attention:
-            # The alpha stuff
-            self.act = nn.Tanh()
-
-            if config.alpha_initializer == "zeros":
-                if config.alpha_type == "vector":
-                    self.alpha_cross_attn = nn.Parameter(torch.zeros(1, 1, self.hidden_size))
-                elif config.alpha_type == "float":
-                    self.alpha_cross_attn = nn.Parameter(torch.zeros(1))
-                else:
-                    raise ValueError(f"Unknown value for `alpha_type` ({config.alpha_type})")
-
-            elif config.alpha_initializer == "ones":
-                if config.alpha_type == "vector":
-                    self.alpha_cross_attn = nn.Parameter(torch.ones(1, 1, self.hidden_size))
-                elif config.alpha_type == "float":
-                    self.alpha_cross_attn = nn.Parameter(torch.ones(1))
-                else:
-                    raise ValueError(f"Unknown value for `alpha_type` ({config.alpha_type})")
-
-            elif config.alpha_initializer in {"normal", "gaussian", "random"}:
-                if config.alpha_type == "vector":
-                    self.alpha_cross_attn = nn.Parameter(
-                        torch.normal(mean=0.0, std=config.alphas_initializer_range, size=(1, 1, self.hidden_size))
-                    )
-                elif config.alpha_type == "float":
-                    self.alpha_cross_attn = nn.Parameter(
-                        torch.normal(mean=0.0, std=config.alphas_initializer_range, size=(1))
-                    )
-                else:
-                    raise ValueError(f"Unknown value for `alpha_type` ({config.alpha_type})")
-
-            else:
-                raise NotImplementedError(
-                    f"Alpha initialization scheme {config.alpha_initializer} not yet implemented!"
-                )
-
-    def _split_heads(self, fused_qkv: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        """
-        Split the last dimension into (num_heads, head_dim) without making any copies, results share same memory
-        storage as `fused_qkv`
-
-        Args:
-            fused_qkv (`torch.tensor`, *required*): [batch_size, seq_length, num_heads * 3 * head_dim]
-
-        Returns:
-            query: [batch_size, seq_length, num_heads, head_dim] key: [batch_size, seq_length, num_heads, head_dim]
-            value: [batch_size, seq_length, num_heads, head_dim]
-        """
-        batch_size, seq_length, n_times_hidden_size = fused_qkv.shape
-        n = int(n_times_hidden_size / self.hidden_size)
-        fused_qkv = fused_qkv.view(batch_size, seq_length, self.num_heads, n, self.head_dim)
-        outputs = ()
-        for i in range(n):
-            outputs += (fused_qkv[..., i, :],)
-        return outputs
-
-    def _merge_heads(self, x: torch.Tensor) -> torch.Tensor:
-        """
-        Merge heads together over the last dimenstion
-
-        Args:
-            x: (`torch.tensor`, *required*): [batch_size * num_heads, seq_length, head_dim]
-
-        Returns:
-            torch.tensor: [batch_size, seq_length, num_heads * head_dim]
-        """
-        # What we want to achieve is:
-        # batch_size * num_heads, seq_length, head_dim -> batch_size, seq_length, num_heads * head_dim
-        batch_size_and_num_heads, seq_length, _ = x.shape
-        batch_size = batch_size_and_num_heads // self.num_heads
-
-        # First view to decompose the batch size
-        # batch_size * num_heads, seq_length, head_dim -> batch_size, num_heads, seq_length, head_dim
-        x = x.view(batch_size, self.num_heads, seq_length, self.head_dim)
-
-        # batch_size, num_heads, seq_length, head_dim -> batch_size, seq_length, num_heads, head_dim
-        x = x.permute(0, 2, 1, 3)
-
-        # batch_size, seq_length, num_heads, head_dim -> batch_size, seq_length, num_heads * head_dim
-        return x.reshape(batch_size, seq_length, self.num_heads * self.head_dim)
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        residual: torch.Tensor,
-        alibi: torch.Tensor,
-        attention_mask: torch.Tensor,
-        layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
-        head_mask: Optional[torch.Tensor] = None,
-        encoder_hidden_states: Optional[torch.Tensor] = None,
-        encoder_attention_mask: Optional[torch.FloatTensor] = None,
-        use_cache: bool = False,
-        output_attentions: bool = False,
-    ):
-        if not self.is_cross_attention:
-            fused_qkv = self.query_key_value(hidden_states)  # [batch_size, seq_length, 3 x hidden_size]
-
-            # 3 x [batch_size, seq_length, num_heads, head_dim]
-            (query_layer, key_layer, value_layer) = self._split_heads(fused_qkv)
-        else:
-            if encoder_hidden_states is not None:
-                attention_mask = encoder_attention_mask
-            q = self.query(hidden_states)
-            kv = self.key_value(encoder_hidden_states)
-
-            query_layer = self._split_heads(q)[0]
-            key_layer, value_layer = self._split_heads(kv)
-
-        batch_size, q_length, _, _ = query_layer.shape
-        _, kv_length, _, _ = key_layer.shape
-
-        query_layer = query_layer.transpose(1, 2).reshape(batch_size * self.num_heads, q_length, self.head_dim)
-        key_layer = key_layer.permute(0, 2, 3, 1).reshape(batch_size * self.num_heads, self.head_dim, kv_length)
-        value_layer = value_layer.transpose(1, 2).reshape(batch_size * self.num_heads, kv_length, self.head_dim)
-        if layer_past is not None:
-            past_key, past_value = layer_past
-            # concatenate along seq_length dimension:
-            #  - key: [batch_size * self.num_heads, head_dim, kv_length]
-            #  - value: [batch_size * self.num_heads, kv_length, head_dim]
-            key_layer = torch.cat((past_key, key_layer), dim=2)
-            value_layer = torch.cat((past_value, value_layer), dim=1)
-            _, _, kv_length = key_layer.shape
-
-        if use_cache is True:
-            present = (key_layer, value_layer)
-        else:
-            present = None
-
-        # [batch_size * num_heads, q_length, kv_length]
-        # we use `torch.Tensor.baddbmm` instead of `torch.baddbmm` as the latter isn't supported by TorchScript v1.11
-        if alibi is None:
-            alibi = torch.empty(
-                batch_size * self.num_heads, q_length, kv_length, dtype=query_layer.dtype, device=query_layer.device
-            )
-
-        matmul_result = alibi.baddbmm(
-            batch1=query_layer,
-            batch2=key_layer,
-            beta=0.0 if self.is_cross_attention else self.beta,
-            alpha=self.inv_norm_factor,
-        )
-
-        # change view to [batch_size, num_heads, q_length, kv_length]
-        attention_scores = matmul_result.view(batch_size, self.num_heads, q_length, kv_length)
-
-        # cast attention scores to fp32, compute scaled softmax and cast back to initial dtype - [batch_size, num_heads, q_length, kv_length]
-        input_dtype = attention_scores.dtype
-        # `float16` has a minimum value of -65504.0, whereas `bfloat16` and `float32` have a minimum value of `-3.4e+38`
-        if input_dtype == torch.float16:
-            attention_scores = attention_scores.to(torch.float)
-        attn_weights = torch.masked_fill(attention_scores, attention_mask, torch.finfo(attention_scores.dtype).min)
-        attention_probs = F.softmax(attn_weights, dim=-1, dtype=torch.float32).to(input_dtype)
-
-        # [batch_size, num_heads, q_length, kv_length]
-        attention_probs = self.attention_dropout(attention_probs)
-
-        if head_mask is not None:
-            attention_probs = attention_probs * head_mask
-
-        # change view [batch_size x num_heads, q_length, kv_length]
-        attention_probs_reshaped = attention_probs.view(batch_size * self.num_heads, q_length, kv_length)
-
-        # matmul: [batch_size * num_heads, q_length, head_dim]
-        context_layer = torch.bmm(attention_probs_reshaped, value_layer)
-
-        # change view [batch_size, num_heads, q_length, head_dim]
-        context_layer = self._merge_heads(context_layer)
-
-        # aggregate results across tp ranks. See here: https://github.com/pytorch/pytorch/issues/76232
-        if self.pretraining_tp > 1 and self.slow_but_exact:
-            slices = self.hidden_size / self.pretraining_tp
-            output_tensor = torch.zeros_like(context_layer)
-            for i in range(self.pretraining_tp):
-                output_tensor = output_tensor + F.linear(
-                    context_layer[:, :, int(i * slices) : int((i + 1) * slices)],
-                    self.dense.weight[:, int(i * slices) : int((i + 1) * slices)],
-                )
-        else:
-            output_tensor = self.dense(context_layer)
-
-        if not self.is_cross_attention:
-            output_tensor = dropout_add(output_tensor, residual, self.hidden_dropout, self.training)
-        else:
-            output_tensor = dropout_add(
-                self.act(self.alpha_cross_attn) * output_tensor, residual, self.hidden_dropout, self.training
-            )
-
-        outputs = (output_tensor, present)
-        if output_attentions:
-            outputs += (attention_probs,)
-
-        return outputs
-
-
-class BloomMLP(nn.Module):
-    def __init__(self, config: VBloomConfig, is_gated=False):
-        super().__init__()
-        hidden_size = config.hidden_size
-
-        self.pretraining_tp = config.pretraining_tp
-        self.slow_but_exact = config.slow_but_exact
-        self.dense_h_to_4h = nn.Linear(hidden_size, 4 * hidden_size)
-        self.gelu_impl = BloomGelu()
-        self.dense_4h_to_h = nn.Linear(4 * hidden_size, hidden_size)
-        self.hidden_dropout = config.hidden_dropout
-
-        # The alpha stuff
-        self.is_gated = is_gated
-        if is_gated:
-            self.act = nn.Tanh()
-
-            if config.alpha_initializer == "zeros":
-                if config.alpha_type == "vector":
-                    self.alpha_dense = nn.Parameter(torch.zeros(1, 1, hidden_size))
-                elif config.alpha_type == "float":
-                    self.alpha_dense = nn.Parameter(torch.zeros(1))
-                else:
-                    raise ValueError(f"Unknown value for `alpha_type` ({config.alpha_type})")
-
-            elif config.alpha_initializer == "ones":
-                if config.alpha_type == "vector":
-                    self.alpha_dense = nn.Parameter(torch.ones(1, 1, hidden_size))
-                elif config.alpha_type == "float":
-                    self.alpha_dense = nn.Parameter(torch.ones(1))
-                else:
-                    raise ValueError(f"Unknown value for `alpha_type` ({config.alpha_type})")
-
-            elif config.alpha_initializer in {"normal", "gaussian", "random"}:
-                if config.alpha_type == "vector":
-                    self.alpha_dense = nn.Parameter(
-                        torch.normal(mean=0.0, std=config.alphas_initializer_range, size=(1, 1, hidden_size))
-                    )
-                elif config.alpha_type == "float":
-                    self.alpha_dense = nn.Parameter(
-                        torch.normal(mean=0.0, std=config.alphas_initializer_range, size=(1))
-                    )
-                else:
-                    raise ValueError(f"Unknown value for `alpha_type` ({config.alpha_type})")
-
-            else:
-                raise NotImplementedError(
-                    f"Alpha initialization scheme {config.alpha_initializer} not yet implemented!"
-                )
-
-    def forward(self, hidden_states: torch.Tensor, residual: torch.Tensor) -> torch.Tensor:
-        hidden_states = self.gelu_impl(self.dense_h_to_4h(hidden_states))
-
-        if self.pretraining_tp > 1 and self.slow_but_exact:
-            intermediate_output = torch.zeros_like(residual)
-            slices = self.dense_4h_to_h.weight.shape[-1] / self.pretraining_tp
-            for i in range(self.pretraining_tp):
-                intermediate_output = intermediate_output + F.linear(
-                    hidden_states[:, :, int(i * slices) : int((i + 1) * slices)],
-                    self.dense_4h_to_h.weight[:, int(i * slices) : int((i + 1) * slices)],
-                )
-        else:
-            intermediate_output = self.dense_4h_to_h(hidden_states)
-
-        if not self.is_gated:
-            output = dropout_add(intermediate_output, residual, self.hidden_dropout, self.training)
-        else:
-            output = dropout_add(
-                self.act(self.alpha_dense) * intermediate_output, residual, self.hidden_dropout, self.training
-            )
-
-        return output
-
-
-class BloomBlock(nn.Module):
-    def __init__(self, config: VBloomConfig):
-        super().__init__()
-        hidden_size = config.hidden_size
-
-        self.input_layernorm = LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
-        self.num_heads = config.n_head
-        self.self_attention = BloomAttention(config)
-        self.post_attention_layernorm = LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
-
-        self.mlp = BloomMLP(config)
-
-        self.apply_residual_connection_post_layernorm = config.apply_residual_connection_post_layernorm
-        self.hidden_dropout = config.hidden_dropout
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        alibi: torch.Tensor,
-        attention_mask: torch.Tensor,
-        layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
-        head_mask: Optional[torch.Tensor] = None,
-        use_cache: bool = False,
-        output_attentions: bool = False,
-    ):
-        # hidden_states: [batch_size, seq_length, hidden_size]
-
-        # Layer norm at the beginning of the transformer layer.
-        layernorm_output = self.input_layernorm(hidden_states)
-
-        # Layer norm post the self attention.
-        if self.apply_residual_connection_post_layernorm:
-            residual = layernorm_output
-        else:
-            residual = hidden_states
-
-        # Self attention.
-        attn_outputs = self.self_attention(
-            layernorm_output,
-            residual,
-            layer_past=layer_past,
-            attention_mask=attention_mask,
-            alibi=alibi,
-            head_mask=head_mask,
-            use_cache=use_cache,
-            output_attentions=output_attentions,
-        )
-
-        attention_output = attn_outputs[0]
-
-        outputs = attn_outputs[1:]
-
-        layernorm_output = self.post_attention_layernorm(attention_output)
-
-        # Get residual
-        if self.apply_residual_connection_post_layernorm:
-            residual = layernorm_output
-        else:
-            residual = attention_output
-
-        # MLP.
-        output = self.mlp(layernorm_output, residual)
-
-        if use_cache:
-            outputs = (output,) + outputs
-        else:
-            outputs = (output,) + outputs[1:]
-
-        return outputs  # hidden_states, present, attentions
-
-
-class VBloomGatedCrossAttentionBlock(nn.Module):
-    def __init__(self, config):
-        super().__init__()
-        hidden_size = config.hidden_size
-
-        self.input_layernorm = LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
-        self.num_heads = config.n_head
-        self.cross_attention = BloomAttention(config, is_cross_attention=True)
-        self.post_attention_layernorm = LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
-
-        self.gated_mlp = BloomMLP(config, is_gated=True)
-
-        self.apply_residual_connection_post_layernorm = config.apply_residual_connection_post_layernorm
-        self.hidden_dropout = config.hidden_dropout
-
-    def forward(
-        self,
-        hidden_states: Optional[Tuple[torch.FloatTensor]],
-        layer_past: Optional[Tuple[torch.Tensor]] = None,
-        attention_mask: Optional[torch.FloatTensor] = None,
-        head_mask: Optional[torch.FloatTensor] = None,
-        image_hidden_states: Optional[torch.Tensor] = None,
-        image_attention_mask: Optional[torch.FloatTensor] = None,
-        use_cache: Optional[bool] = False,
-        output_attentions: Optional[bool] = False,
-    ) -> Union[Tuple[torch.Tensor], Optional[Tuple[torch.Tensor, Tuple[torch.FloatTensor, ...]]]]:
-        # hidden_states: [batch_size, seq_length, hidden_size]
-
-        # Layer norm at the beginning of the transformer layer.
-        layernorm_output = self.input_layernorm(hidden_states)
-
-        # Layer norm post the self attention.
-        if self.apply_residual_connection_post_layernorm:
-            residual = layernorm_output
-        else:
-            residual = hidden_states
-
-        # Self attention.
-        attn_outputs = self.cross_attention(
-            layernorm_output,
-            residual,
-            alibi=None,
-            layer_past=layer_past,
-            attention_mask=attention_mask,
-            head_mask=head_mask,
-            encoder_hidden_states=image_hidden_states,
-            encoder_attention_mask=image_attention_mask,
-            use_cache=use_cache,
-            output_attentions=output_attentions,
-        )
-
-        attention_output = attn_outputs[0]
-
-        outputs = attn_outputs[1:]
-
-        layernorm_output = self.post_attention_layernorm(attention_output)
-
-        # Get residual
-        if self.apply_residual_connection_post_layernorm:
-            residual = layernorm_output
-        else:
-            residual = attention_output
-
-        # MLP.
-        output = self.gated_mlp(layernorm_output, residual)
-
-        if use_cache:
-            outputs = (output,) + outputs
-        else:
-            outputs = (output,) + outputs[1:]
-
-        return outputs  # hidden_states, present, attentions
-
-
-class VBloomPreTrainedModel(VLOOMPreTrainedModelBase):
-    _keys_to_ignore_on_load_missing = [r"h.*.self_attention.scale_mask_softmax.causal_mask", r"lm_head.weight"]
-    """
-    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
-    models.
-    """
-
-    config_class = VBloomConfig
-    base_model_prefix = "transformer"
-    supports_gradient_checkpointing = True
-    _no_split_modules = ["BloomBlock"]
-
-    def __init__(self, *inputs, **kwargs):
-        super().__init__(*inputs, **kwargs)
-
-    def _init_weights(self, module: nn.Module):
-        """Initialize the weights."""
-        if isinstance(module, nn.Linear):
-            # Slightly different from the TF version which uses truncated_normal for initialization
-            # cf https://github.com/pytorch/pytorch/pull/5617
-            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
-            if module.bias is not None:
-                module.bias.data.zero_()
-        elif isinstance(module, nn.Embedding):
-            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
-            if module.padding_idx is not None:
-                module.weight.data[module.padding_idx].zero_()
-        elif isinstance(module, LayerNorm):
-            module.bias.data.zero_()
-            module.weight.data.fill_(1.0)
-
-    def _set_gradient_checkpointing(self, module: nn.Module, value: bool = False):
-        if isinstance(module, VBloomModel):
-            module.gradient_checkpointing = value
-
-    @classmethod
-    def override_vision_model_wrapper(cls, model, config, vision_model_name, vision_model_params, torch_dtype):
-        # this can be called via from_pretrained from a class w/ head or w/o head so we extract the beheaded model version
-        beheaded_model = model.transformer if hasattr(model, "transformer") else model
-        cls.override_vision_model(beheaded_model, vision_model_name, vision_model_params, torch_dtype)
-        beheaded_model.freeze_relevant_params(config)
-
-
-BLOOM_START_DOCSTRING = r"""
-
-    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
-    library implements for all its model (such as downloading or saving, resizing the input embeddings etc.)
-
-    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
-    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
-    and behavior.
-
-    Parameters:
-        config ([`BloomConfig`]): Model configuration class with all the parameters of the model.
-            Initializing with a config file does not load the weights associated with the model, only the
-            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
-"""
-
-BLOOM_INPUTS_DOCSTRING = r"""
-    Args:
-        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
-            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values[0][0].shape[2]`
-            (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.
-
-            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
-            `input_ids`.
-
-            Indices can be obtained using [`BloomTokenizerFast`]. See [`PreTrainedTokenizer.encode`] and
-            [`PreTrainedTokenizer.__call__`] for details.
-
-            [What are input IDs?](../glossary#input-ids)
-        past_key_values (`Tuple[Tuple[torch.Tensor]]` of length `config.n_layers`):
-            Contains precomputed hidden-states (key and values in the attention blocks) as computed by the model (see
-            `past_key_values` output below). Can be used to speed up sequential decoding. The `input_ids` which have
-            their past given to this model should not be passed as `input_ids` as they have already been computed.
-
-            Each element of `past_key_values` is a tuple (past_key, past_value):
-            - past_key: [batch_size * num_heads, head_dim, kv_length]
-            - past_value: [batch_size * num_heads, kv_length, head_dim]
-        attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
-
-            - 1 for tokens that are **not masked**,
-            - 0 for tokens that are **masked**.
-
-            [What are attention masks?](../glossary#attention-mask)
-        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
-            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
-
-            - 1 indicates the head is **not masked**,
-            - 0 indicates the head is **masked**.
-
-        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
-            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
-            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
-            model's internal embedding lookup matrix.
-
-            If `past_key_values` is used, optionally only the last `inputs_embeds` have to be input (see
-            `past_key_values`).
-        use_cache (`bool`, *optional*):
-            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
-            `past_key_values`).
-        output_attentions (`bool`, *optional*):
-            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
-            tensors for more detail.
-        output_hidden_states (`bool`, *optional*):
-            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
-            more detail.
-        return_dict (`bool`, *optional*):
-            Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
-"""
-
-
-@add_start_docstrings(
-    "The bare Bloom Model transformer outputting raw hidden-states without any specific head on top.",
-    BLOOM_START_DOCSTRING,
-)
-class VBloomModel(VBloomPreTrainedModel):
-    def __init__(self, config: VBloomConfig, vision_model=None):
-        super().__init__(config)
-
-        self.embed_dim = config.hidden_size
-        self.num_heads = config.n_head
-
-        # Embedding + LN Embedding
-        self.word_embeddings = DecoupledEmbedding(
-            num_embeddings=config.vocab_size,
-            num_additional_embeddings=config.additional_vocab_size,
-            embedding_dim=self.embed_dim,
-            partially_freeze=config.freeze_text_layers,
-        )
-        self.word_embeddings_layernorm = LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
-
-        # Transformer blocks
-        self.h = nn.ModuleList([BloomBlock(config) for _ in range(config.num_hidden_layers)])
-
-        # Final Layer Norm
-        self.ln_f = LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
-
-        self.cross_layer_interval = config.cross_layer_interval
-        num_cross_layers = config.num_hidden_layers // self.cross_layer_interval
-        self.gated_cross_attn_layers = nn.ModuleList(
-            [VBloomGatedCrossAttentionBlock(config) for i in range(num_cross_layers)]
-        )
-
-        # Perceiver Resampler
-        if config.use_resampler:
-            self.perceiver_resampler = PerceiverResampler(
-                self.config,
-                self.config.vision_embed_dim,
-                config.resampler_depth,
-                config.resampler_n_heads,
-                config.resampler_head_dim,
-                config.resampler_n_latents,
-            )
-        self.gradient_checkpointing = False
-
-        # Load an uninitialized model and later in from_pretrained will load the pre-trained model -
-        # this solves the losing of weights in `from_pretrained` on the main model
-        self.vision_model = vision_model
-
-        # Initialize weights and apply final processing
-        self.post_init()
-
-        self.freeze_relevant_params(config)
-
-    def freeze_relevant_params(self, config=None):
-        if config is None:
-            config = self.config
-
-        if config.freeze_text_layers:
-            self.freeze_text_layers()
-
-        if config.freeze_vision_layers:
-            freeze_model(self.vision_model)
-
-    def freeze_text_layers(self):
-        for module in [self.word_embeddings_layernorm, self.h, self.ln_f]:
-            freeze_model(module)
-
-    def get_input_embeddings(self):
-        return self.word_embeddings
-
-    def _prepare_attn_mask(
-        self, attention_mask: torch.Tensor, input_shape: Tuple[int, int], past_key_values_length: int
-    ) -> torch.BoolTensor:
-        # create causal mask
-        # [batch_size, seq_length] -> [batch_size, 1, tgt_length, src_length]
-        combined_attention_mask = None
-        device = attention_mask.device
-        _, src_length = input_shape
-
-        if src_length > 1:
-            combined_attention_mask = _make_causal_mask(
-                input_shape, device=device, past_key_values_length=past_key_values_length
-            )
-
-        # [batch_size, seq_length] -> [batch_size, 1, tgt_length, src_length]
-        expanded_attn_mask = _expand_mask(attention_mask, tgt_length=src_length)
-        combined_attention_mask = (
-            expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask | combined_attention_mask
-        )
-
-        return combined_attention_mask
-
-    def set_input_embeddings(self, new_embeddings: torch.Tensor):
-        self.word_embeddings = new_embeddings
-
-    @add_start_docstrings_to_model_forward(BLOOM_INPUTS_DOCSTRING)
-    @add_code_sample_docstrings(
-        processor_class=_TOKENIZER_FOR_DOC,
-        checkpoint=_CHECKPOINT_FOR_DOC,
-        output_type=BaseModelOutputWithPastAndCrossAttentions,
-        config_class=_CONFIG_FOR_DOC,
-    )
-    def forward(
-        self,
-        input_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        head_mask: Optional[torch.LongTensor] = None,
-        inputs_embeds: Optional[torch.LongTensor] = None,
-        pixel_values: Optional[torch.FloatTensor] = None,
-        image_embeddings: Optional[torch.FloatTensor] = None,
-        image_attention_mask: Optional[torch.Tensor] = None,
-        crossblock_head_mask: Optional[torch.Tensor] = None,
-        use_cache: Optional[bool] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-        **deprecated_arguments,
-    ) -> Union[Tuple[torch.Tensor, ...], BaseModelOutputWithPastAndCrossAttentions]:
-        if deprecated_arguments.pop("position_ids", False) is not False:
-            # `position_ids` could have been `torch.Tensor` or `None` so defaulting pop to `False` allows to detect if users were passing explicitly `None`
-            warnings.warn(
-                (
-                    "`position_ids` have no functionality in BLOOM and will be removed in v5.0.0. You can safely"
-                    " ignore passing `position_ids`."
-                ),
-                FutureWarning,
-            )
-        if len(deprecated_arguments) > 0:
-            raise ValueError(f"Got unexpected arguments: {deprecated_arguments}")
-
-        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
-        output_hidden_states = (
-            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
-        )
-        use_cache = use_cache if use_cache is not None else self.config.use_cache
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
-        if input_ids is not None and inputs_embeds is not None:
-            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
-        elif input_ids is not None:
-            batch_size, seq_length = input_ids.shape
-        elif inputs_embeds is not None:
-            batch_size, seq_length, _ = inputs_embeds.shape
-        else:
-            raise ValueError("You have to specify either input_ids or inputs_embeds")
-
-        if past_key_values is None:
-            past_key_values = tuple([None] * len(self.h))
-
-        # Prepare head mask if needed
-        # 1.0 in head_mask indicate we keep the head
-        # attention_probs has shape batch_size x num_heads x N x N
-        # head_mask has shape n_layer x batch x num_heads x N x N
-        head_mask = self.get_head_mask(head_mask, self.config.n_layer)
-
-        if inputs_embeds is None:
-            inputs_embeds = self.word_embeddings(input_ids)
-
-        hidden_states = self.word_embeddings_layernorm(inputs_embeds)
-
-        presents = () if use_cache else None
-        all_self_attentions = () if output_attentions else None
-        all_hidden_states = () if output_hidden_states else None
-
-        # Compute alibi tensor: check build_alibi_tensor documentation
-        seq_length_with_past = seq_length
-        past_key_values_length = 0
-        if past_key_values[0] is not None:
-            past_key_values_length = past_key_values[0][0].shape[2]
-            seq_length_with_past = seq_length_with_past + past_key_values_length
-        if attention_mask is None:
-            attention_mask = torch.ones((batch_size, seq_length_with_past), device=hidden_states.device)
-        else:
-            attention_mask = attention_mask.to(hidden_states.device)
-
-        alibi = build_alibi_tensor(attention_mask, self.num_heads, dtype=hidden_states.dtype)
-
-        causal_mask = self._prepare_attn_mask(
-            attention_mask,
-            input_shape=(batch_size, seq_length),
-            past_key_values_length=past_key_values_length,
-        )
-
-        if pixel_values is not None and image_embeddings is not None:
-            raise ValueError("You cannot specify both pixel_values and image_embeddings at the same time")
-        elif pixel_values is not None:
-            pixel_values = pixel_values.to(dtype=self.dtype, device=input_ids.device)  # fp16 compatibility
-            batch_size, num_images = pixel_values.size(0), pixel_values.size(1)
-            pixel_values = pixel_values.contiguous().view(batch_size * num_images, *pixel_values.shape[2:])
-            # Get sequence from the vision encoder
-            image_hidden_states = self.vision_model(pixel_values=pixel_values).last_hidden_state
-        elif image_embeddings is not None:
-            batch_size, num_images, image_seq_len, image_hidden_size = image_embeddings.size()
-            image_hidden_states = image_embeddings.to(dtype=self.dtype, device=input_ids.device)
-            image_hidden_states = image_hidden_states.view(batch_size * num_images, image_seq_len, image_hidden_size)
-
-        if self.config.use_resampler:
-            image_hidden_states = self.perceiver_resampler(image_hidden_states)
-        image_seq_len, image_hidden_size = image_hidden_states.size(1), image_hidden_states.size(2)
-        image_hidden_states = image_hidden_states.view(batch_size, num_images * image_seq_len, image_hidden_size)
-        # Make image_attention_mask compatible with hidden states
-        text_seq_len = image_attention_mask.size(1)
-        image_attention_mask = image_attention_mask.unsqueeze(
-            -1
-        )  # TODO: something i don't understand here. why are the few last tokens not attending when there is just a single image?
-        image_attention_mask = image_attention_mask.repeat(1, 1, 1, image_seq_len)
-        image_attention_mask = image_attention_mask.view(batch_size, text_seq_len, num_images * image_seq_len)
-
-        if image_hidden_states is not None:
-            image_batch_size, image_sequence_length, _ = image_hidden_states.size()
-            image_hidden_shape = (image_batch_size, image_sequence_length)
-            if image_attention_mask is None:
-                image_attention_mask = torch.ones(image_hidden_shape, device=hidden_states.device)
-            # image_attention_mask = self.invert_attention_mask(image_attention_mask)
-            image_attention_mask = image_attention_mask.to(torch.bool)
-            image_attention_mask = image_attention_mask[:, None, :, :]
-        else:
-            image_attention_mask = None
-
-        for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)):
-            if output_hidden_states:
-                all_hidden_states = all_hidden_states + (hidden_states,)
-
-            def vblock(
-                main_block,
-                hidden_states,
-                alibi,
-                layer_past,
-                attention_mask,
-                layer_head_mask,
-                use_cache,
-                output_attentions,
-                image_hidden_states,
-                image_attention_mask,
-                layer_idx,
-                cross_layer_interval,
-                gated_cross_attn_layers,
-            ):
-                if layer_idx % cross_layer_interval == 0:
-                    xblock = gated_cross_attn_layers[layer_idx // cross_layer_interval]
-                    outputs = xblock(
-                        hidden_states,
-                        attention_mask=attention_mask,
-                        image_hidden_states=image_hidden_states,
-                        image_attention_mask=image_attention_mask,
-                        use_cache=use_cache,
-                        output_attentions=output_attentions,
-                    )
-                    hidden_states = outputs[0]
-
-                outputs = main_block(
-                    hidden_states,
-                    alibi=alibi,
-                    layer_past=layer_past,
-                    attention_mask=attention_mask,
-                    head_mask=layer_head_mask,
-                    use_cache=use_cache,
-                    output_attentions=output_attentions,
-                )
-
-                return outputs
-
-            if self.gradient_checkpointing and self.training:
-                layer_past = None
-                if use_cache:
-                    logger.warning_once(
-                        "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
-                    )
-                    use_cache = False
-
-                outputs = torch.utils.checkpoint.checkpoint(
-                    vblock,
-                    block,
-                    hidden_states,
-                    alibi,
-                    layer_past,
-                    causal_mask,
-                    head_mask[i],
-                    use_cache,
-                    output_attentions,
-                    image_hidden_states,
-                    image_attention_mask,
-                    i,
-                    self.cross_layer_interval,
-                    self.gated_cross_attn_layers,
-                )
-            else:
-                outputs = vblock(
-                    block,
-                    hidden_states,
-                    alibi=alibi,
-                    layer_past=layer_past,
-                    attention_mask=causal_mask,
-                    layer_head_mask=head_mask[i],
-                    use_cache=use_cache,
-                    output_attentions=output_attentions,
-                    image_hidden_states=image_hidden_states,
-                    image_attention_mask=image_attention_mask,
-                    layer_idx=i,
-                    cross_layer_interval=self.cross_layer_interval,
-                    gated_cross_attn_layers=self.gated_cross_attn_layers,
-                )
-
-            hidden_states = outputs[0]
-            if use_cache is True:
-                presents = presents + (outputs[1],)
-
-            if output_attentions:
-                all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],)
-
-        # Add last hidden state
-        hidden_states = self.ln_f(hidden_states)
-
-        if output_hidden_states:
-            all_hidden_states = all_hidden_states + (hidden_states,)
-
-        if not return_dict:
-            return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None)
-
-        return BaseModelOutputWithPastAndCrossAttentions(
-            last_hidden_state=hidden_states,
-            past_key_values=presents,
-            hidden_states=all_hidden_states,
-            attentions=all_self_attentions,
-        )
-
-
-@add_start_docstrings(
-    """
-    The Bloom Model transformer with a language modeling head on top (linear layer with weights tied to the input
-    embeddings).
-    """,
-    BLOOM_START_DOCSTRING,
-)
-class VBloomForCausalLM(VBloomPreTrainedModel):
-    _keys_to_ignore_on_load_missing = [r"h.*.self_attention.scale_mask_softmax.causal_mask", r"lm_head.weight"]
-
-    def __init__(self, config: VBloomConfig, vision_model=None):
-        super().__init__(config)
-        self.transformer = VBloomModel(config, vision_model=vision_model)
-        self.lm_head = DecoupledLinear(
-            in_features=config.hidden_size,
-            out_features=config.vocab_size,
-            out_additional_features=config.additional_vocab_size,
-            bias=False,
-            partially_freeze=config.freeze_lm_head,
-        )
-        # Initialize weights and apply final processing
-        self.post_init()
-
-    def get_output_embeddings(self):
-        return self.lm_head
-
-    def set_output_embeddings(self, new_embeddings: torch.Tensor):
-        self.lm_head = new_embeddings
-
-    def tie_weights(self):
-        """
-        Overwrite `transformers.modeling_utils.PreTrainedModel.tie_weights` to handle the case of DecoupledLinear and DecoupledEmbedding.
-        """
-        output_embeddings = self.get_output_embeddings()
-        input_embeddings = self.get_input_embeddings()
-
-        if getattr(self.config, "tie_word_embeddings", True):
-            output_embeddings.weight = input_embeddings.weight
-            if input_embeddings.num_additional_embeddings > 0:
-                assert output_embeddings.out_additional_features == input_embeddings.num_additional_embeddings
-                output_embeddings.additional_fc.weight = input_embeddings.additional_embedding.weight
-
-        if hasattr(output_embeddings, "out_features") and hasattr(input_embeddings, "num_embeddings"):
-            output_embeddings.out_features = input_embeddings.num_embeddings
-            if hasattr(output_embeddings, "out_additional_features") and hasattr(
-                input_embeddings, "num_additional_embeddings"
-            ):
-                output_embeddings.out_additional_features = input_embeddings.num_additional_embeddings
-
-    def prepare_inputs_for_generation(self, input_ids, past=None, **kwargs):
-        inputs = prepare_inputs_for_generation(input_ids, past=past, **kwargs)
-        unwanted_kwargs = ["position_ids", "token_type_ids"]
-        for kwarg in unwanted_kwargs:
-            inputs.pop(kwarg, None)
-        return inputs
-
-    @staticmethod
-    def _expand_inputs_for_generation(
-        *args,
-        **model_kwargs,
-    ):
-        return expand_inputs_for_generation(*args, **model_kwargs)
-
-    @staticmethod
-    def _update_model_kwargs_for_generation(outputs, model_kwargs, is_encoder_decoder=False):
-        return update_model_kwargs_for_generation(outputs, model_kwargs, is_encoder_decoder=is_encoder_decoder)
-
-    @add_start_docstrings_to_model_forward(BLOOM_INPUTS_DOCSTRING)
-    @add_code_sample_docstrings(
-        processor_class=_TOKENIZER_FOR_DOC,
-        checkpoint=_CHECKPOINT_FOR_DOC,
-        output_type=CausalLMOutputWithCrossAttentions,
-        config_class=_CONFIG_FOR_DOC,
-    )
-    def forward(
-        self,
-        input_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        head_mask: Optional[torch.Tensor] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-        pixel_values: Optional[torch.FloatTensor] = None,
-        image_embeddings: Optional[torch.FloatTensor] = None,
-        image_attention_mask: Optional[torch.Tensor] = None,
-        crossblock_head_mask: Optional[torch.Tensor] = None,
-        labels: Optional[torch.Tensor] = None,
-        use_cache: Optional[bool] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-        **deprecated_arguments,
-    ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]:
-        r"""
-        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
-            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
-            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
-        """
-        if deprecated_arguments.pop("position_ids", False) is not False:
-            # `position_ids` could have been `torch.Tensor` or `None` so defaulting pop to `False` allows to detect if users were passing explicitly `None`
-            warnings.warn(
-                (
-                    "`position_ids` have no functionality in BLOOM and will be removed in v5.0.0. You can safely"
-                    " ignore passing `position_ids`."
-                ),
-                FutureWarning,
-            )
-        if len(deprecated_arguments) > 0:
-            raise ValueError(f"Got unexpected arguments: {deprecated_arguments}")
-
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
-        transformer_outputs = self.transformer(
-            input_ids,
-            past_key_values=past_key_values,
-            attention_mask=attention_mask,
-            head_mask=head_mask,
-            inputs_embeds=inputs_embeds,
-            pixel_values=pixel_values,
-            image_embeddings=image_embeddings,
-            image_attention_mask=image_attention_mask,
-            crossblock_head_mask=crossblock_head_mask,
-            use_cache=use_cache,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
-        )
-        hidden_states = transformer_outputs[0]
-
-        lm_logits = self.lm_head(hidden_states)
-
-        loss = None
-        if labels is not None:
-            # Shift so that tokens < n predict n
-            if attention_mask is not None:
-                shift_attention_mask = attention_mask[..., 1:]
-                shift_logits = lm_logits[..., :-1, :][shift_attention_mask != 0].contiguous()
-                shift_labels = labels[..., 1:][shift_attention_mask != 0].contiguous()
-            else:
-                shift_logits = lm_logits[..., :-1, :].contiguous()
-                shift_labels = labels[..., 1:].contiguous()
-            # Flatten the tokens
-            loss_fct = CrossEntropyLoss()
-            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
-
-        if not return_dict:
-            output = (lm_logits,) + transformer_outputs[1:]
-            return ((loss,) + output) if loss is not None else output
-
-        return CausalLMOutputWithCrossAttentions(
-            loss=loss,
-            logits=lm_logits,
-            past_key_values=transformer_outputs.past_key_values,
-            hidden_states=transformer_outputs.hidden_states,
-            attentions=transformer_outputs.attentions,
-        )
-
-    @staticmethod
-    def _reorder_cache(
-        past: Tuple[Tuple[torch.Tensor, torch.Tensor], ...], beam_idx: torch.LongTensor
-    ) -> Tuple[Tuple[torch.Tensor, torch.Tensor], ...]:
-        """
-        This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or
-        [`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct
-        beam_idx at every generation step.
-
-        Output shares the same memory storage as `past`.
-        """
-        batch_size_times_num_heads, head_dim, seq_length = past[0][0].shape
-        batch_size = len(beam_idx)
-        num_heads = batch_size_times_num_heads // batch_size
-        # Get a copy of `beam_idx` on all the devices where we need those indices.
-        device_to_beam_idx = {
-            past_state.device: beam_idx.to(past_state.device) for layer_past in past for past_state in layer_past
-        }
-        # key: layer_past[0] [batch_size * num_heads, head_dim, seq_length]
-        # value: layer_past[1] [batch_size * num_heads, seq_length, head_dim]
-        return tuple(
-            (
-                layer_past[0]
-                .view(batch_size, num_heads, head_dim, seq_length)
-                .index_select(0, device_to_beam_idx[layer_past[0].device])
-                .view(batch_size_times_num_heads, head_dim, seq_length),
-                layer_past[1]
-                .view(batch_size, num_heads, seq_length, head_dim)
-                .index_select(0, device_to_beam_idx[layer_past[0].device])
-                .view(batch_size_times_num_heads, seq_length, head_dim),
-            )
-            for layer_past in past
-        )
-
-    def get_model_tflops_per_batch_per_gpu(self, hparams, data_param, tokenizer, max_num_images):
-        config_vl_model = self.config
-
-        language_embed_size = config_vl_model.hidden_size
-        vision_config = self.transformer.vision_model.config
-        num_language_layers = config_vl_model.n_layer
-        ffn_inner_size = 4 * config_vl_model.hidden_size
-
-        # Get vision model blocks infos
-        vision_patch_size = vision_config.patch_size
-        vision_hidden_size = vision_config.hidden_size
-        num_vision_layers = vision_config.num_hidden_layers
-        # The +1 is for the CLS token
-        single_image_seq_len = (vision_config.image_size // vision_patch_size) ** 2 + 1
-        vision_exp_factor = vision_config.intermediate_size // vision_hidden_size
-
-        # Get language and cross-att blocks infos
-        num_cross_attn_layers = num_language_layers // config_vl_model.cross_layer_interval
-        language_seq_len = data_param.max_seq_len
-        language_exp_factor = (ffn_inner_size // language_embed_size) if ffn_inner_size is not None else 4
-        cross_att_exp_factor = (ffn_inner_size // language_embed_size) if ffn_inner_size is not None else 4
-        k_v_cross_attn_seq_len = (
-            (self.config.resampler_n_latents * max_num_images)
-            if self.config.use_resampler
-            else (single_image_seq_len * max_num_images)
-        )
-
-        language_tflops_per_batch_per_gpu = compute_tflops_per_batch_per_gpu(
-            num_layers=num_language_layers,
-            batch_size=hparams.batch_size_per_gpu,
-            q_seq_len=language_seq_len,
-            k_seq_len=language_seq_len,
-            hidden_size=language_embed_size,
-            kv_in_dim=language_embed_size,
-            ff_exp_factor=language_exp_factor,
-            grad_acc_size=hparams.grad_acc_size,
-            swiglu=False,
-            vocab_size=tokenizer.vocab_size,
-            count_backward=True,  # Always True regardless of freezing, because gradients are computed for cross-attentions
-            use_grad_checkpointing=hparams.gradient_checkpointing,
-        )
-        cross_attention_tflops_per_batch_per_gpu = compute_tflops_per_batch_per_gpu(
-            num_layers=num_cross_attn_layers,
-            batch_size=hparams.batch_size_per_gpu,
-            q_seq_len=language_seq_len,
-            k_seq_len=k_v_cross_attn_seq_len,
-            hidden_size=language_embed_size,
-            kv_in_dim=vision_hidden_size,
-            ff_exp_factor=cross_att_exp_factor,
-            grad_acc_size=hparams.grad_acc_size,
-            swiglu=False,
-            vocab_size=None,
-            count_backward=True,
-            use_grad_checkpointing=hparams.gradient_checkpointing,
-        )
-        vision_tflops_per_batch_per_gpu = compute_tflops_per_batch_per_gpu(
-            num_layers=num_vision_layers,
-            batch_size=hparams.batch_size_per_gpu * max_num_images,
-            q_seq_len=single_image_seq_len,
-            k_seq_len=single_image_seq_len,
-            hidden_size=vision_hidden_size,
-            kv_in_dim=vision_hidden_size,
-            ff_exp_factor=vision_exp_factor,
-            grad_acc_size=hparams.grad_acc_size,
-            swiglu=False,
-            vocab_size=None,
-            count_backward=not hparams.model_params["freeze_vision_layers"],
-            use_grad_checkpointing=hparams.gradient_checkpointing,
-        )
-        if self.config.use_resampler:
-            perceiver_tflops_per_batch_per_gpu = compute_perceiver_tflops_per_batch_per_gpu(
-                num_layers=self.config.resampler_depth,
-                batch_size=hparams.batch_size_per_gpu * max_num_images,
-                q_seq_len=self.config.resampler_n_latents,
-                vision_embed_seq_len=single_image_seq_len,
-                q_k_v_input_dim=vision_hidden_size,
-                attention_hidden_size=self.config.resampler_n_heads * self.config.resampler_head_dim,
-                ff_exp_factor=cross_att_exp_factor,
-                count_backward=True,
-                use_grad_checkpointing=hparams.gradient_checkpointing,
-            )
-            flop_count = (
-                language_tflops_per_batch_per_gpu
-                + cross_attention_tflops_per_batch_per_gpu
-                + vision_tflops_per_batch_per_gpu
-                + perceiver_tflops_per_batch_per_gpu
-            )
-        else:
-            flop_count = (
-                language_tflops_per_batch_per_gpu
-                + cross_attention_tflops_per_batch_per_gpu
-                + vision_tflops_per_batch_per_gpu
-            )
-        return flop_count
diff --git a/m4/models/vgpt2/__init__.py b/m4/models/vgpt2/__init__.py
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/m4/models/vgpt2/configuration_vgpt2.py b/m4/models/vgpt2/configuration_vgpt2.py
deleted file mode 100644
index d7f4026d2894a11021b9a11255ddc192d07d14d5..0000000000000000000000000000000000000000
--- a/m4/models/vgpt2/configuration_vgpt2.py
+++ /dev/null
@@ -1,288 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" OpenAI GPT-2 configuration"""
-import os
-from typing import Tuple, Union
-
-from transformers import AutoConfig
-from transformers.configuration_utils import PretrainedConfig
-from transformers.utils import logging
-
-
-logger = logging.get_logger(__name__)
-
-GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = {
-    "gpt2": "https://huggingface.co/gpt2/resolve/main/config.json",
-    "gpt2-medium": "https://huggingface.co/gpt2-medium/resolve/main/config.json",
-    "gpt2-large": "https://huggingface.co/gpt2-large/resolve/main/config.json",
-    "gpt2-xl": "https://huggingface.co/gpt2-xl/resolve/main/config.json",
-    "distilgpt2": "https://huggingface.co/distilgpt2/resolve/main/config.json",
-}
-
-
-class VGPT2Config(PretrainedConfig):
-    """
-    This is the configuration class to store the configuration of a [`GPT2Model`] or a [`TFGPT2Model`]. It is used to
-    instantiate a GPT-2 model according to the specified arguments, defining the model architecture. Instantiating a
-    configuration with the defaults will yield a similar configuration to that of the GPT-2
-    [gpt2](https://huggingface.co/gpt2) architecture.
-
-    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
-    documentation from [`PretrainedConfig`] for more information.
-
-    TODO: this doc is completely out of sync with the actual args
-
-    Args:
-        vocab_size (`int`, *optional*, defaults to 50257):
-            Vocabulary size of the GPT-2 model. Defines the number of different tokens that can be represented by the
-            `inputs_ids` passed when calling [`GPT2Model`] or [`TFGPT2Model`].
-        additional_vocab_size (`int`, *optional`, defaults to 0):
-            Additional vocabulary size of the model, typically for the special "<img>" token. Additional vocab tokens
-            are always trainable whereas regular vocab tokens can be frozen or not.
-        n_positions (`int`, *optional*, defaults to 1024):
-            The maximum sequence length that this model might ever be used with. Typically set this to something large
-            just in case (e.g., 512 or 1024 or 2048).
-        n_embd (`int`, *optional*, defaults to 768):
-            Dimensionality of the embeddings and hidden states.
-        n_layer (`int`, *optional*, defaults to 12):
-            Number of hidden layers in the Transformer encoder.
-        n_head (`int`, *optional*, defaults to 12):
-            Number of attention heads for each attention layer in the Transformer encoder.
-        n_inner (`int`, *optional*, defaults to None):
-            Dimensionality of the inner feed-forward layers. `None` will set it to 4 times n_embd
-        activation_function (`str`, *optional*, defaults to `"gelu"`):
-            Activation function, to be selected in the list `["relu", "silu", "gelu", "tanh", "gelu_new"]`.
-        resid_pdrop (`float`, *optional*, defaults to 0.1):
-            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-        embd_pdrop (`int`, *optional*, defaults to 0.1):
-            The dropout ratio for the embeddings.
-        attn_pdrop (`float`, *optional*, defaults to 0.1):
-            The dropout ratio for the attention.
-        layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
-            The epsilon to use in the layer normalization layers.
-        initializer_range (`float`, *optional*, defaults to 0.02):
-            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        alpha_initializer (`str`, *optional*, defaults to `"ones"`):
-            Initialization type for the alphas.
-        alphas_initializer_range (`float`, *optional*, defaults to 0.0):
-            The standard deviation of the truncated_normal_initializer for initializing the alphas in the Gated Cross Attention.
-        alpha_type (`str`, *optional*, defaults to `"vector"`):
-            Whether the gating alphas should be vectors or single floats.
-        summary_type (`string`, *optional*, defaults to `"cls_index"`):
-            Argument used when doing sequence summary, used in the models [`GPT2DoubleHeadsModel`] and
-            [`TFGPT2DoubleHeadsModel`].
-
-            Has to be one of the following options:
-
-                - `"last"`: Take the last token hidden state (like XLNet).
-                - `"first"`: Take the first token hidden state (like BERT).
-                - `"mean"`: Take the mean of all tokens hidden states.
-                - `"cls_index"`: Supply a Tensor of classification token position (like GPT/GPT-2).
-                - `"attn"`: Not implemented now, use multi-head attention.
-        summary_use_proj (`bool`, *optional*, defaults to `True`):
-            Argument used when doing sequence summary, used in the models [`GPT2DoubleHeadsModel`] and
-            [`TFGPT2DoubleHeadsModel`].
-
-            Whether or not to add a projection after the vector extraction.
-        summary_activation (`str`, *optional*):
-            Argument used when doing sequence summary. Used in for the multiple choice head in
-            [`GPT2DoubleHeadsModel`].
-
-            Pass `"tanh"` for a tanh activation to the output, any other value will result in no activation.
-        summary_proj_to_labels (`bool`, *optional*, defaults to `True`):
-            Argument used when doing sequence summary, used in the models [`GPT2DoubleHeadsModel`] and
-            [`TFGPT2DoubleHeadsModel`].
-
-            Whether the projection outputs should have `config.num_labels` or `config.hidden_size` classes.
-        summary_first_dropout (`float`, *optional*, defaults to 0.1):
-            Argument used when doing sequence summary, used in the models [`GPT2DoubleHeadsModel`] and
-            [`TFGPT2DoubleHeadsModel`].
-
-            The dropout ratio to be used after the projection and activation.
-        scale_attn_weights (`bool`, *optional*, defaults to `True`):
-            Scale attention weights by dividing by sqrt(hidden_size)..
-        use_cache (`bool`, *optional*, defaults to `True`):
-            Whether or not the model should return the last key/values attentions (not used by all models).
-        scale_attn_by_inverse_layer_idx (`bool`, *optional*, defaults to `False`):
-            Whether to additionally scale attention weights by `1 / layer_idx + 1`.
-        reorder_and_upcast_attn (`bool`, *optional*, defaults to `False`):
-            Whether to scale keys (K) prior to computing attention (dot-product) and upcast attention
-            dot-product/softmax to float() when training with mixed precision.
-        cross_layer_interval (`int`, *optional*, default to 1)
-            Interval for cross attention (from text to image) layers.
-
-    Example:
-
-    ```python
-    >>> from transformers import GPT2Model, GPT2Config
-
-    >>> # Initializing a GPT2 configuration
-    >>> configuration = GPT2Config()
-
-    >>> # Initializing a model from the configuration
-    >>> model = GPT2Model(configuration)
-
-    >>> # Accessing the model configuration
-    >>> configuration = model.config
-    ```"""
-
-    model_type = "vgpt2"
-    keys_to_ignore_at_inference = ["past_key_values"]
-    attribute_map = {
-        "hidden_size": "n_embd",
-        "max_position_embeddings": "n_positions",
-        "num_attention_heads": "n_head",
-        "num_hidden_layers": "n_layer",
-    }
-
-    def __init__(
-        self,
-        vocab_size=50257,
-        additional_vocab_size=0,
-        n_positions=1024,
-        n_embd=768,
-        n_layer=12,
-        n_head=12,
-        n_inner=None,
-        activation_function="gelu_new",
-        resid_pdrop=0.1,
-        embd_pdrop=0.1,
-        attn_pdrop=0.1,
-        layer_norm_epsilon=1e-5,
-        initializer_range=0.02,
-        alpha_initializer="ones",
-        alphas_initializer_range=0.0,
-        alpha_type="vector",
-        summary_type="cls_index",
-        summary_use_proj=True,
-        summary_activation=None,
-        summary_proj_to_labels=True,
-        summary_first_dropout=0.1,
-        scale_attn_weights=True,
-        use_cache=True,
-        bos_token_id=50256,
-        eos_token_id=50256,
-        scale_attn_by_inverse_layer_idx=False,
-        reorder_and_upcast_attn=False,
-        cross_layer_interval=1,
-        tie_word_embeddings=False,
-        freeze_text_layers=True,
-        freeze_lm_head=False,
-        freeze_vision_layers=True,
-        vision_model_name="google/vit-base-patch16-224",
-        vision_model_params="{}",
-        vision_embed_dim=768,
-        vision_image_size=224,
-        image_token_index=50257,
-        use_resampler=False,
-        resampler_n_latents=64,
-        resampler_depth=6,
-        resampler_n_heads=16,
-        resampler_head_dim=96,
-        **kwargs,
-    ):
-        self.vocab_size = vocab_size
-        self.additional_vocab_size = additional_vocab_size
-        self.n_positions = n_positions
-        self.n_embd = n_embd
-        self.n_layer = n_layer
-        self.n_head = n_head
-        self.n_inner = n_inner
-        self.activation_function = activation_function
-        self.resid_pdrop = resid_pdrop
-        self.embd_pdrop = embd_pdrop
-        self.attn_pdrop = attn_pdrop
-        self.layer_norm_epsilon = layer_norm_epsilon
-        self.initializer_range = initializer_range
-        self.alpha_initializer = alpha_initializer
-        self.alphas_initializer_range = alphas_initializer_range
-        self.alpha_type = alpha_type
-        self.summary_type = summary_type
-        self.summary_use_proj = summary_use_proj
-        self.summary_activation = summary_activation
-        self.summary_first_dropout = summary_first_dropout
-        self.summary_proj_to_labels = summary_proj_to_labels
-        self.scale_attn_weights = scale_attn_weights
-        self.use_cache = use_cache
-        self.scale_attn_by_inverse_layer_idx = scale_attn_by_inverse_layer_idx
-        self.reorder_and_upcast_attn = reorder_and_upcast_attn
-
-        self.bos_token_id = bos_token_id
-        self.eos_token_id = eos_token_id
-        self.cross_layer_interval = cross_layer_interval
-        self.freeze_vision_layers = freeze_vision_layers
-        self.vision_model_name = vision_model_name
-        self.vision_model_params = vision_model_params
-
-        self.tie_word_embeddings = tie_word_embeddings
-        self.freeze_text_layers = freeze_text_layers
-        self.freeze_lm_head = freeze_lm_head
-        self.image_token_index = image_token_index
-
-        self.vision_embed_dim = vision_embed_dim
-        self.vision_image_size = vision_image_size
-
-        # Resampler params
-        self.use_resampler = use_resampler
-        self.resampler_n_latents = resampler_n_latents
-        self.resampler_depth = resampler_depth
-        self.resampler_n_heads = resampler_n_heads
-        self.resampler_head_dim = resampler_head_dim
-
-        # IMPORTANT: Do not do any __init__ args-based checks in the constructor, since
-        # PretrainedConfig.from_dict first instantiates the class with the config dict and only then
-        # updates the config object with `kwargs` from from_pretrained, so during the instantiation
-        # of this object many attributes have default values and haven't yet been overridden.
-        # Do any required checks inside `from_pretrained` once the superclass' `from_pretrained` was run.
-
-        super().__init__(
-            bos_token_id=bos_token_id, eos_token_id=eos_token_id, tie_word_embeddings=tie_word_embeddings, **kwargs
-        )
-
-    def check_compatibilities(self):
-        if self.tie_word_embeddings and (self.freeze_text_layers != self.freeze_lm_head):
-            raise ValueError(
-                "if `tie_word_embeddings` is True, then `freeze_lm_head` and `freeze_text_layers` must be equal."
-            )
-
-        vision_model_params = eval(self.vision_model_params)
-        config = AutoConfig.from_pretrained(self.vision_model_name, **vision_model_params)
-        if hasattr(config, "vision_config"):
-            vision_config = config.vision_config
-        else:
-            vision_config = config
-        vision_embed_dim = vision_config.hidden_size
-        if self.vision_embed_dim != vision_embed_dim:
-            raise ValueError(
-                f"vision_embed_dim ({self.vision_embed_dim}) must match the hidden size of the vision model"
-                f" ({vision_embed_dim})"
-            )
-        vision_image_size = vision_config.image_size
-        if self.vision_image_size != vision_image_size:
-            raise ValueError(
-                f"vision_image_size ({self.vision_image_size}) must match the hidden size of the vision model"
-                f" ({vision_image_size})"
-            )
-
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
-        outputs = super(VGPT2Config, cls).from_pretrained(pretrained_model_name_or_path, **kwargs)
-        if isinstance(outputs, Tuple):
-            # When called with return_unused_kwargs=True, the first item will be the config
-            outputs[0].check_compatibilities()
-        else:
-            outputs.check_compatibilities()
-        return outputs
diff --git a/m4/models/vgpt2/modeling_vgpt2.py b/m4/models/vgpt2/modeling_vgpt2.py
deleted file mode 100644
index 7d7641662ac2897415bee4238f6e526afcd62c95..0000000000000000000000000000000000000000
--- a/m4/models/vgpt2/modeling_vgpt2.py
+++ /dev/null
@@ -1,1384 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""PyTorch OpenAI GPT-2 model."""
-
-import math
-import os
-from typing import Optional, Tuple, Union
-
-import torch
-import torch.utils.checkpoint
-from torch import nn
-from torch.cuda.amp import autocast
-from torch.nn import CrossEntropyLoss
-from transformers.activations import ACT2FN
-from transformers.modeling_outputs import BaseModelOutputWithPastAndCrossAttentions, CausalLMOutputWithCrossAttentions
-from transformers.pytorch_utils import Conv1D, find_pruneable_heads_and_indices, prune_conv1d_layer
-from transformers.utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward
-from transformers.utils.model_parallel_utils import assert_device_map, get_device_map
-
-from m4.models import DecoupledEmbedding, DecoupledLinear
-from m4.models.common import (
-    expand_inputs_for_generation,
-    prepare_inputs_for_generation,
-    update_model_kwargs_for_generation,
-)
-from m4.models.custom_modules import VLOOMPreTrainedModelBase
-from m4.models.perceiver.perceiver import PerceiverResampler
-from m4.models.vgpt2.configuration_vgpt2 import VGPT2Config
-from m4.training.utils import (
-    compute_perceiver_tflops_per_batch_per_gpu,
-    compute_tflops_per_batch_per_gpu,
-    freeze_model,
-)
-from m4.utils import logging
-
-
-logger = logging.get_logger(__name__)
-
-_CHECKPOINT_FOR_DOC = "gpt2"
-_CONFIG_FOR_DOC = "VGPT2Config"
-_TOKENIZER_FOR_DOC = "GPT2Tokenizer"
-
-GPT2_PRETRAINED_MODEL_ARCHIVE_LIST = [
-    "gpt2",
-    "gpt2-medium",
-    "gpt2-large",
-    "gpt2-xl",
-    "distilgpt2",
-    # See all GPT-2 models at https://huggingface.co/models?filter=gpt2
-]
-
-
-def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path):
-    """Load tf checkpoints in a pytorch model"""
-    try:
-        import re
-
-        import tensorflow as tf
-    except ImportError:
-        logger.error(
-            "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
-            "https://www.tensorflow.org/install/ for installation instructions."
-        )
-        raise
-    tf_path = os.path.abspath(gpt2_checkpoint_path)
-    logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
-    # Load weights from TF model
-    init_vars = tf.train.list_variables(tf_path)
-    names = []
-    arrays = []
-    for name, shape in init_vars:
-        logger.info(f"Loading TF weight {name} with shape {shape}")
-        array = tf.train.load_variable(tf_path, name)
-        names.append(name)
-        arrays.append(array.squeeze())
-
-    for name, array in zip(names, arrays):
-        name = name[6:]  # skip "model/"
-        name = name.split("/")
-        pointer = model
-        for m_name in name:
-            if re.fullmatch(r"[A-Za-z]+\d+", m_name):
-                scope_names = re.split(r"(\d+)", m_name)
-            else:
-                scope_names = [m_name]
-            if scope_names[0] == "w" or scope_names[0] == "g":
-                pointer = getattr(pointer, "weight")
-            elif scope_names[0] == "b":
-                pointer = getattr(pointer, "bias")
-            elif scope_names[0] == "wpe" or scope_names[0] == "wte":
-                pointer = getattr(pointer, scope_names[0])
-                pointer = getattr(pointer, "weight")
-            else:
-                pointer = getattr(pointer, scope_names[0])
-            if len(scope_names) >= 2:
-                num = int(scope_names[1])
-                pointer = pointer[num]
-        try:
-            assert (
-                pointer.shape == array.shape
-            ), f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched"
-        except AssertionError as e:
-            e.args += (pointer.shape, array.shape)
-            raise
-        logger.info(f"Initialize PyTorch weight {name}")
-        pointer.data = torch.from_numpy(array)
-    return model
-
-
-class GPT2Attention(nn.Module):
-    def __init__(self, config, is_cross_attention=False, layer_idx=None):
-        super().__init__()
-
-        max_positions = config.max_position_embeddings
-        self.register_buffer(
-            "bias",
-            torch.tril(torch.ones((max_positions, max_positions), dtype=torch.uint8)).view(
-                1, 1, max_positions, max_positions
-            ),
-        )
-        self.register_buffer("masked_bias", torch.tensor(-1e4))
-
-        self.embed_dim = config.hidden_size
-        self.num_heads = config.num_attention_heads
-        self.head_dim = self.embed_dim // self.num_heads
-        self.split_size = self.embed_dim
-        if self.head_dim * self.num_heads != self.embed_dim:
-            raise ValueError(
-                f"`embed_dim` must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
-                f" {self.num_heads})."
-            )
-
-        self.scale_attn_weights = config.scale_attn_weights
-        self.is_cross_attention = is_cross_attention
-
-        # Layer-wise attention scaling, reordering, and upcasting
-        self.scale_attn_by_inverse_layer_idx = config.scale_attn_by_inverse_layer_idx
-        self.layer_idx = layer_idx
-        self.reorder_and_upcast_attn = config.reorder_and_upcast_attn
-
-        if self.is_cross_attention:
-            in_dim = self.embed_dim if not hasattr(config, "vision_embed_dim") else config.vision_embed_dim
-            self.c_attn = Conv1D(2 * self.embed_dim, in_dim)
-            self.q_attn = Conv1D(self.embed_dim, self.embed_dim)
-        else:
-            self.c_attn = Conv1D(3 * self.embed_dim, self.embed_dim)
-        self.c_proj = Conv1D(self.embed_dim, self.embed_dim)
-
-        self.attn_dropout = nn.Dropout(config.attn_pdrop)
-        self.resid_dropout = nn.Dropout(config.resid_pdrop)
-
-        self.pruned_heads = set()
-
-    def prune_heads(self, heads):
-        if len(heads) == 0:
-            return
-        heads, index = find_pruneable_heads_and_indices(heads, self.num_heads, self.head_dim, self.pruned_heads)
-        index_attn = torch.cat([index, index + self.split_size, index + (2 * self.split_size)])
-
-        # Prune conv1d layers
-        self.c_attn = prune_conv1d_layer(self.c_attn, index_attn, dim=1)
-        self.c_proj = prune_conv1d_layer(self.c_proj, index, dim=0)
-
-        # Update hyper params
-        self.split_size = (self.split_size // self.num_heads) * (self.num_heads - len(heads))
-        self.num_heads = self.num_heads - len(heads)
-        self.pruned_heads = self.pruned_heads.union(heads)
-
-    def _attn(self, query, key, value, attention_mask=None, head_mask=None):
-        attn_weights = torch.matmul(query, key.transpose(-1, -2))
-
-        if self.scale_attn_weights:
-            attn_weights = attn_weights / torch.tensor(
-                value.size(-1) ** 0.5, dtype=attn_weights.dtype, device=attn_weights.device
-            )
-
-        # Layer-wise attention scaling
-        if self.scale_attn_by_inverse_layer_idx:
-            attn_weights = attn_weights / float(self.layer_idx + 1)
-
-        if not self.is_cross_attention:
-            # if only "normal" attention layer implements causal mask
-            query_length, key_length = query.size(-2), key.size(-2)
-            causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length].to(torch.bool)
-            mask_value = torch.finfo(attn_weights.dtype).min
-            # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`.
-            # Need to be on the same device, otherwise `RuntimeError: ..., x and y to be on the same device`
-            mask_value = torch.tensor(mask_value, dtype=attn_weights.dtype).to(attn_weights.device)
-            attn_weights = torch.where(causal_mask, attn_weights, mask_value)
-
-        if attention_mask is not None:
-            # Apply the attention mask
-            attn_weights = attn_weights + attention_mask
-
-        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
-
-        # Downcast (if necessary) back to V's dtype (if in mixed-precision) -- No-Op otherwise
-        attn_weights = attn_weights.type(value.dtype)
-        attn_weights = self.attn_dropout(attn_weights)
-
-        # Mask heads if we want to
-        if head_mask is not None:
-            attn_weights = attn_weights * head_mask
-
-        attn_output = torch.matmul(attn_weights, value)
-
-        return attn_output, attn_weights
-
-    def _upcast_and_reordered_attn(self, query, key, value, attention_mask=None, head_mask=None):
-        # Use `torch.baddbmm` (a bit more efficient w/ alpha param for scaling -- from Megatron-LM)
-        bsz, num_heads, q_seq_len, dk = query.size()
-        _, _, k_seq_len, _ = key.size()
-
-        # Preallocate attn_weights for `baddbmm`
-        attn_weights = torch.empty(bsz * num_heads, q_seq_len, k_seq_len, dtype=torch.float32, device=query.device)
-
-        # Compute Scale Factor
-        scale_factor = 1.0
-        if self.scale_attn_weights:
-            scale_factor /= float(value.size(-1)) ** 0.5
-
-        if self.scale_attn_by_inverse_layer_idx:
-            scale_factor /= float(self.layer_idx + 1)
-
-        # Upcast (turn off autocast) and reorder (Scale K by 1 / root(dk))
-        with autocast(enabled=False):
-            q, k = query.reshape(-1, q_seq_len, dk), key.transpose(-1, -2).reshape(-1, dk, k_seq_len)
-            attn_weights = torch.baddbmm(attn_weights, q.float(), k.float(), beta=0, alpha=scale_factor)
-            attn_weights = attn_weights.reshape(bsz, num_heads, q_seq_len, k_seq_len)
-
-        if not self.is_cross_attention:
-            # if only "normal" attention layer implements causal mask
-            query_length, key_length = query.size(-2), key.size(-2)
-            causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length].bool()
-            mask_value = torch.finfo(attn_weights.dtype).min
-            # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`.
-            # Need to be on the same device, otherwise `RuntimeError: ..., x and y to be on the same device`
-            mask_value = torch.tensor(mask_value, dtype=attn_weights.dtype).to(attn_weights.device)
-            attn_weights = torch.where(causal_mask, attn_weights, mask_value)
-
-        if attention_mask is not None:
-            # Apply the attention mask
-            attn_weights = attn_weights + attention_mask
-
-        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
-
-        # Downcast (if necessary) back to V's dtype (if in mixed-precision) -- No-Op if otherwise
-        if attn_weights.dtype != torch.float32:
-            raise RuntimeError("Error with upcasting, attn_weights does not have dtype torch.float32")
-        attn_weights = attn_weights.type(value.dtype)
-        attn_weights = self.attn_dropout(attn_weights)
-
-        # Mask heads if we want to
-        if head_mask is not None:
-            attn_weights = attn_weights * head_mask
-
-        attn_output = torch.matmul(attn_weights, value)
-
-        return attn_output, attn_weights
-
-    def _split_heads(self, tensor, num_heads, attn_head_size):
-        """
-        Splits hidden_size dim into attn_head_size and num_heads
-        """
-        new_shape = tensor.size()[:-1] + (num_heads, attn_head_size)
-        tensor = tensor.view(new_shape)
-        return tensor.permute(0, 2, 1, 3)  # (batch, head, seq_length, head_features)
-
-    def _merge_heads(self, tensor, num_heads, attn_head_size):
-        """
-        Merges attn_head_size dim and num_attn_heads dim into hidden_size
-        """
-        tensor = tensor.permute(0, 2, 1, 3).contiguous()
-        new_shape = tensor.size()[:-2] + (num_heads * attn_head_size,)
-        return tensor.view(new_shape)
-
-    def forward(
-        self,
-        hidden_states: Optional[Tuple[torch.FloatTensor]],
-        layer_past: Optional[Tuple[torch.Tensor]] = None,
-        attention_mask: Optional[torch.FloatTensor] = None,
-        head_mask: Optional[torch.FloatTensor] = None,
-        encoder_hidden_states: Optional[torch.Tensor] = None,
-        encoder_attention_mask: Optional[torch.FloatTensor] = None,
-        use_cache: Optional[bool] = False,
-        output_attentions: Optional[bool] = False,
-    ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]], ...]:
-        if encoder_hidden_states is not None:
-            if not hasattr(self, "q_attn"):
-                raise ValueError(
-                    "If class is used as cross attention, the weights `q_attn` have to be defined. "
-                    "Please make sure to instantiate class with `GPT2Attention(..., is_cross_attention=True)`."
-                )
-
-            query = self.q_attn(hidden_states)
-            key, value = self.c_attn(encoder_hidden_states).split(self.split_size, dim=2)
-            attention_mask = encoder_attention_mask
-        else:
-            query, key, value = self.c_attn(hidden_states).split(self.split_size, dim=2)
-
-        query = self._split_heads(query, self.num_heads, self.head_dim)
-        key = self._split_heads(key, self.num_heads, self.head_dim)
-        value = self._split_heads(value, self.num_heads, self.head_dim)
-
-        if layer_past is not None:
-            past_key, past_value = layer_past
-            key = torch.cat((past_key, key), dim=-2)
-            value = torch.cat((past_value, value), dim=-2)
-
-        if use_cache is True:
-            present = (key, value)
-        else:
-            present = None
-
-        if self.reorder_and_upcast_attn:
-            attn_output, attn_weights = self._upcast_and_reordered_attn(query, key, value, attention_mask, head_mask)
-        else:
-            attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask)
-
-        attn_output = self._merge_heads(attn_output, self.num_heads, self.head_dim)
-        attn_output = self.c_proj(attn_output)
-        attn_output = self.resid_dropout(attn_output)
-
-        outputs = (attn_output, present)
-        if output_attentions:
-            outputs += (attn_weights,)
-
-        return outputs  # a, present, (attentions)
-
-
-class GPT2MLP(nn.Module):
-    def __init__(self, intermediate_size, config):
-        super().__init__()
-        embed_dim = config.hidden_size
-        self.c_fc = Conv1D(intermediate_size, embed_dim)
-        self.c_proj = Conv1D(embed_dim, intermediate_size)
-        self.act = ACT2FN[config.activation_function]
-        self.dropout = nn.Dropout(config.resid_pdrop)
-
-    def forward(self, hidden_states: Optional[Tuple[torch.FloatTensor]]) -> torch.FloatTensor:
-        hidden_states = self.c_fc(hidden_states)
-        hidden_states = self.act(hidden_states)
-        hidden_states = self.c_proj(hidden_states)
-        hidden_states = self.dropout(hidden_states)
-        return hidden_states
-
-
-class GPT2Block(nn.Module):
-    def __init__(self, config, layer_idx=None):
-        super().__init__()
-        hidden_size = config.hidden_size
-        inner_dim = config.n_inner if config.n_inner is not None else 4 * hidden_size
-
-        self.ln_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
-        self.attn = GPT2Attention(config, layer_idx=layer_idx)
-        self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
-
-        if config.add_cross_attention:
-            self.crossattention = GPT2Attention(config, is_cross_attention=True, layer_idx=layer_idx)
-            self.ln_cross_attn = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
-
-        self.mlp = GPT2MLP(inner_dim, config)
-
-    def forward(
-        self,
-        hidden_states: Optional[Tuple[torch.FloatTensor]],
-        layer_past: Optional[Tuple[torch.Tensor]] = None,
-        attention_mask: Optional[torch.FloatTensor] = None,
-        head_mask: Optional[torch.FloatTensor] = None,
-        encoder_hidden_states: Optional[torch.Tensor] = None,
-        encoder_attention_mask: Optional[torch.FloatTensor] = None,
-        use_cache: Optional[bool] = False,
-        output_attentions: Optional[bool] = False,
-    ) -> Union[Tuple[torch.Tensor], Optional[Tuple[torch.Tensor, Tuple[torch.FloatTensor, ...]]]]:
-        residual = hidden_states
-        hidden_states = self.ln_1(hidden_states)
-        attn_outputs = self.attn(
-            hidden_states,
-            layer_past=layer_past,
-            attention_mask=attention_mask,
-            head_mask=head_mask,
-            use_cache=use_cache,
-            output_attentions=output_attentions,
-        )
-        attn_output = attn_outputs[0]  # output_attn: a, present, (attentions)
-        outputs = attn_outputs[1:]
-        # residual connection
-        hidden_states = attn_output + residual
-
-        if encoder_hidden_states is not None:
-            # add one self-attention block for cross-attention
-            if not hasattr(self, "crossattention"):
-                raise ValueError(
-                    f"If `encoder_hidden_states` are passed, {self} has to be instantiated with "
-                    "cross-attention layers by setting `config.add_cross_attention=True`"
-                )
-            residual = hidden_states
-            hidden_states = self.ln_cross_attn(hidden_states)
-            cross_attn_outputs = self.crossattention(
-                hidden_states,
-                attention_mask=attention_mask,
-                head_mask=head_mask,
-                encoder_hidden_states=encoder_hidden_states,
-                encoder_attention_mask=encoder_attention_mask,
-                output_attentions=output_attentions,
-            )
-            attn_output = cross_attn_outputs[0]
-            # residual connection
-            hidden_states = residual + attn_output
-            outputs = outputs + cross_attn_outputs[2:]  # add cross attentions if we output attention weights
-
-        residual = hidden_states
-        hidden_states = self.ln_2(hidden_states)
-        feed_forward_hidden_states = self.mlp(hidden_states)
-        # residual connection
-        hidden_states = residual + feed_forward_hidden_states
-
-        if use_cache:
-            outputs = (hidden_states,) + outputs
-        else:
-            outputs = (hidden_states,) + outputs[1:]
-
-        return outputs  # hidden_states, present, (attentions, cross_attentions)
-
-
-class VGPT2GatedCrossAttentionBlock(nn.Module):
-    def __init__(self, config, layer_idx=None):
-        super().__init__()
-        hidden_size = config.hidden_size
-        inner_dim = config.n_inner if config.n_inner is not None else 4 * hidden_size
-
-        self.ln_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
-        self.cross_attn = GPT2Attention(config, is_cross_attention=True, layer_idx=layer_idx)
-        self.mlp = GPT2MLP(inner_dim, config)
-        self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
-        self.act = nn.Tanh()
-
-        if config.alpha_initializer == "zeros":
-            if config.alpha_type == "vector":
-                self.alpha_cross_attn = nn.Parameter(torch.zeros(1, 1, hidden_size))
-                self.alpha_dense = nn.Parameter(torch.zeros(1, 1, hidden_size))
-            elif config.alpha_type == "float":
-                self.alpha_cross_attn = nn.Parameter(torch.zeros(1))
-                self.alpha_dense = nn.Parameter(torch.zeros(1))
-            else:
-                raise ValueError(f"Unknown value for `alpha_type` ({config.alpha_type})")
-
-        elif config.alpha_initializer == "ones":
-            if config.alpha_type == "vector":
-                self.alpha_cross_attn = nn.Parameter(torch.ones(1, 1, hidden_size))
-                self.alpha_dense = nn.Parameter(torch.ones(1, 1, hidden_size))
-            elif config.alpha_type == "float":
-                self.alpha_cross_attn = nn.Parameter(torch.ones(1))
-                self.alpha_dense = nn.Parameter(torch.ones(1))
-            else:
-                raise ValueError(f"Unknown value for `alpha_type` ({config.alpha_type})")
-
-        elif config.alpha_initializer in {"normal", "gaussian", "random"}:
-            if config.alpha_type == "vector":
-                self.alpha_cross_attn = nn.Parameter(
-                    torch.normal(mean=0.0, std=config.alphas_initializer_range, size=(1, 1, hidden_size))
-                )
-                self.alpha_dense = nn.Parameter(
-                    torch.normal(mean=0.0, std=config.alphas_initializer_range, size=(1, 1, hidden_size))
-                )
-            elif config.alpha_type == "float":
-                self.alpha_cross_attn = nn.Parameter(
-                    torch.normal(mean=0.0, std=config.alphas_initializer_range, size=(1))
-                )
-                self.alpha_dense = nn.Parameter(torch.normal(mean=0.0, std=config.alphas_initializer_range, size=(1)))
-            else:
-                raise ValueError(f"Unknown value for `alpha_type` ({config.alpha_type})")
-
-        else:
-            raise NotImplementedError(f"Alpha initialization scheme {config.alpha_initializer} not yet implemented!")
-
-    def forward(
-        self,
-        hidden_states: Optional[Tuple[torch.FloatTensor]],
-        layer_past: Optional[Tuple[torch.Tensor]] = None,
-        attention_mask: Optional[torch.FloatTensor] = None,
-        head_mask: Optional[torch.FloatTensor] = None,
-        image_hidden_states: Optional[torch.Tensor] = None,
-        image_attention_mask: Optional[torch.FloatTensor] = None,
-        use_cache: Optional[bool] = False,
-        output_attentions: Optional[bool] = False,
-    ) -> Union[Tuple[torch.Tensor], Optional[Tuple[torch.Tensor, Tuple[torch.FloatTensor, ...]]]]:
-        if image_hidden_states is None:
-            raise ValueError(
-                "`image_hidden_states` is required for VGPT2 cross attention module which are visual features to be"
-                " conditioned on."
-            )
-            # add one self-attention block for cross-attention
-
-        # TODO(aps): Handle cross attention in the outputs
-        # if not hasattr(self, "crossattention"):
-        #     raise ValueError(
-        #         f"If `image_hidden_states` are passed, {self} has to be instantiated with "
-        #         "cross-attention layers by setting `config.add_cross_attention=True`"
-        #     )
-        residual = hidden_states
-
-        hidden_states = self.ln_1(hidden_states)
-        cross_attn_outputs = self.cross_attn(
-            hidden_states,
-            attention_mask=attention_mask,
-            head_mask=head_mask,
-            encoder_hidden_states=image_hidden_states,
-            encoder_attention_mask=image_attention_mask,
-            output_attentions=output_attentions,
-        )
-        attn_output = cross_attn_outputs[0]
-        outputs = cross_attn_outputs[1:]
-        # residual connection
-        hidden_states = residual + self.act(self.alpha_cross_attn) * attn_output
-        outputs = outputs + cross_attn_outputs[2:]  # add cross attentions if we output attention weights
-
-        residual = hidden_states
-        hidden_states = self.ln_2(hidden_states)
-        feed_forward_hidden_states = self.mlp(hidden_states)
-        # residual connection
-        hidden_states = residual + self.act(self.alpha_dense) * feed_forward_hidden_states
-
-        if use_cache:
-            outputs = (hidden_states,) + outputs
-        else:
-            outputs = (hidden_states,) + outputs[1:]
-
-        return outputs  # hidden_states, present, (attentions, cross_attentions)
-
-
-class VGPT2PreTrainedModel(VLOOMPreTrainedModelBase):
-    """
-    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
-    models.
-    """
-
-    config_class = VGPT2Config
-    load_tf_weights = load_tf_weights_in_gpt2
-    base_model_prefix = "transformer"
-    is_parallelizable = True
-    supports_gradient_checkpointing = True
-    _no_split_modules = ["GPT2Block"]
-
-    def __init__(self, *inputs, **kwargs):
-        super().__init__(*inputs, **kwargs)
-
-    def _init_weights(self, module):
-        """Initialize the weights."""
-        if isinstance(module, (nn.Linear, Conv1D)):
-            # Slightly different from the TF version which uses truncated_normal for initialization
-            # cf https://github.com/pytorch/pytorch/pull/5617
-            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
-            if module.bias is not None:
-                module.bias.data.zero_()
-        elif isinstance(module, nn.Embedding):
-            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
-            if module.padding_idx is not None:
-                module.weight.data[module.padding_idx].zero_()
-        elif isinstance(module, nn.LayerNorm):
-            module.bias.data.zero_()
-            module.weight.data.fill_(1.0)
-
-        # Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme:
-        #   > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale
-        #   > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers.
-        #   >   -- GPT-2 :: https://openai.com/blog/better-language-models/
-        #
-        # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
-        for name, p in module.named_parameters():
-            if name == "c_proj.weight":
-                # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block
-                p.data.normal_(mean=0.0, std=(self.config.initializer_range / math.sqrt(2 * self.config.n_layer)))
-
-    def _set_gradient_checkpointing(self, module, value=False):
-        if isinstance(module, VGPT2Model):
-            module.gradient_checkpointing = value
-
-    @classmethod
-    def override_vision_model_wrapper(cls, model, config, vision_model_name, vision_model_params, torch_dtype):
-        # this can be called via from_pretrained from a class w/ head or w/o head so we extract the beheaded model version
-        beheaded_model = model.transformer if hasattr(model, "transformer") else model
-        cls.override_vision_model(beheaded_model, vision_model_name, vision_model_params, torch_dtype)
-        beheaded_model.freeze_relevant_params(config)
-
-
-GPT2_START_DOCSTRING = r"""
-    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
-    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
-    etc.)
-    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
-    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
-    and behavior.
-    Parameters:
-        config ([`VGPT2Config`]): Model configuration class with all the parameters of the model.
-            Initializing with a config file does not load the weights associated with the model, only the
-            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
-"""
-
-GPT2_INPUTS_DOCSTRING = r"""
-    Args:
-        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
-            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
-            `past_key_values[0][0].shape[-2]` (`sequence_length` of input past key value states). Indices of input
-            sequence tokens in the vocabulary.
-            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
-            `input_ids`.
-            Indices can be obtained using [`GPT2Tokenizer`]. See [`PreTrainedTokenizer.encode`] and
-            [`PreTrainedTokenizer.__call__`] for details.
-            [What are input IDs?](../glossary#input-ids)
-        past_key_values (`Tuple[Tuple[torch.Tensor]]` of length `config.n_layers`):
-            Contains precomputed hidden-states (key and values in the attention blocks) as computed by the model (see
-            `past_key_values` output below). Can be used to speed up sequential decoding. The `input_ids` which have
-            their past given to this model should not be passed as `input_ids` as they have already been computed.
-        attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
-            - 1 for tokens that are **not masked**,
-            - 0 for tokens that are **masked**.
-            If `past_key_values` is used, `attention_mask` needs to contain the masking strategy that was used for
-            `past_key_values`. In other words, the `attention_mask` always has to have the length:
-            `len(past_key_values) + len(input_ids)`
-            [What are attention masks?](../glossary#attention-mask)
-        token_type_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`, *optional*):
-            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
-            1]`:
-            - 0 corresponds to a *sentence A* token,
-            - 1 corresponds to a *sentence B* token.
-            [What are token type IDs?](../glossary#token-type-ids)
-        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
-            config.max_position_embeddings - 1]`.
-            [What are position IDs?](../glossary#position-ids)
-        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
-            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
-            - 1 indicates the head is **not masked**,
-            - 0 indicates the head is **masked**.
-        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
-            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
-            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
-            model's internal embedding lookup matrix.
-            If `past_key_values` is used, optionally only the last `inputs_embeds` have to be input (see
-            `past_key_values`).
-        use_cache (`bool`, *optional*):
-            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
-            `past_key_values`).
-        output_attentions (`bool`, *optional*):
-            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
-            tensors for more detail.
-        output_hidden_states (`bool`, *optional*):
-            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
-            more detail.
-        return_dict (`bool`, *optional*):
-            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
-"""
-PARALLELIZE_DOCSTRING = r"""
-    This is an experimental feature and is a subject to change at a moment's notice.
-    Uses a device map to distribute attention modules of the model across several devices. If no device map is given,
-    it will evenly distribute blocks across all devices.
-    Args:
-        device_map (`Dict[int, list]`, optional, defaults to None):
-            A dictionary that maps attention modules to devices. Note that the embedding module and LMHead are always
-            automatically mapped to the first device (for esoteric reasons). That means that the first device should
-            have fewer attention modules mapped to it than other devices. For reference, the gpt2 models have the
-            following number of attention modules:
-                - gpt2: 12
-                - gpt2-medium: 24
-                - gpt2-large: 36
-                - gpt2-xl: 48
-    Example:
-    ```python
-    # Here is an example of a device map on a machine with 4 GPUs using gpt2-xl, which has a total of 48 attention modules:
-    model = GPT2LMHeadModel.from_pretrained("gpt2-xl")
-    device_map = {
-        0: [0, 1, 2, 3, 4, 5, 6, 7, 8],
-        1: [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21],
-        2: [22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34],
-        3: [35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47],
-    }
-    model.parallelize(device_map)
-    ```
-"""
-DEPARALLELIZE_DOCSTRING = r"""
-    Moves the model to cpu from a model parallel state.
-    Example:
-    ```python
-    # On a 4 GPU machine with gpt2-large:
-    model = GPT2LMHeadModel.from_pretrained("gpt2-large")
-    device_map = {
-        0: [0, 1, 2, 3, 4, 5, 6, 7],
-        1: [8, 9, 10, 11, 12, 13, 14, 15],
-        2: [16, 17, 18, 19, 20, 21, 22, 23],
-        3: [24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35],
-    }
-    model.parallelize(device_map)  # Splits the model across several devices
-    model.deparallelize()  # Put the model back on cpu and cleans memory by calling torch.cuda.empty_cache()
-    ```
-"""
-
-
-@add_start_docstrings(
-    "The bare GPT2 Model transformer outputting raw hidden-states without any specific head on top.",
-    GPT2_START_DOCSTRING,
-)
-class VGPT2Model(VGPT2PreTrainedModel):
-    _keys_to_ignore_on_load_missing = ["attn.masked_bias"]
-
-    def __init__(self, config, vision_model=None):
-        super().__init__(config)
-
-        self.embed_dim = config.hidden_size
-        self.config = config
-
-        self.wte = DecoupledEmbedding(
-            num_embeddings=config.vocab_size,
-            num_additional_embeddings=config.additional_vocab_size,
-            embedding_dim=self.embed_dim,
-            partially_freeze=config.freeze_text_layers,
-        )
-        self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim)
-
-        self.drop = nn.Dropout(config.embd_pdrop)
-        self.h = nn.ModuleList([GPT2Block(config, layer_idx=i) for i in range(config.num_hidden_layers)])
-        self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
-
-        self.cross_layer_interval = config.cross_layer_interval
-        num_cross_layers = config.num_hidden_layers // self.cross_layer_interval
-        self.gated_cross_attn_layers = nn.ModuleList(
-            [VGPT2GatedCrossAttentionBlock(config, layer_idx=i) for i in range(num_cross_layers)]
-        )
-
-        # Perceiver Resampler
-        if config.use_resampler:
-            self.perceiver_resampler = PerceiverResampler(
-                self.config,
-                self.config.vision_embed_dim,
-                config.resampler_depth,
-                config.resampler_n_heads,
-                config.resampler_head_dim,
-                config.resampler_n_latents,
-            )
-        # Model parallel
-        self.model_parallel = False
-        self.device_map = None
-        self.gradient_checkpointing = False
-        # will be vocab_size because of indices starting from 0
-        self.image_token_idx = config.image_token_index
-
-        # Load an uninitialized model and later in from_pretrained will load the pre-trained model -
-        # this solves the losing of weights in `from_pretrained` on the main model
-        self.vision_model = vision_model
-
-        # Initialize weights and apply final processing
-        self.post_init()
-
-        self.freeze_relevant_params(config)
-
-    def freeze_relevant_params(self, config=None):
-        if config is None:
-            config = self.config
-
-        if config.freeze_text_layers:
-            self.freeze_text_layers()
-
-        if config.freeze_vision_layers:
-            freeze_model(self.vision_model)
-
-    def freeze_text_layers(self):
-        for module in [self.wpe, self.h, self.ln_f]:
-            freeze_model(module)
-
-    @add_start_docstrings(PARALLELIZE_DOCSTRING)
-    # TODO(aps): Implement later for VGPT2
-    def parallelize(self, device_map=None):
-        # Check validity of device_map
-        self.device_map = (
-            get_device_map(len(self.h), range(torch.cuda.device_count())) if device_map is None else device_map
-        )
-        assert_device_map(self.device_map, len(self.h))
-        self.model_parallel = True
-        self.first_device = "cpu" if "cpu" in self.device_map.keys() else "cuda:" + str(min(self.device_map.keys()))
-        self.last_device = "cuda:" + str(max(self.device_map.keys()))
-        self.wte = self.wte.to(self.first_device)
-        self.wpe = self.wpe.to(self.first_device)
-        # Load onto devices
-        for k, v in self.device_map.items():
-            for block in v:
-                cuda_device = "cuda:" + str(k)
-                self.h[block] = self.h[block].to(cuda_device)
-        # ln_f to last
-        self.ln_f = self.ln_f.to(self.last_device)
-
-    @add_start_docstrings(DEPARALLELIZE_DOCSTRING)
-    # TODO(aps): Implement later for VGPT2
-    def deparallelize(self):
-        self.model_parallel = False
-        self.device_map = None
-        self.first_device = "cpu"
-        self.last_device = "cpu"
-        self.wte = self.wte.to("cpu")
-        self.wpe = self.wpe.to("cpu")
-        for index in range(len(self.h)):
-            self.h[index] = self.h[index].to("cpu")
-        self.ln_f = self.ln_f.to("cpu")
-        torch.cuda.empty_cache()
-
-    def get_input_embeddings(self):
-        return self.wte
-
-    def set_input_embeddings(self, new_embeddings):
-        self.wte = new_embeddings
-
-    def _prune_heads(self, heads_to_prune):
-        """
-        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
-        """
-        for layer, heads in heads_to_prune.items():
-            self.h[layer].attn.prune_heads(heads)
-
-    @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING)
-    @add_code_sample_docstrings(
-        processor_class=_TOKENIZER_FOR_DOC,
-        checkpoint=_CHECKPOINT_FOR_DOC,
-        output_type=BaseModelOutputWithPastAndCrossAttentions,
-        config_class=_CONFIG_FOR_DOC,
-    )
-    def forward(
-        self,
-        input_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
-        attention_mask: Optional[torch.FloatTensor] = None,
-        token_type_ids: Optional[torch.LongTensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        head_mask: Optional[torch.FloatTensor] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        pixel_values: Optional[torch.FloatTensor] = None,
-        image_embeddings: Optional[torch.FloatTensor] = None,
-        image_attention_mask: Optional[torch.Tensor] = None,
-        crossblock_head_mask: Optional[torch.Tensor] = None,
-        use_cache: Optional[bool] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-    ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
-        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
-        output_hidden_states = (
-            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
-        )
-        use_cache = use_cache if use_cache is not None else self.config.use_cache
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
-        if input_ids is not None and inputs_embeds is not None:
-            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
-        elif input_ids is not None:
-            input_shape = input_ids.size()
-            input_ids = input_ids.view(-1, input_shape[-1])
-            batch_size = input_ids.shape[0]
-        elif inputs_embeds is not None:
-            input_shape = inputs_embeds.size()[:-1]
-            batch_size = inputs_embeds.shape[0]
-        else:
-            raise ValueError("You have to specify either input_ids or inputs_embeds")
-
-        device = input_ids.device if input_ids is not None else inputs_embeds.device
-
-        if token_type_ids is not None:
-            token_type_ids = token_type_ids.view(-1, input_shape[-1])
-        if position_ids is not None:
-            position_ids = position_ids.view(-1, input_shape[-1])
-
-        if past_key_values is None:
-            past_length = 0
-            past_key_values = tuple([None] * len(self.h))
-        else:
-            past_length = past_key_values[0][0].size(-2)
-        if position_ids is None:
-            position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device)
-            position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-1])
-
-        # GPT2Attention mask.
-        if attention_mask is not None:
-            if batch_size <= 0:
-                raise ValueError("batch_size has to be defined and > 0")
-            attention_mask = attention_mask.view(batch_size, -1)
-            # We create a 3D attention mask from a 2D tensor mask.
-            # Sizes are [batch_size, 1, 1, to_seq_length]
-            # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
-            # this attention mask is more simple than the triangular masking of causal attention
-            # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
-            attention_mask = attention_mask[:, None, None, :]
-
-            # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
-            # masked positions, this operation will create a tensor which is 0.0 for
-            # positions we want to attend and the dtype's smallest value for masked positions.
-            # Since we are adding it to the raw scores before the softmax, this is
-            # effectively the same as removing these entirely.
-            attention_mask = attention_mask.to(dtype=self.dtype)  # fp16 compatibility
-            attention_mask = (1.0 - attention_mask) * torch.finfo(self.dtype).min
-
-        # If a 2D or 3D attention mask is provided for the cross-attention
-        # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
-        if pixel_values is not None and image_embeddings is not None:
-            raise ValueError("You cannot specify both pixel_values and image_embeddings at the same time")
-        elif pixel_values is not None:
-            pixel_values = pixel_values.to(dtype=self.dtype, device=input_ids.device)  # fp16 compatibility
-            batch_size, num_images = pixel_values.size(0), pixel_values.size(1)
-            pixel_values = pixel_values.contiguous().view(batch_size * num_images, *pixel_values.shape[2:])
-            # Get sequence from the vision encoder
-            image_hidden_states = self.vision_model(pixel_values=pixel_values).last_hidden_state
-        elif image_embeddings is not None:
-            batch_size, num_images, image_seq_len, image_hidden_size = image_embeddings.size()
-            image_hidden_states = image_embeddings.to(dtype=self.dtype, device=input_ids.device)
-            image_hidden_states = image_hidden_states.view(batch_size * num_images, image_seq_len, image_hidden_size)
-
-        if self.config.use_resampler:
-            image_hidden_states = self.perceiver_resampler(image_hidden_states)
-        image_seq_len, image_hidden_size = image_hidden_states.size(1), image_hidden_states.size(2)
-        image_hidden_states = image_hidden_states.view(batch_size, num_images * image_seq_len, image_hidden_size)
-
-        # Make image_attention_mask compatible with hidden states
-        text_seq_len = image_attention_mask.size(1)
-        image_attention_mask = image_attention_mask.unsqueeze(-1)
-        image_attention_mask = image_attention_mask.repeat(1, 1, 1, image_seq_len)
-        image_attention_mask = image_attention_mask.view(batch_size, text_seq_len, num_images * image_seq_len)
-        if image_hidden_states is not None:
-            image_batch_size, image_sequence_length, _ = image_hidden_states.size()
-            image_hidden_shape = (image_batch_size, image_sequence_length)
-            if image_attention_mask is None:
-                image_attention_mask = torch.ones(image_hidden_shape, device=device)
-            image_attention_mask = self.invert_attention_mask(image_attention_mask)
-        else:
-            image_attention_mask = None
-
-        # Prepare head mask if needed
-        # 1.0 in head_mask indicate we keep the head
-        # attention_probs has shape bsz x n_heads x N x N
-        # head_mask has shape n_layer x batch x n_heads x N x N
-        head_mask = self.get_head_mask(head_mask, self.config.n_layer)
-
-        if inputs_embeds is None:
-            inputs_embeds = self.wte(input_ids)
-
-        position_embeds = self.wpe(position_ids)
-        hidden_states = inputs_embeds + position_embeds
-
-        if token_type_ids is not None:
-            token_type_embeds = self.wte(token_type_ids)
-            hidden_states = hidden_states + token_type_embeds
-
-        hidden_states = self.drop(hidden_states)
-
-        output_shape = input_shape + (hidden_states.size(-1),)
-
-        presents = () if use_cache else None
-        all_self_attentions = () if output_attentions else None
-        all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
-        all_hidden_states = () if output_hidden_states else None
-        for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)):
-            # Model parallel
-            if self.model_parallel:
-                torch.cuda.set_device(hidden_states.device)
-                # Ensure layer_past is on same device as hidden_states (might not be correct)
-                if layer_past is not None:
-                    layer_past = tuple(past_state.to(hidden_states.device) for past_state in layer_past)
-                # Ensure that attention_mask is always on the same device as hidden_states
-                if attention_mask is not None:
-                    attention_mask = attention_mask.to(hidden_states.device)
-                if isinstance(head_mask, torch.Tensor):
-                    head_mask = head_mask.to(hidden_states.device)
-            if output_hidden_states:
-                all_hidden_states = all_hidden_states + (hidden_states,)
-
-            def vblock(
-                main_block,
-                hidden_states,
-                layer_past,
-                attention_mask,
-                layer_head_mask,
-                use_cache,
-                output_attentions,
-                image_hidden_states,
-                image_attention_mask,
-                layer_idx,
-                cross_layer_interval,
-                gated_cross_attn_layers,
-            ):
-                # TODO(aps): Add cross attention values to respective lists
-                # TODO(aps): Add xblock head mask support
-                if layer_idx % cross_layer_interval == 0:
-                    xblock = gated_cross_attn_layers[layer_idx // cross_layer_interval]
-                    outputs = xblock(
-                        hidden_states,
-                        attention_mask=attention_mask,
-                        image_hidden_states=image_hidden_states,
-                        image_attention_mask=image_attention_mask,
-                        use_cache=use_cache,
-                        output_attentions=output_attentions,
-                    )
-                    hidden_states = outputs[0]
-
-                outputs = main_block(
-                    hidden_states,
-                    layer_past=layer_past,
-                    attention_mask=attention_mask,
-                    head_mask=layer_head_mask,
-                    use_cache=use_cache,
-                    output_attentions=output_attentions,
-                )
-
-                return outputs
-
-            if self.gradient_checkpointing and self.training:
-                layer_past = None
-                if use_cache:
-                    logger.warning_once(
-                        "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
-                    )
-                    use_cache = False
-
-                outputs = torch.utils.checkpoint.checkpoint(
-                    vblock,
-                    block,
-                    hidden_states,
-                    layer_past,
-                    attention_mask,
-                    head_mask[i],
-                    use_cache,
-                    output_attentions,
-                    image_hidden_states,
-                    image_attention_mask,
-                    i,
-                    self.cross_layer_interval,
-                    self.gated_cross_attn_layers,
-                )
-            else:
-                outputs = vblock(
-                    block,
-                    hidden_states,
-                    layer_past=layer_past,
-                    attention_mask=attention_mask,
-                    layer_head_mask=head_mask[i],
-                    use_cache=use_cache,
-                    output_attentions=output_attentions,
-                    image_hidden_states=image_hidden_states,
-                    image_attention_mask=image_attention_mask,
-                    layer_idx=i,
-                    cross_layer_interval=self.cross_layer_interval,
-                    gated_cross_attn_layers=self.gated_cross_attn_layers,
-                )
-
-            hidden_states = outputs[0]
-            if use_cache is True:
-                presents = presents + (outputs[1],)
-
-            if output_attentions:
-                all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],)
-                if self.config.add_cross_attention:
-                    all_cross_attentions = all_cross_attentions + (outputs[3 if use_cache else 2],)
-
-            # Model Parallel: If it's the last layer for that device, put things on the next device
-            if self.model_parallel:
-                for k, v in self.device_map.items():
-                    if i == v[-1] and "cuda:" + str(k) != self.last_device:
-                        hidden_states = hidden_states.to("cuda:" + str(k + 1))
-
-        hidden_states = self.ln_f(hidden_states)
-
-        hidden_states = hidden_states.view(output_shape)
-        # Add last hidden state
-        if output_hidden_states:
-            all_hidden_states = all_hidden_states + (hidden_states,)
-
-        if not return_dict:
-            return tuple(
-                v
-                for v in [hidden_states, presents, all_hidden_states, all_self_attentions, all_cross_attentions]
-                if v is not None
-            )
-
-        return BaseModelOutputWithPastAndCrossAttentions(
-            last_hidden_state=hidden_states,
-            past_key_values=presents,
-            hidden_states=all_hidden_states,
-            attentions=all_self_attentions,
-            cross_attentions=all_cross_attentions,
-        )
-
-
-@add_start_docstrings(
-    """
-    The GPT2 Model transformer with a language modeling head on top (linear layer with weights tied to the input
-    embeddings).
-    """,
-    GPT2_START_DOCSTRING,
-)
-class VGPT2LMHeadModel(VGPT2PreTrainedModel):
-    _keys_to_ignore_on_load_missing = [r"attn.masked_bias", r"attn.bias", r"lm_head.weight"]
-
-    def __init__(self, config, vision_model=None):
-        super().__init__(config)
-        self.transformer = VGPT2Model(config, vision_model=vision_model)
-        self.lm_head = DecoupledLinear(
-            in_features=config.n_embd,
-            out_features=config.vocab_size,
-            out_additional_features=config.additional_vocab_size,
-            bias=False,
-            partially_freeze=config.freeze_lm_head,
-        )
-
-        # Model parallel
-        self.model_parallel = False
-        self.device_map = None
-
-        # Initialize weights and apply final processing
-        self.post_init()
-
-    @add_start_docstrings(PARALLELIZE_DOCSTRING)
-    def parallelize(self, device_map=None):
-        self.device_map = (
-            get_device_map(len(self.transformer.h), range(torch.cuda.device_count()))
-            if device_map is None
-            else device_map
-        )
-        assert_device_map(self.device_map, len(self.transformer.h))
-        self.transformer.parallelize(self.device_map)
-        self.lm_head = self.lm_head.to(self.transformer.first_device)
-        self.model_parallel = True
-
-    @add_start_docstrings(DEPARALLELIZE_DOCSTRING)
-    def deparallelize(self):
-        self.transformer.deparallelize()
-        self.transformer = self.transformer.to("cpu")
-        self.lm_head = self.lm_head.to("cpu")
-        self.model_parallel = False
-        torch.cuda.empty_cache()
-
-    def get_output_embeddings(self):
-        return self.lm_head
-
-    def set_output_embeddings(self, new_embeddings):
-        self.lm_head = new_embeddings
-
-    def tie_weights(self):
-        """
-        Overwrite `transformers.modeling_utils.PreTrainedModel.tie_weights` to handle the case of DecoupledLinear and DecoupledEmbedding.
-        """
-        output_embeddings = self.get_output_embeddings()
-        input_embeddings = self.get_input_embeddings()
-
-        if getattr(self.config, "tie_word_embeddings", True):
-            output_embeddings.weight = input_embeddings.weight
-            if input_embeddings.num_additional_embeddings > 0:
-                assert output_embeddings.out_additional_features == input_embeddings.num_additional_embeddings
-                output_embeddings.additional_fc.weight = input_embeddings.additional_embedding.weight
-
-        if hasattr(output_embeddings, "out_features") and hasattr(input_embeddings, "num_embeddings"):
-            output_embeddings.out_features = input_embeddings.num_embeddings
-            if hasattr(output_embeddings, "out_additional_features") and hasattr(
-                input_embeddings, "num_additional_embeddings"
-            ):
-                output_embeddings.out_additional_features = input_embeddings.num_additional_embeddings
-
-    def prepare_inputs_for_generation(self, input_ids, past=None, **kwargs):
-        return prepare_inputs_for_generation(input_ids, past=past, **kwargs)
-
-    @staticmethod
-    def _expand_inputs_for_generation(
-        *args,
-        **model_kwargs,
-    ):
-        return expand_inputs_for_generation(*args, **model_kwargs)
-
-    @staticmethod
-    def _update_model_kwargs_for_generation(outputs, model_kwargs, is_encoder_decoder=False):
-        return update_model_kwargs_for_generation(outputs, model_kwargs, is_encoder_decoder)
-
-    @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING)
-    @add_code_sample_docstrings(
-        processor_class=_TOKENIZER_FOR_DOC,
-        checkpoint=_CHECKPOINT_FOR_DOC,
-        output_type=CausalLMOutputWithCrossAttentions,
-        config_class=_CONFIG_FOR_DOC,
-    )
-    def forward(
-        self,
-        input_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
-        attention_mask: Optional[torch.FloatTensor] = None,
-        token_type_ids: Optional[torch.LongTensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        head_mask: Optional[torch.FloatTensor] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        pixel_values: Optional[torch.FloatTensor] = None,
-        image_embeddings: Optional[torch.FloatTensor] = None,
-        image_attention_mask: Optional[torch.Tensor] = None,
-        crossblock_head_mask: Optional[torch.Tensor] = None,
-        labels: Optional[torch.LongTensor] = None,
-        use_cache: Optional[bool] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-    ) -> Union[Tuple, CausalLMOutputWithCrossAttentions]:
-        r"""
-        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
-            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
-            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
-        """
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
-        transformer_outputs = self.transformer(
-            input_ids,
-            past_key_values=past_key_values,
-            attention_mask=attention_mask,
-            token_type_ids=token_type_ids,
-            position_ids=position_ids,
-            head_mask=head_mask,
-            inputs_embeds=inputs_embeds,
-            pixel_values=pixel_values,
-            image_embeddings=image_embeddings,
-            image_attention_mask=image_attention_mask,
-            crossblock_head_mask=crossblock_head_mask,
-            use_cache=use_cache,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
-        )
-        hidden_states = transformer_outputs[0]
-
-        # Set device for model parallelism
-        if self.model_parallel:
-            torch.cuda.set_device(self.transformer.first_device)
-            hidden_states = hidden_states.to(self.lm_head.weight.device)
-
-        lm_logits = self.lm_head(hidden_states)
-
-        loss = None
-        if labels is not None:
-            # Shift so that tokens < n predict n
-            if attention_mask is not None:
-                shift_attention_mask = attention_mask[..., 1:]
-                shift_logits = lm_logits[..., :-1, :][shift_attention_mask != 0].contiguous()
-                shift_labels = labels[..., 1:][shift_attention_mask != 0].contiguous()
-            else:
-                shift_logits = lm_logits[..., :-1, :].contiguous()
-                shift_labels = labels[..., 1:].contiguous()
-            # Flatten the tokens
-            loss_fct = CrossEntropyLoss()
-            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
-
-        if not return_dict:
-            output = (lm_logits,) + transformer_outputs[1:]
-            return ((loss,) + output) if loss is not None else output
-
-        return CausalLMOutputWithCrossAttentions(
-            loss=loss,
-            logits=lm_logits,
-            past_key_values=transformer_outputs.past_key_values,
-            hidden_states=transformer_outputs.hidden_states,
-            attentions=transformer_outputs.attentions,
-            cross_attentions=transformer_outputs.cross_attentions,
-        )
-
-    @staticmethod
-    def _reorder_cache(past: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor) -> Tuple[Tuple[torch.Tensor]]:
-        """
-        This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or
-        [`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct
-        beam_idx at every generation step.
-        """
-        return tuple(
-            tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past)
-            for layer_past in past
-        )
-
-    def get_model_tflops_per_batch_per_gpu(self, hparams, data_param, tokenizer, max_num_images):
-        config_vl_model = self.config
-
-        language_embed_size = config_vl_model.n_embd
-        num_language_layers = config_vl_model.n_layer
-        ffn_inner_size = config_vl_model.n_inner
-
-        vision_config = self.transformer.vision_model.config
-        if hasattr(vision_config, "vision_config"):
-            vision_config = vision_config.vision_config
-
-        # Get vision model blocks infos
-        vision_patch_size = vision_config.patch_size
-        vision_hidden_size = vision_config.hidden_size
-        num_vision_layers = vision_config.num_hidden_layers
-        # The +1 is for the CLS token
-        single_image_seq_len = (vision_config.image_size // vision_patch_size) ** 2 + 1
-        vision_exp_factor = vision_config.intermediate_size // vision_hidden_size
-
-        # Get language and cross-att blocks infos
-        num_cross_attn_layers = num_language_layers // config_vl_model.cross_layer_interval
-        language_seq_len = data_param.max_seq_len
-        language_exp_factor = (ffn_inner_size // language_embed_size) if ffn_inner_size is not None else 4
-        cross_att_exp_factor = (ffn_inner_size // language_embed_size) if ffn_inner_size is not None else 4
-        k_v_cross_attn_seq_len = (
-            (self.config.resampler_n_latents * max_num_images)
-            if self.config.use_resampler
-            else (single_image_seq_len * max_num_images)
-        )
-
-        language_tflops_per_batch_per_gpu = compute_tflops_per_batch_per_gpu(
-            num_layers=num_language_layers,
-            batch_size=hparams.batch_size_per_gpu,
-            q_seq_len=language_seq_len,
-            k_seq_len=language_seq_len,
-            hidden_size=language_embed_size,
-            kv_in_dim=language_embed_size,
-            ff_exp_factor=language_exp_factor,
-            grad_acc_size=hparams.grad_acc_size,
-            swiglu=False,
-            vocab_size=tokenizer.vocab_size,
-            count_backward=True,  # Always True regardless of freezing, because gradients are computed for cross-attentions
-            use_grad_checkpointing=hparams.gradient_checkpointing,
-        )
-        cross_attention_tflops_per_batch_per_gpu = compute_tflops_per_batch_per_gpu(
-            num_layers=num_cross_attn_layers,
-            batch_size=hparams.batch_size_per_gpu,
-            q_seq_len=language_seq_len,
-            k_seq_len=k_v_cross_attn_seq_len,
-            hidden_size=language_embed_size,
-            kv_in_dim=vision_hidden_size,
-            ff_exp_factor=cross_att_exp_factor,
-            grad_acc_size=hparams.grad_acc_size,
-            swiglu=False,
-            vocab_size=None,
-            count_backward=True,
-            use_grad_checkpointing=hparams.gradient_checkpointing,
-        )
-        vision_tflops_per_batch_per_gpu = compute_tflops_per_batch_per_gpu(
-            num_layers=num_vision_layers,
-            batch_size=hparams.batch_size_per_gpu * max_num_images,
-            q_seq_len=single_image_seq_len,
-            k_seq_len=single_image_seq_len,
-            hidden_size=vision_hidden_size,
-            kv_in_dim=vision_hidden_size,
-            ff_exp_factor=vision_exp_factor,
-            grad_acc_size=hparams.grad_acc_size,
-            swiglu=False,
-            vocab_size=None,
-            count_backward=not hparams.model_params["freeze_vision_layers"],
-            use_grad_checkpointing=hparams.gradient_checkpointing,
-        )
-        if self.config.use_resampler:
-            perceiver_tflops_per_batch_per_gpu = compute_perceiver_tflops_per_batch_per_gpu(
-                num_layers=self.config.resampler_depth,
-                batch_size=hparams.batch_size_per_gpu * max_num_images,
-                q_seq_len=self.config.resampler_n_latents,
-                vision_embed_seq_len=single_image_seq_len,
-                q_k_v_input_dim=vision_hidden_size,
-                attention_hidden_size=self.config.resampler_n_heads * self.config.resampler_head_dim,
-                ff_exp_factor=cross_att_exp_factor,
-                count_backward=True,
-                use_grad_checkpointing=hparams.gradient_checkpointing,
-            )
-            flop_count = (
-                language_tflops_per_batch_per_gpu
-                + cross_attention_tflops_per_batch_per_gpu
-                + vision_tflops_per_batch_per_gpu
-                + perceiver_tflops_per_batch_per_gpu
-            )
-        else:
-            flop_count = (
-                language_tflops_per_batch_per_gpu
-                + cross_attention_tflops_per_batch_per_gpu
-                + vision_tflops_per_batch_per_gpu
-            )
-        return flop_count
diff --git a/m4/models/vgpt_neo/__init__.py b/m4/models/vgpt_neo/__init__.py
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/m4/models/vgpt_neo/configuration_vgpt_neo.py b/m4/models/vgpt_neo/configuration_vgpt_neo.py
deleted file mode 100644
index 14a92901ca755027ae3df488f3fda488d3ea0e45..0000000000000000000000000000000000000000
--- a/m4/models/vgpt_neo/configuration_vgpt_neo.py
+++ /dev/null
@@ -1,250 +0,0 @@
-# coding=utf-8
-# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" GPT Neo model configuration"""
-import os
-from typing import Tuple, Union
-
-from transformers import AutoConfig
-from transformers.configuration_utils import PretrainedConfig
-from transformers.utils import logging
-
-
-logger = logging.get_logger(__name__)
-
-GPT_NEO_PRETRAINED_CONFIG_ARCHIVE_MAP = {
-    "EleutherAI/gpt-neo-125M": "https://huggingface.co/EleutherAI/gpt-neo-125M/resolve/main/config.json",
-    "EleutherAI/gpt-neo-1.3B": "https://huggingface.co/EleutherAI/gpt-neo-1.3B/resolve/main/config.json",
-    # See all GPTNeo models at https://huggingface.co/models?filter=gpt_neo
-}
-
-
-class VGPTNeoConfig(PretrainedConfig):
-    r"""
-    This is the configuration class to store the configuration of a [`GPTNeoModel`]. It is used to instantiate a GPT
-    Neo model according to the specified arguments, defining the model architecture. Instantiating a configuration with
-    the defaults will yield a similar configuration to that of the GPTNeo
-    [EleutherAI/gpt-neo-1.3B](https://huggingface.co/EleutherAI/gpt-neo-1.3B) architecture.
-
-    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
-    documentation from [`PretrainedConfig`] for more information.
-
-    TODO: this doc is completely out of sync with the actual args
-
-    Args:
-        vocab_size (`int`, *optional*, defaults to 50257):
-            Vocabulary size of the GPT Neo model. Defines the number of different tokens that can be represented by the
-            `inputs_ids` passed when calling [`GPTNeoModel`]. Vocabulary size of the model. Defines the different
-            tokens that can be represented by the *inputs_ids* passed to the forward method of [`GPTNeoModel`].
-        additional_vocab_size (`int`, *optional`, defaults to 0):
-            Additional vocabulary size of the model, typically for the special "<img>" token. Additional vocab tokens
-            are always trainable whereas regular vocab tokens can be frozen or not.
-        attention_types (`List`, *optional*, defaults to `[[["global", "local"], 12]]`):
-            The type of attention for each layer in a `List` of the following format `[[["attention_type"],
-            num_layerss]]` e.g. for a 24 layer model `[[["global"], 24]]` or `[[["global", "local"], 12]]` Choose the
-            value of `attention_type` from `["global", "local"]`
-        hidden_size (`int`, *optional*, defaults to 2048):
-            Dimensionality of the encoder layers and the pooler layer.
-        num_layers (`int`, *optional*, defaults to 24):
-            Number of hidden layers in the Transformer encoder.
-        num_heads (`int`, *optional*, defaults to 16):
-            Number of attention heads for each attention layer in the Transformer encoder.
-        intermediate_size (`int`, *optional*, defaults to 8192):
-            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
-        activation_function (`str` or `function`, *optional*, defaults to `"gelu_new"`):
-            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
-            `"relu"`, `"selu"` and `"gelu_new"` are supported.
-        embed_dropout (`float`, *optional*, defaults to 0.0):
-            The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
-        attention_dropout (`float`, *optional*, defaults to 0.0):
-            The dropout ratio for the attention probabilities.
-        max_position_embeddings (`int`, *optional*, defaults to 2048):
-            The maximum sequence length that this model might ever be used with. Typically set this to something large
-            just in case (e.g., 512 or 1024 or 2048).
-        type_vocab_size (`int`, *optional*, defaults to 2):
-            The vocabulary size of the `token_type_ids` passed when calling [`GPTNeoModel`].
-        initializer_range (`float`, *optional*, defaults to 0.02):
-            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        alpha_initializer (`str`, *optional*, defaults to `"ones"`):
-            Initialization type for the alphas.
-        alphas_initializer_range (`float`, *optional*, defaults to 0.0):
-            The standard deviation of the truncated_normal_initializer for initializing the alphas in the Gated Cross Attention.
-        alpha_type (`str`, *optional*, defaults to `"vector"`):
-            Whether the gating alphas should be vectors or single floats.
-        layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
-            The epsilon used by the layer normalization layers.
-        use_cache (`bool`, *optional*, defaults to `True`):
-            Whether or not the model should return the last key/values attentions (not used by all models). Only
-            relevant if `config.is_decoder=True`.
-        cross_layer_interval (`int`, *optional*, default to 1)
-            Interval for cross attention (from text to image) layers.
-    Example:
-    ```python
-    >>> from transformers import GPTNeoConfig, GPTNeoModel
-    >>> # Initializing a GPTNeo EleutherAI/gpt-neo-1.3B style configuration
-    >>> configuration = GPTNeoConfig()
-    >>> # Initializing a model (with random weights) from the EleutherAI/gpt-neo-1.3B style configuration
-    >>> model = GPTNeoModel(configuration)
-    >>> # Accessing the model configuration
-    >>> configuration = model.config
-    ```"""
-    model_type = "vgpt_neo"
-    keys_to_ignore_at_inference = ["past_key_values"]
-    attribute_map = {"num_attention_heads": "num_heads", "num_hidden_layers": "num_layers"}
-
-    def __init__(
-        self,
-        vocab_size=50257,
-        additional_vocab_size=0,
-        max_position_embeddings=2048,
-        hidden_size=2048,
-        num_layers=24,
-        attention_types=[[["global", "local"], 12]],
-        num_heads=16,
-        intermediate_size=None,
-        window_size=256,
-        activation_function="gelu_new",
-        resid_dropout=0.0,
-        embed_dropout=0.0,
-        attention_dropout=0.0,
-        layer_norm_epsilon=1e-5,
-        initializer_range=0.02,
-        alpha_initializer="ones",
-        alphas_initializer_range=0.0,
-        alpha_type="vector",
-        summary_type="cls_index",
-        summary_use_proj=True,
-        summary_activation=None,
-        summary_proj_to_labels=True,
-        summary_first_dropout=0.1,
-        use_cache=True,
-        bos_token_id=50256,
-        eos_token_id=50256,
-        cross_layer_interval=1,
-        tie_word_embeddings=False,
-        freeze_text_layers=True,
-        freeze_lm_head=False,
-        freeze_vision_layers=True,
-        vision_model_name="google/vit-base-patch16-224",
-        vision_model_params="{}",
-        vision_embed_dim=768,
-        vision_image_size=224,
-        image_token_index=50257,
-        use_resampler=False,
-        resampler_n_latents=64,
-        resampler_depth=6,
-        resampler_n_heads=16,
-        resampler_head_dim=96,
-        **kwargs,
-    ):
-        self.vocab_size = vocab_size
-        self.additional_vocab_size = additional_vocab_size
-        self.max_position_embeddings = max_position_embeddings
-        self.hidden_size = hidden_size
-        self.num_layers = num_layers
-        self.num_heads = num_heads
-        self.intermediate_size = intermediate_size
-        self.window_size = window_size
-        self.activation_function = activation_function
-        self.resid_dropout = resid_dropout
-        self.embed_dropout = embed_dropout
-        self.attention_dropout = attention_dropout
-        self.layer_norm_epsilon = layer_norm_epsilon
-        self.initializer_range = initializer_range
-        self.alpha_initializer = alpha_initializer
-        self.alphas_initializer_range = alphas_initializer_range
-        self.alpha_type = alpha_type
-        self.summary_type = summary_type
-        self.summary_use_proj = summary_use_proj
-        self.summary_activation = summary_activation
-        self.summary_first_dropout = summary_first_dropout
-        self.summary_proj_to_labels = summary_proj_to_labels
-        self.use_cache = use_cache
-
-        self.bos_token_id = bos_token_id
-        self.eos_token_id = eos_token_id
-        self.cross_layer_interval = cross_layer_interval
-        self.freeze_vision_layers = freeze_vision_layers
-        self.vision_model_name = vision_model_name
-        self.vision_model_params = vision_model_params
-
-        self.tie_word_embeddings = tie_word_embeddings
-        self.freeze_text_layers = freeze_text_layers
-        self.freeze_lm_head = freeze_lm_head
-        self.image_token_index = image_token_index
-        self.attention_types = attention_types
-        self.attention_layers = self.expand_attention_types_params(attention_types)
-
-        self.vision_embed_dim = vision_embed_dim
-        self.vision_image_size = vision_image_size
-
-        # Resampler params
-        self.use_resampler = use_resampler
-        self.resampler_n_latents = resampler_n_latents
-        self.resampler_depth = resampler_depth
-        self.resampler_n_heads = resampler_n_heads
-        self.resampler_head_dim = resampler_head_dim
-
-        # IMPORTANT: Do not do any __init__ args-based checks in the constructor, since
-        # PretrainedConfig.from_dict first instantiates the class with the config dict and only then
-        # updates the config object with `kwargs` from from_pretrained, so during the instantiation
-        # of this object many attributes have default values and haven't yet been overridden.
-        # Do any required checks inside `from_pretrained` once the superclass' `from_pretrained` was run.
-
-        super().__init__(
-            bos_token_id=bos_token_id, eos_token_id=eos_token_id, tie_word_embeddings=tie_word_embeddings, **kwargs
-        )
-
-    def check_compatibilities(self):
-        if self.tie_word_embeddings and (self.freeze_text_layers != self.freeze_lm_head):
-            raise ValueError(
-                "if `tie_word_embeddings` is True, then `freeze_lm_head` and `freeze_text_layers` must be equal."
-            )
-
-        vision_model_params = eval(self.vision_model_params)
-        config = AutoConfig.from_pretrained(self.vision_model_name, **vision_model_params)
-        if hasattr(config, "vision_config"):
-            vision_config = config.vision_config
-        else:
-            vision_config = config
-        vision_embed_dim = vision_config.hidden_size
-        if self.vision_embed_dim != vision_embed_dim:
-            raise ValueError(
-                f"vision_embed_dim ({self.vision_embed_dim}) must match the hidden size of the vision model"
-                f" ({vision_embed_dim})"
-            )
-        vision_image_size = vision_config.image_size
-        if self.vision_image_size != vision_image_size:
-            raise ValueError(
-                f"vision_image_size ({self.vision_image_size}) must match the hidden size of the vision model"
-                f" ({vision_image_size})"
-            )
-
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
-        outputs = super(VGPTNeoConfig, cls).from_pretrained(pretrained_model_name_or_path, **kwargs)
-        if isinstance(outputs, Tuple):
-            # When called with return_unused_kwargs=True, the first item will be the config
-            outputs[0].check_compatibilities()
-        else:
-            outputs.check_compatibilities()
-        return outputs
-
-    @staticmethod
-    def expand_attention_types_params(attention_types):
-        attentions = []
-        for item in attention_types:
-            for _ in range(item[1]):
-                attentions.extend(item[0])
-        return attentions
diff --git a/m4/models/vgpt_neo/modeling_vgpt_neo.py b/m4/models/vgpt_neo/modeling_vgpt_neo.py
deleted file mode 100644
index a48c232b9e4716fa018a756ebe03851cb29c419e..0000000000000000000000000000000000000000
--- a/m4/models/vgpt_neo/modeling_vgpt_neo.py
+++ /dev/null
@@ -1,1182 +0,0 @@
-# coding=utf-8
-# Copyright 2021 The Eleuther AI and HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" PyTorch GPT Neo model."""
-
-
-import os
-from typing import Optional, Tuple, Union
-
-import torch
-import torch.utils.checkpoint
-from torch import nn
-from torch.nn import CrossEntropyLoss
-from transformers.activations import ACT2FN
-from transformers.modeling_outputs import BaseModelOutputWithPastAndCrossAttentions, CausalLMOutputWithCrossAttentions
-from transformers.utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward
-
-from m4.models import DecoupledEmbedding, DecoupledLinear
-from m4.models.common import (
-    expand_inputs_for_generation,
-    prepare_inputs_for_generation,
-    update_model_kwargs_for_generation,
-)
-from m4.models.custom_modules import VLOOMPreTrainedModelBase
-from m4.models.perceiver.perceiver import PerceiverResampler
-from m4.models.vgpt_neo.configuration_vgpt_neo import VGPTNeoConfig
-from m4.training.utils import (
-    compute_perceiver_tflops_per_batch_per_gpu,
-    compute_tflops_per_batch_per_gpu,
-    freeze_model,
-)
-from m4.utils import logging
-
-
-logger = logging.get_logger(__name__)
-
-_CHECKPOINT_FOR_DOC = "EleutherAI/gpt-neo-1.3B"
-_CONFIG_FOR_DOC = "VGPTNeoConfig"
-_TOKENIZER_FOR_DOC = "GPT2Tokenizer"
-
-GPT_NEO_PRETRAINED_MODEL_ARCHIVE_LIST = [
-    "EleutherAI/gpt-neo-125M",
-    "EleutherAI/gpt-neo-1.3B",
-    # See all GPTNeo models at https://huggingface.co/models?filter=gpt_neo
-]
-
-
-def load_tf_weights_in_gpt_neo(model, config, gpt_neo_checkpoint_path):
-    """Load tf checkpoints in a pytorch model"""
-    try:
-        import re
-
-        import tensorflow as tf
-    except ImportError:
-        logger.error(
-            "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
-            "https://www.tensorflow.org/install/ for installation instructions."
-        )
-        raise
-    tf_path = os.path.abspath(gpt_neo_checkpoint_path)
-    logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
-    # Load weights from TF model
-    init_vars = tf.train.list_variables(tf_path)
-    names = []
-    arrays = []
-    for name, shape in init_vars:
-        if "global_step" not in name and "adam" not in name:
-            array = tf.train.load_variable(tf_path, name)
-            array = tf.dtypes.cast(array.squeeze(), tf.float32).numpy()
-            name = name.replace("attn/q", "attn/attention/q_proj/w")
-            name = name.replace("attn/k", "attn/attention/k_proj/w")
-            name = name.replace("attn/v", "attn/attention/v_proj/w")
-            name = name.replace("attn/o", "attn/attention/out_proj/w")
-            name = name.replace("norm_1", "ln_1")
-            name = name.replace("norm_2", "ln_2")
-            name = name.replace("attn/compute_output_bias/o_b", "attn/attention/out_proj/b")
-            name = name.replace("conv1d_main/c_fc/kernel", "c_fc/w")
-            name = name.replace("conv1d_main/c_fc/bias", "c_fc/b")
-            name = name.replace("conv1d_main/c_proj/kernel", "c_proj/w")
-            name = name.replace("conv1d_main/c_proj/bias", "c_proj/b")
-
-            names.append(name)
-            arrays.append(array)
-
-    for name, array in zip(names, arrays):
-        name = name[5:]  # skip "gpt2/"
-        name = name.split("/")
-        pointer = model.transformer
-        for m_name in name:
-            if re.fullmatch(r"[A-Za-z]+\d+", m_name):
-                scope_names = re.split(r"(\d+)", m_name)
-            else:
-                scope_names = [m_name]
-            if scope_names[0] == "w" or scope_names[0] == "g":
-                pointer = getattr(pointer, "weight")
-            elif scope_names[0] == "b":
-                pointer = getattr(pointer, "bias")
-            elif scope_names[0] == "wpe" or scope_names[0] == "wte":
-                pointer = getattr(pointer, scope_names[0])
-                pointer = getattr(pointer, "weight")
-            else:
-                pointer = getattr(pointer, scope_names[0])
-            if len(scope_names) >= 2:
-                num = int(scope_names[1])
-                pointer = pointer[num]
-
-        if name[-1] == "w" and name[-2] in ["out_proj", "k_proj", "q_proj", "v_proj", "c_proj", "c_fc"]:
-            array = array.transpose()
-
-        if name == ["wte"]:
-            # if vocab is padded, then trim off the padding embeddings
-            array = array[: config.vocab_size]
-
-        if pointer.shape != array.shape:
-            raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched {name}")
-
-        print(f"Initialize PyTorch weight {name}")
-        pointer.data = torch.from_numpy(array)
-
-    # init the final linear layer using word embeddings
-    embs = model.transformer.wte.weight
-    lin = nn.Linear(embs.size()[1], embs.size()[0], bias=False)
-    lin.weight = embs
-    model.set_output_embeddings(lin)
-    return model
-
-
-class GPTNeoSelfAttention(nn.Module):
-    def __init__(self, config, attention_type, is_cross_attention=False):
-        super().__init__()
-
-        max_positions = config.max_position_embeddings
-        bias = torch.tril(torch.ones((max_positions, max_positions), dtype=torch.uint8)).view(
-            1, 1, max_positions, max_positions
-        )
-
-        # local causal self attention is a sliding window where each token can only attend to the previous
-        # window_size tokens. This is implemented by updating the causal mask such that for each token
-        # all other tokens are masked except the previous window_size tokens.
-        if attention_type == "local":
-            bias = torch.bitwise_xor(bias, torch.tril(bias, -config.window_size))
-        self.is_cross_attention = is_cross_attention
-        self.register_buffer("bias", bias)
-        self.register_buffer("masked_bias", torch.tensor(-1e9))
-
-        self.attn_dropout = nn.Dropout(float(config.attention_dropout))
-        self.resid_dropout = nn.Dropout(float(config.resid_dropout))
-
-        self.embed_dim = config.hidden_size
-        self.num_heads = config.num_heads
-        self.head_dim = self.embed_dim // self.num_heads
-        if self.head_dim * self.num_heads != self.embed_dim:
-            raise ValueError(
-                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
-                f" {self.num_heads})."
-            )
-        if self.is_cross_attention:
-            in_dim = self.embed_dim if not hasattr(config, "vision_embed_dim") else config.vision_embed_dim
-            self.k_proj = nn.Linear(in_dim, self.embed_dim, bias=False)
-            self.v_proj = nn.Linear(in_dim, self.embed_dim, bias=False)
-        else:
-            self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False)
-            self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False)
-        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False)
-        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=True)
-
-    def _split_heads(self, tensor, num_heads, attn_head_size):
-        """
-        Splits hidden_size dim into attn_head_size and num_heads
-        """
-        new_shape = tensor.size()[:-1] + (num_heads, attn_head_size)
-        tensor = tensor.view(new_shape)
-        return tensor.permute(0, 2, 1, 3)  # (batch, head, seq_length, head_features)
-
-    def _merge_heads(self, tensor, num_heads, attn_head_size):
-        """
-        Merges attn_head_size dim and num_attn_heads dim into hidden_size
-        """
-        tensor = tensor.permute(0, 2, 1, 3).contiguous()
-        new_shape = tensor.size()[:-2] + (num_heads * attn_head_size,)
-        return tensor.view(new_shape)
-
-    def _attn(self, query, key, value, attention_mask=None, head_mask=None):
-        # Keep the attention weights computation in fp32 to avoid overflow issues
-        query = query.to(torch.float32)
-        key = key.to(torch.float32)
-
-        attn_weights = torch.matmul(query, key.transpose(-1, -2))
-
-        if not self.is_cross_attention:
-            query_length, key_length = query.size(-2), key.size(-2)
-            causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length].to(torch.bool)
-            mask_value = torch.finfo(attn_weights.dtype).min
-            # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`.
-            # Need to be on the same device, otherwise `RuntimeError: ..., x and y to be on the same device`
-            mask_value = torch.tensor(mask_value, dtype=attn_weights.dtype).to(attn_weights.device)
-            attn_weights = torch.where(causal_mask, attn_weights, mask_value)
-
-        if attention_mask is not None:
-            # Apply the attention mask
-            attn_weights = attn_weights + attention_mask
-
-        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
-        attn_weights = attn_weights.to(value.dtype)
-        attn_weights = self.attn_dropout(attn_weights)
-
-        # Mask heads if we want to
-        if head_mask is not None:
-            attn_weights = attn_weights * head_mask
-
-        attn_output = torch.matmul(attn_weights, value)
-
-        return attn_output, attn_weights
-
-    def forward(
-        self,
-        hidden_states: Optional[Tuple[torch.FloatTensor]],
-        layer_past: Optional[Tuple[torch.Tensor]] = None,
-        attention_mask: Optional[torch.FloatTensor] = None,
-        head_mask: Optional[torch.FloatTensor] = None,
-        encoder_hidden_states: Optional[torch.Tensor] = None,
-        encoder_attention_mask: Optional[torch.FloatTensor] = None,
-        use_cache: Optional[bool] = False,
-        output_attentions: Optional[bool] = False,
-    ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]], ...]:
-        if encoder_hidden_states is not None:
-            key = self.k_proj(encoder_hidden_states)
-            value = self.v_proj(encoder_hidden_states)
-            attention_mask = encoder_attention_mask
-        else:
-            key = self.k_proj(hidden_states)
-            value = self.v_proj(hidden_states)
-        query = self.q_proj(hidden_states)
-
-        query = self._split_heads(query, self.num_heads, self.head_dim)
-        key = self._split_heads(key, self.num_heads, self.head_dim)
-        value = self._split_heads(value, self.num_heads, self.head_dim)
-
-        if layer_past is not None:
-            past_key = layer_past[0]
-            past_value = layer_past[1]
-            key = torch.cat((past_key, key), dim=-2)
-            value = torch.cat((past_value, value), dim=-2)
-
-        if use_cache is True:
-            present = (key, value)
-        else:
-            present = None
-
-        attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask)
-
-        attn_output = self._merge_heads(attn_output, self.num_heads, self.head_dim)
-        attn_output = self.out_proj(attn_output)
-        attn_output = self.resid_dropout(attn_output)
-
-        outputs = (attn_output, present)
-        if output_attentions:
-            outputs += (attn_weights,)
-
-        return outputs  # a, present, (attentions)
-
-
-class GPTNeoAttention(nn.Module):
-    def __init__(self, config, layer_id=0, is_cross_attention=False):
-        super().__init__()
-        self.layer_id = layer_id
-        self.attention_layers = config.attention_layers
-        self.attention_type = self.attention_layers[layer_id]
-        if self.attention_type in ["global", "local"]:
-            self.attention = GPTNeoSelfAttention(config, self.attention_type, is_cross_attention=is_cross_attention)
-        else:
-            raise NotImplementedError(
-                "Only attn layer types 'global' and 'local' exist, but got `config.attention_layers`: "
-                f"{config.attention_layers}. Select attn layer types from ['global', 'local'] only."
-            )
-
-    def forward(
-        self,
-        hidden_states: Optional[Tuple[torch.FloatTensor]],
-        layer_past: Optional[Tuple[torch.Tensor]] = None,
-        attention_mask: Optional[torch.FloatTensor] = None,
-        head_mask: Optional[torch.FloatTensor] = None,
-        encoder_hidden_states: Optional[torch.Tensor] = None,
-        encoder_attention_mask: Optional[torch.FloatTensor] = None,
-        use_cache: Optional[bool] = False,
-        output_attentions: Optional[bool] = False,
-    ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]], ...]:
-        return self.attention(
-            hidden_states,
-            attention_mask=attention_mask,
-            layer_past=layer_past,
-            head_mask=head_mask,
-            encoder_hidden_states=encoder_hidden_states,
-            encoder_attention_mask=encoder_attention_mask,
-            use_cache=use_cache,
-            output_attentions=output_attentions,
-        )
-
-
-class GPTNeoMLP(nn.Module):
-    def __init__(self, intermediate_size, config):  # in MLP: intermediate_size= 4 * hidden_size
-        super().__init__()
-        embed_dim = config.hidden_size
-        self.c_fc = nn.Linear(embed_dim, intermediate_size)
-        self.c_proj = nn.Linear(intermediate_size, embed_dim)
-        self.act = ACT2FN[config.activation_function]
-        self.dropout = nn.Dropout(float(config.resid_dropout))
-
-    def forward(self, hidden_states):
-        hidden_states = self.c_fc(hidden_states)
-        hidden_states = self.act(hidden_states)
-        hidden_states = self.c_proj(hidden_states)
-        hidden_states = self.dropout(hidden_states)
-        return hidden_states
-
-
-class GPTNeoBlock(nn.Module):
-    def __init__(self, config, layer_id):
-        super().__init__()
-        hidden_size = config.hidden_size
-        inner_dim = config.intermediate_size if config.intermediate_size is not None else 4 * hidden_size
-        self.ln_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
-        self.attn = GPTNeoAttention(config, layer_id, is_cross_attention=False)
-        self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
-
-        self.mlp = GPTNeoMLP(inner_dim, config)
-
-    def forward(
-        self,
-        hidden_states: Optional[Tuple[torch.FloatTensor]],
-        layer_past: Optional[Tuple[torch.Tensor]] = None,
-        attention_mask: Optional[torch.FloatTensor] = None,
-        head_mask: Optional[torch.FloatTensor] = None,
-        encoder_hidden_states: Optional[torch.Tensor] = None,
-        encoder_attention_mask: Optional[torch.FloatTensor] = None,
-        use_cache: Optional[bool] = False,
-        output_attentions: Optional[bool] = False,
-    ) -> Union[Tuple[torch.Tensor], Optional[Tuple[torch.Tensor, Tuple[torch.FloatTensor, ...]]]]:
-        residual = hidden_states
-        hidden_states = self.ln_1(hidden_states)
-        attn_outputs = self.attn(
-            hidden_states,
-            layer_past=layer_past,
-            attention_mask=attention_mask,
-            head_mask=head_mask,
-            use_cache=use_cache,
-            output_attentions=output_attentions,
-        )
-        attn_output = attn_outputs[0]  # output_attn: a, present, (attentions)
-        outputs = attn_outputs[1:]
-        # residual connection
-        hidden_states = attn_output + residual
-        residual = hidden_states
-        hidden_states = self.ln_2(hidden_states)
-        feed_forward_hidden_states = self.mlp(hidden_states)
-        # residual connection
-        hidden_states = residual + feed_forward_hidden_states
-
-        if use_cache:
-            outputs = (hidden_states,) + outputs
-        else:
-            outputs = (hidden_states,) + outputs[1:]
-
-        return outputs  # hidden_states, present, (attentions, cross_attentions)
-
-
-class VGPTNeoGatedCrossAttentionBlock(nn.Module):
-    def __init__(self, config, layer_id):
-        super().__init__()
-        hidden_size = config.hidden_size
-        inner_dim = config.intermediate_size if config.intermediate_size is not None else 4 * hidden_size
-
-        self.ln_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
-        self.cross_attn = GPTNeoAttention(config, layer_id, is_cross_attention=True)
-        self.mlp = GPTNeoMLP(inner_dim, config)
-        self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
-        self.act = nn.Tanh()
-
-        if config.alpha_initializer == "zeros":
-            if config.alpha_type == "vector":
-                self.alpha_cross_attn = nn.Parameter(torch.zeros(1, 1, hidden_size))
-                self.alpha_dense = nn.Parameter(torch.zeros(1, 1, hidden_size))
-            elif config.alpha_type == "float":
-                self.alpha_cross_attn = nn.Parameter(torch.zeros(1))
-                self.alpha_dense = nn.Parameter(torch.zeros(1))
-            else:
-                raise ValueError(f"Unknown value for `alpha_type` ({config.alpha_type})")
-
-        elif config.alpha_initializer == "ones":
-            if config.alpha_type == "vector":
-                self.alpha_cross_attn = nn.Parameter(torch.ones(1, 1, hidden_size))
-                self.alpha_dense = nn.Parameter(torch.ones(1, 1, hidden_size))
-            elif config.alpha_type == "float":
-                self.alpha_cross_attn = nn.Parameter(torch.ones(1))
-                self.alpha_dense = nn.Parameter(torch.ones(1))
-            else:
-                raise ValueError(f"Unknown value for `alpha_type` ({config.alpha_type})")
-
-        elif config.alpha_initializer in {"normal", "gaussian", "random"}:
-            if config.alpha_type == "vector":
-                self.alpha_cross_attn = nn.Parameter(
-                    torch.normal(mean=0.0, std=config.alphas_initializer_range, size=(1, 1, hidden_size))
-                )
-                self.alpha_dense = nn.Parameter(
-                    torch.normal(mean=0.0, std=config.alphas_initializer_range, size=(1, 1, hidden_size))
-                )
-            elif config.alpha_type == "float":
-                self.alpha_cross_attn = nn.Parameter(
-                    torch.normal(mean=0.0, std=config.alphas_initializer_range, size=(1))
-                )
-                self.alpha_dense = nn.Parameter(torch.normal(mean=0.0, std=config.alphas_initializer_range, size=(1)))
-            else:
-                raise ValueError(f"Unknown value for `alpha_type` ({config.alpha_type})")
-
-        else:
-            raise NotImplementedError(f"Alpha initialization scheme {config.alpha_initializer} not yet implemented!")
-
-    def forward(
-        self,
-        hidden_states: Optional[Tuple[torch.FloatTensor]],
-        layer_past: Optional[Tuple[torch.Tensor]] = None,
-        attention_mask: Optional[torch.FloatTensor] = None,
-        head_mask: Optional[torch.FloatTensor] = None,
-        image_hidden_states: Optional[torch.Tensor] = None,
-        image_attention_mask: Optional[torch.FloatTensor] = None,
-        use_cache: Optional[bool] = False,
-        output_attentions: Optional[bool] = False,
-    ) -> Union[Tuple[torch.Tensor], Optional[Tuple[torch.Tensor, Tuple[torch.FloatTensor, ...]]]]:
-        if image_hidden_states is None:
-            raise ValueError(
-                "`image_hidden_states` is required for VGPT2 cross attention module which are visual features to be"
-                " conditioned on."
-            )
-            # add one self-attention block for cross-attention
-
-        # TODO(aps): Handle cross attention in the outputs
-        # if not hasattr(self, "crossattention"):
-        #     raise ValueError(
-        #         f"If `image_hidden_states` are passed, {self} has to be instantiated with "
-        #         "cross-attention layers by setting `config.add_cross_attention=True`"
-        #     )
-        residual = hidden_states
-
-        hidden_states = self.ln_1(hidden_states)
-        cross_attn_outputs = self.cross_attn(
-            hidden_states,
-            attention_mask=attention_mask,
-            head_mask=head_mask,
-            encoder_hidden_states=image_hidden_states,
-            encoder_attention_mask=image_attention_mask,
-            output_attentions=output_attentions,
-        )
-        attn_output = cross_attn_outputs[0]
-        outputs = cross_attn_outputs[1:]
-        # residual connection
-        hidden_states = residual + self.act(self.alpha_cross_attn) * attn_output
-        outputs = outputs + cross_attn_outputs[2:]  # add cross attentions if we output attention weights
-
-        residual = hidden_states
-        hidden_states = self.ln_2(hidden_states)
-        feed_forward_hidden_states = self.mlp(hidden_states)
-        # residual connection
-        hidden_states = residual + self.act(self.alpha_dense) * feed_forward_hidden_states
-
-        if use_cache:
-            outputs = (hidden_states,) + outputs
-        else:
-            outputs = (hidden_states,) + outputs[1:]
-
-        return outputs
-
-
-class VGPTNeoPreTrainedModel(VLOOMPreTrainedModelBase):
-    """
-    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
-    models.
-    """
-
-    config_class = VGPTNeoConfig
-    load_tf_weights = load_tf_weights_in_gpt_neo
-    base_model_prefix = "transformer"
-    supports_gradient_checkpointing = True
-    _no_split_modules = ["GPTNeoBlock"]
-
-    def __init__(self, *inputs, **kwargs):
-        super().__init__(*inputs, **kwargs)
-
-    def _init_weights(self, module):
-        """Initialize the weights."""
-        if isinstance(module, (nn.Linear,)):
-            # Slightly different from the TF version which uses truncated_normal for initialization
-            # cf https://github.com/pytorch/pytorch/pull/5617
-            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
-            if module.bias is not None:
-                module.bias.data.zero_()
-        elif isinstance(module, nn.Embedding):
-            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
-            if module.padding_idx is not None:
-                module.weight.data[module.padding_idx].zero_()
-        elif isinstance(module, nn.LayerNorm):
-            module.bias.data.zero_()
-            module.weight.data.fill_(1.0)
-
-    def _set_gradient_checkpointing(self, module, value=False):
-        if isinstance(module, VGPTNeoModel):
-            module.gradient_checkpointing = value
-
-    @classmethod
-    def override_vision_model_wrapper(cls, model, config, vision_model_name, vision_model_params, torch_dtype):
-        # this can be called via from_pretrained from a class w/ head or w/o head so we extract the beheaded model version
-        beheaded_model = model.transformer if hasattr(model, "transformer") else model
-        cls.override_vision_model(beheaded_model, vision_model_name, vision_model_params, torch_dtype)
-        beheaded_model.freeze_relevant_params(config)
-
-
-GPT_NEO_START_DOCSTRING = r"""
-    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
-    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
-    etc.)
-    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
-    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
-    and behavior.
-    Parameters:
-        config ([`GPTNeoConfig`]): Model configuration class with all the parameters of the model.
-            Initializing with a config file does not load the weights associated with the model, only the
-            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
-"""
-
-GPT_NEO_INPUTS_DOCSTRING = r"""
-    Args:
-        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
-            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
-            `past_key_values[0][0].shape[-2]` (`sequence_length` of input past key value states). Indices of input
-            sequence tokens in the vocabulary.
-            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
-            `input_ids`.
-            Indices can be obtained using [`GPTNeoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
-            [`PreTrainedTokenizer.__call__`] for details.
-            [What are input IDs?](../glossary#input-ids)
-        past_key_values (`Tuple[Tuple[torch.Tensor]]` of length `config.num_layers`):
-            Contains precomputed hidden-states (key and values in the attention blocks) as computed by the model (see
-            `past_key_values` output below). Can be used to speed up sequential decoding. The `input_ids` which have
-            their past given to this model should not be passed as `input_ids` as they have already been computed.
-        attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
-            - 1 for tokens that are **not masked**,
-            - 0 for tokens that are **masked**.
-            [What are attention masks?](../glossary#attention-mask)
-        token_type_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`, *optional*):
-            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
-            1]`:
-            - 0 corresponds to a *sentence A* token,
-            - 1 corresponds to a *sentence B* token.
-            [What are token type IDs?](../glossary#token-type-ids)
-        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
-            config.max_position_embeddings - 1]`.
-            [What are position IDs?](../glossary#position-ids)
-        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
-            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
-            - 1 indicates the head is **not masked**,
-            - 0 indicates the head is **masked**.
-        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
-            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
-            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
-            model's internal embedding lookup matrix.
-            If `past_key_values` is used, optionally only the last `inputs_embeds` have to be input (see
-            `past_key_values`).
-        use_cache (`bool`, *optional*):
-            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
-            `past_key_values`).
-        output_attentions (`bool`, *optional*):
-            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
-            tensors for more detail.
-        output_hidden_states (`bool`, *optional*):
-            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
-            more detail.
-        return_dict (`bool`, *optional*):
-            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
-"""
-
-
-@add_start_docstrings(
-    "The bare GPT Neo Model transformer outputting raw hidden-states without any specific head on top.",
-    GPT_NEO_START_DOCSTRING,
-)
-class VGPTNeoModel(VGPTNeoPreTrainedModel):
-    def __init__(self, config, vision_model=None):
-        super().__init__(config)
-
-        self.embed_dim = config.hidden_size
-        self.wte = DecoupledEmbedding(
-            num_embeddings=config.vocab_size,
-            num_additional_embeddings=config.additional_vocab_size,
-            embedding_dim=self.embed_dim,
-            partially_freeze=config.freeze_text_layers,
-        )
-        self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim)
-        self.drop = nn.Dropout(float(config.embed_dropout))
-        self.h = nn.ModuleList([GPTNeoBlock(config, layer_id=i) for i in range(config.num_layers)])
-        self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
-
-        self.cross_layer_interval = config.cross_layer_interval
-        num_cross_layers = config.num_layers // self.cross_layer_interval
-        self.gated_cross_attn_layers = nn.ModuleList(
-            [VGPTNeoGatedCrossAttentionBlock(config, layer_id=i) for i in range(num_cross_layers)]
-        )
-
-        # Perceiver Resampler
-        if config.use_resampler:
-            self.perceiver_resampler = PerceiverResampler(
-                self.config,
-                self.config.vision_embed_dim,
-                config.resampler_depth,
-                config.resampler_n_heads,
-                config.resampler_head_dim,
-                config.resampler_n_latents,
-            )
-        self.gradient_checkpointing = False
-        self.image_token_idx = config.image_token_index
-
-        # Load an uninitialized model and later in from_pretrained will load the pre-trained model -
-        # this solves the losing of weights in `from_pretrained` on the main model
-        self.vision_model = vision_model
-
-        # Initialize weights and apply final processing
-        self.post_init()
-
-        self.freeze_relevant_params(config)
-
-    def freeze_relevant_params(self, config=None):
-        if config is None:
-            config = self.config
-
-        if config.freeze_text_layers:
-            self.freeze_text_layers()
-
-        if config.freeze_vision_layers:
-            freeze_model(self.vision_model)
-
-    def freeze_text_layers(self):
-        for module in [self.wpe, self.h, self.ln_f]:
-            freeze_model(module)
-
-    def get_input_embeddings(self):
-        return self.wte
-
-    def set_input_embeddings(self, new_embeddings):
-        self.wte = new_embeddings
-
-    @add_start_docstrings_to_model_forward(GPT_NEO_INPUTS_DOCSTRING)
-    @add_code_sample_docstrings(
-        processor_class=_TOKENIZER_FOR_DOC,
-        checkpoint=_CHECKPOINT_FOR_DOC,
-        output_type=BaseModelOutputWithPastAndCrossAttentions,
-        config_class=_CONFIG_FOR_DOC,
-    )
-    def forward(
-        self,
-        input_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
-        attention_mask: Optional[torch.FloatTensor] = None,
-        token_type_ids: Optional[torch.LongTensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        head_mask: Optional[torch.FloatTensor] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        pixel_values: Optional[torch.FloatTensor] = None,
-        image_embeddings: Optional[torch.FloatTensor] = None,
-        image_attention_mask: Optional[torch.Tensor] = None,
-        crossblock_head_mask: Optional[torch.Tensor] = None,
-        use_cache: Optional[bool] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-    ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
-        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
-        output_hidden_states = (
-            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
-        )
-        use_cache = use_cache if use_cache is not None else self.config.use_cache
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
-        if input_ids is not None and inputs_embeds is not None:
-            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
-        elif input_ids is not None:
-            input_shape = input_ids.size()
-            input_ids = input_ids.view(-1, input_shape[-1])
-            batch_size = input_ids.shape[0]
-        elif inputs_embeds is not None:
-            input_shape = inputs_embeds.size()[:-1]
-            batch_size = inputs_embeds.shape[0]
-        else:
-            raise ValueError("You have to specify either input_ids or inputs_embeds")
-
-        device = input_ids.device if input_ids is not None else inputs_embeds.device
-
-        if token_type_ids is not None:
-            token_type_ids = token_type_ids.view(-1, input_shape[-1])
-        if position_ids is not None:
-            position_ids = position_ids.view(-1, input_shape[-1])
-
-        if past_key_values is None:
-            past_length = 0
-            past_key_values = tuple([None] * len(self.h))
-        else:
-            past_length = past_key_values[0][0].size(-2)
-        if position_ids is None:
-            position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device)
-            position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-1])
-
-        # GPT2Attention mask.
-        if attention_mask is not None:
-            if batch_size <= 0:
-                raise ValueError("batch_size has to be defined and > 0")
-            attention_mask = attention_mask.view(batch_size, -1)
-            # We create a 3D attention mask from a 2D tensor mask.
-            # Sizes are [batch_size, 1, 1, to_seq_length]
-            # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
-            # this attention mask is more simple than the triangular masking of causal attention
-            # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
-            attention_mask = attention_mask[:, None, None, :]
-
-            # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
-            # masked positions, this operation will create a tensor which is 0.0 for
-            # positions we want to attend and the dtype's smallest value for masked positions.
-            # Since we are adding it to the raw scores before the softmax, this is
-            # effectively the same as removing these entirely.
-            attention_mask = attention_mask.to(dtype=self.dtype)  # fp16 compatibility
-            attention_mask = (1.0 - attention_mask) * torch.finfo(self.dtype).min
-
-        # If a 2D or 3D attention mask is provided for the cross-attention
-        # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
-        if pixel_values is not None and image_embeddings is not None:
-            raise ValueError("You cannot specify both pixel_values and image_embeddings at the same time")
-        elif pixel_values is not None:
-            pixel_values = pixel_values.to(dtype=self.dtype, device=input_ids.device)  # fp16 compatibility
-            batch_size, num_images = pixel_values.size(0), pixel_values.size(1)
-            pixel_values = pixel_values.contiguous().view(batch_size * num_images, *pixel_values.shape[2:])
-            # Get sequence from the vision encoder
-            image_hidden_states = self.vision_model(pixel_values=pixel_values).last_hidden_state
-        elif image_embeddings is not None:
-            batch_size, num_images, image_seq_len, image_hidden_size = image_embeddings.size()
-            image_hidden_states = image_embeddings.to(dtype=self.dtype, device=input_ids.device)
-            image_hidden_states = image_hidden_states.view(batch_size * num_images, image_seq_len, image_hidden_size)
-
-        if self.config.use_resampler:
-            image_hidden_states = self.perceiver_resampler(image_hidden_states)
-        image_seq_len, image_hidden_size = image_hidden_states.size(1), image_hidden_states.size(2)
-        image_hidden_states = image_hidden_states.view(batch_size, num_images * image_seq_len, image_hidden_size)
-        # Make image_attention_mask compatible with hidden states
-        text_seq_len = image_attention_mask.size(1)
-        image_attention_mask = image_attention_mask.unsqueeze(-1)
-        image_attention_mask = image_attention_mask.repeat(1, 1, 1, image_seq_len)
-        image_attention_mask = image_attention_mask.view(batch_size, text_seq_len, num_images * image_seq_len)
-
-        if image_hidden_states is not None:
-            image_batch_size, image_sequence_length, _ = image_hidden_states.size()
-            image_hidden_shape = (image_batch_size, image_sequence_length)
-            if image_attention_mask is None:
-                image_attention_mask = torch.ones(image_hidden_shape, device=device)
-            # image_attention_mask = self.invert_attention_mask(image_attention_mask)
-            image_attention_mask = image_attention_mask.to(torch.bool)
-            image_attention_mask = image_attention_mask[:, None, :, :]
-        else:
-            image_attention_mask = None
-
-        # Prepare head mask if needed
-        # 1.0 in head_mask indicate we keep the head
-        # attention_probs has shape bsz x n_heads x N x N
-        # head_mask has shape n_layer x batch x n_heads x N x N
-        head_mask = self.get_head_mask(head_mask, self.config.num_layers)
-
-        if inputs_embeds is None:
-            inputs_embeds = self.wte(input_ids)
-
-        position_embeds = self.wpe(position_ids)
-        hidden_states = inputs_embeds + position_embeds
-
-        if token_type_ids is not None:
-            token_type_embeds = self.wte(token_type_ids)
-            hidden_states = hidden_states + token_type_embeds
-
-        hidden_states = self.drop(hidden_states)
-
-        output_shape = input_shape + (hidden_states.size(-1),)
-
-        presents = () if use_cache else None
-        all_self_attentions = () if output_attentions else None
-        all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
-        all_hidden_states = () if output_hidden_states else None
-        for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)):
-            if output_hidden_states:
-                all_hidden_states = all_hidden_states + (hidden_states,)
-
-            def vblock(
-                main_block,
-                hidden_states,
-                layer_past,
-                attention_mask,
-                layer_head_mask,
-                use_cache,
-                output_attentions,
-                image_hidden_states,
-                image_attention_mask,
-                layer_idx,
-                cross_layer_interval,
-                gated_cross_attn_layers,
-            ):
-                # TODO(aps): Add cross attention values to respective lists
-                # TODO(aps): Add xblock head mask support
-                if layer_idx % cross_layer_interval == 0:
-                    xblock = gated_cross_attn_layers[layer_idx // cross_layer_interval]
-                    outputs = xblock(
-                        hidden_states,
-                        attention_mask=attention_mask,
-                        image_hidden_states=image_hidden_states,
-                        image_attention_mask=image_attention_mask,
-                        use_cache=use_cache,
-                        output_attentions=output_attentions,
-                    )
-                    hidden_states = outputs[0]
-
-                outputs = main_block(
-                    hidden_states,
-                    layer_past=layer_past,
-                    attention_mask=attention_mask,
-                    head_mask=layer_head_mask,
-                    use_cache=use_cache,
-                    output_attentions=output_attentions,
-                )
-
-                return outputs
-
-            if self.gradient_checkpointing and self.training:
-                if use_cache:
-                    logger.warning_once(
-                        "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
-                    )
-                    use_cache = False
-
-                outputs = torch.utils.checkpoint.checkpoint(
-                    vblock,
-                    block,
-                    hidden_states,
-                    layer_past,
-                    attention_mask,
-                    head_mask[i],
-                    use_cache,
-                    output_attentions,
-                    image_hidden_states,
-                    image_attention_mask,
-                    i,
-                    self.cross_layer_interval,
-                    self.gated_cross_attn_layers,
-                )
-            else:
-                outputs = vblock(
-                    block,
-                    hidden_states,
-                    layer_past=layer_past,
-                    attention_mask=attention_mask,
-                    layer_head_mask=head_mask[i],
-                    use_cache=use_cache,
-                    output_attentions=output_attentions,
-                    layer_idx=i,
-                    image_hidden_states=image_hidden_states,
-                    image_attention_mask=image_attention_mask,
-                    cross_layer_interval=self.cross_layer_interval,
-                    gated_cross_attn_layers=self.gated_cross_attn_layers,
-                )
-
-            hidden_states = outputs[0]
-            if use_cache is True:
-                presents = presents + (outputs[1],)
-
-            if output_attentions:
-                all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],)
-                if self.config.add_cross_attention:
-                    all_cross_attentions = all_cross_attentions + (outputs[3 if use_cache else 2],)
-
-        hidden_states = self.ln_f(hidden_states)
-
-        hidden_states = hidden_states.view(output_shape)
-        # Add last hidden state
-        if output_hidden_states:
-            all_hidden_states = all_hidden_states + (hidden_states,)
-
-        if not return_dict:
-            return tuple(
-                v
-                for v in [hidden_states, presents, all_hidden_states, all_self_attentions, all_cross_attentions]
-                if v is not None
-            )
-
-        return BaseModelOutputWithPastAndCrossAttentions(
-            last_hidden_state=hidden_states,
-            past_key_values=presents,
-            hidden_states=all_hidden_states,
-            attentions=all_self_attentions,
-            cross_attentions=all_cross_attentions,
-        )
-
-
-@add_start_docstrings(
-    """
-    The GPT Neo Model transformer with a language modeling head on top (linear layer with weights tied to the input
-    embeddings).
-    """,
-    GPT_NEO_START_DOCSTRING,
-)
-class VGPTNeoForCausalLM(VGPTNeoPreTrainedModel):
-    _keys_to_ignore_on_load_missing = [
-        r"h\.\d+\.attn\.masked_bias",
-        r"lm_head.weight",
-        r"h\.\d+\.attn\.attention\.bias",
-    ]
-    _keys_to_ignore_on_save = [r"lm_head.weight"]
-
-    def __init__(self, config, vision_model=None):
-        super().__init__(config)
-        self.transformer = VGPTNeoModel(config, vision_model=vision_model)
-        self.lm_head = DecoupledLinear(
-            in_features=config.hidden_size,
-            out_features=config.vocab_size,
-            out_additional_features=config.additional_vocab_size,
-            bias=False,
-            partially_freeze=config.freeze_lm_head,
-        )
-
-        # Initialize weights and apply final processing
-        self.post_init()
-
-    def get_output_embeddings(self):
-        return self.lm_head
-
-    def set_output_embeddings(self, new_embeddings):
-        self.lm_head = new_embeddings
-
-    def tie_weights(self):
-        """
-        Overwrite `transformers.modeling_utils.PreTrainedModel.tie_weights` to handle the case of DecoupledLinear and DecoupledEmbedding.
-        """
-        output_embeddings = self.get_output_embeddings()
-        input_embeddings = self.get_input_embeddings()
-
-        if getattr(self.config, "tie_word_embeddings", True):
-            output_embeddings.weight = input_embeddings.weight
-            if input_embeddings.num_additional_embeddings > 0:
-                assert output_embeddings.out_additional_features == input_embeddings.num_additional_embeddings
-                output_embeddings.additional_fc.weight = input_embeddings.additional_embedding.weight
-
-        if hasattr(output_embeddings, "out_features") and hasattr(input_embeddings, "num_embeddings"):
-            output_embeddings.out_features = input_embeddings.num_embeddings
-            if hasattr(output_embeddings, "out_additional_features") and hasattr(
-                input_embeddings, "num_additional_embeddings"
-            ):
-                output_embeddings.out_additional_features = input_embeddings.num_additional_embeddings
-
-    def prepare_inputs_for_generation(self, input_ids, past=None, **kwargs):
-        return prepare_inputs_for_generation(input_ids, past=past, **kwargs)
-
-    @staticmethod
-    def _expand_inputs_for_generation(
-        *args,
-        **model_kwargs,
-    ):
-        return expand_inputs_for_generation(*args, **model_kwargs)
-
-    @staticmethod
-    def _update_model_kwargs_for_generation(outputs, model_kwargs, is_encoder_decoder=False):
-        return update_model_kwargs_for_generation(outputs, model_kwargs, is_encoder_decoder=is_encoder_decoder)
-
-    @add_start_docstrings_to_model_forward(GPT_NEO_INPUTS_DOCSTRING)
-    @add_code_sample_docstrings(
-        processor_class=_TOKENIZER_FOR_DOC,
-        checkpoint=_CHECKPOINT_FOR_DOC,
-        output_type=CausalLMOutputWithCrossAttentions,
-        config_class=_CONFIG_FOR_DOC,
-    )
-    def forward(
-        self,
-        input_ids: Optional[torch.Tensor] = None,
-        past_key_values: Optional[Tuple[torch.FloatTensor]] = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        token_type_ids: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.Tensor] = None,
-        head_mask: Optional[torch.Tensor] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-        pixel_values: Optional[torch.FloatTensor] = None,
-        image_embeddings: Optional[torch.FloatTensor] = None,
-        image_attention_mask: Optional[torch.Tensor] = None,
-        crossblock_head_mask: Optional[torch.Tensor] = None,
-        labels: Optional[torch.Tensor] = None,
-        use_cache: Optional[bool] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-    ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]:
-        r"""
-        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
-            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
-            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
-        """
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
-        transformer_outputs = self.transformer(
-            input_ids,
-            past_key_values=past_key_values,
-            attention_mask=attention_mask,
-            token_type_ids=token_type_ids,
-            position_ids=position_ids,
-            head_mask=head_mask,
-            inputs_embeds=inputs_embeds,
-            pixel_values=pixel_values,
-            image_embeddings=image_embeddings,
-            image_attention_mask=image_attention_mask,
-            crossblock_head_mask=crossblock_head_mask,
-            use_cache=use_cache,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
-        )
-        hidden_states = transformer_outputs[0]
-
-        lm_logits = self.lm_head(hidden_states)
-
-        loss = None
-        if labels is not None:
-            # Compute loss in fp32 to match with mesh-tf version
-            # https://github.com/EleutherAI/gpt-neo/blob/89ce74164da2fb16179106f54e2269b5da8db333/models/gpt2/gpt2.py#L179
-            lm_logits = lm_logits.to(torch.float32)
-
-            # Shift so that tokens < n predict n
-            if attention_mask is not None:
-                shift_attention_mask = attention_mask[..., 1:]
-                shift_logits = lm_logits[..., :-1, :][shift_attention_mask != 0].contiguous()
-                shift_labels = labels[..., 1:][shift_attention_mask != 0].contiguous()
-            else:
-                shift_logits = lm_logits[..., :-1, :].contiguous()
-                shift_labels = labels[..., 1:].contiguous()
-            # Flatten the tokens
-            loss_fct = CrossEntropyLoss()
-            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
-
-            lm_logits = lm_logits.to(hidden_states.dtype)
-            loss = loss.to(hidden_states.dtype)
-
-        if not return_dict:
-            output = (lm_logits,) + transformer_outputs[1:]
-            return ((loss,) + output) if loss is not None else output
-
-        return CausalLMOutputWithCrossAttentions(
-            loss=loss,
-            logits=lm_logits,
-            past_key_values=transformer_outputs.past_key_values,
-            hidden_states=transformer_outputs.hidden_states,
-            attentions=transformer_outputs.attentions,
-            cross_attentions=transformer_outputs.cross_attentions,
-        )
-
-    @staticmethod
-    def _reorder_cache(past: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor) -> Tuple[Tuple[torch.Tensor]]:
-        """
-        This function is used to re-order the `past_key_values` cache if [`~PretrainedModel.beam_search`] or
-        [`~PretrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct
-        beam_idx at every generation step.
-        """
-        return tuple(
-            tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past)
-            for layer_past in past
-        )
-
-    def get_model_tflops_per_batch_per_gpu(self, hparams, data_param, tokenizer, max_num_images):
-        config_vl_model = self.config
-
-        language_embed_size = config_vl_model.hidden_size
-        vision_config = self.transformer.vision_model.config
-        num_language_layers = config_vl_model.num_layers
-        ffn_inner_size = (
-            config_vl_model.intermediate_size
-            if config_vl_model.intermediate_size is not None
-            else 4 * config_vl_model.hidden_size
-        )
-
-        # Get vision model blocks infos
-        vision_patch_size = vision_config.patch_size
-        vision_hidden_size = vision_config.hidden_size
-        num_vision_layers = vision_config.num_hidden_layers
-        # The +1 is for the CLS token
-        single_image_seq_len = (vision_config.image_size // vision_patch_size) ** 2 + 1
-        vision_exp_factor = vision_config.intermediate_size // vision_hidden_size
-
-        # Get language and cross-att blocks infos
-        num_cross_attn_layers = num_language_layers // config_vl_model.cross_layer_interval
-        language_seq_len = data_param.max_seq_len
-        language_exp_factor = (ffn_inner_size // language_embed_size) if ffn_inner_size is not None else 4
-        cross_att_exp_factor = (ffn_inner_size // language_embed_size) if ffn_inner_size is not None else 4
-        k_v_cross_attn_seq_len = (
-            (self.config.resampler_n_latents * max_num_images)
-            if self.config.use_resampler
-            else (single_image_seq_len * max_num_images)
-        )
-
-        language_tflops_per_batch_per_gpu = compute_tflops_per_batch_per_gpu(
-            num_layers=num_language_layers,
-            batch_size=hparams.batch_size_per_gpu,
-            q_seq_len=language_seq_len,
-            k_seq_len=language_seq_len,
-            hidden_size=language_embed_size,
-            kv_in_dim=language_embed_size,
-            ff_exp_factor=language_exp_factor,
-            grad_acc_size=hparams.grad_acc_size,
-            swiglu=False,
-            vocab_size=tokenizer.vocab_size,
-            count_backward=True,  # Always True regardless of freezing, because gradients are computed for cross-attentions
-            use_grad_checkpointing=hparams.gradient_checkpointing,
-        )
-        cross_attention_tflops_per_batch_per_gpu = compute_tflops_per_batch_per_gpu(
-            num_layers=num_cross_attn_layers,
-            batch_size=hparams.batch_size_per_gpu,
-            q_seq_len=language_seq_len,
-            k_seq_len=k_v_cross_attn_seq_len,
-            hidden_size=language_embed_size,
-            kv_in_dim=vision_hidden_size,
-            ff_exp_factor=cross_att_exp_factor,
-            grad_acc_size=hparams.grad_acc_size,
-            swiglu=False,
-            vocab_size=None,
-            count_backward=True,
-            use_grad_checkpointing=hparams.gradient_checkpointing,
-        )
-        vision_tflops_per_batch_per_gpu = compute_tflops_per_batch_per_gpu(
-            num_layers=num_vision_layers,
-            batch_size=hparams.batch_size_per_gpu * max_num_images,
-            q_seq_len=single_image_seq_len,
-            k_seq_len=single_image_seq_len,
-            hidden_size=vision_hidden_size,
-            kv_in_dim=vision_hidden_size,
-            ff_exp_factor=vision_exp_factor,
-            grad_acc_size=hparams.grad_acc_size,
-            swiglu=False,
-            vocab_size=None,
-            count_backward=not hparams.model_params["freeze_vision_layers"],
-            use_grad_checkpointing=hparams.gradient_checkpointing,
-        )
-        if self.config.use_resampler:
-            perceiver_tflops_per_batch_per_gpu = compute_perceiver_tflops_per_batch_per_gpu(
-                num_layers=self.config.resampler_depth,
-                batch_size=hparams.batch_size_per_gpu * max_num_images,
-                q_seq_len=self.config.resampler_n_latents,
-                vision_embed_seq_len=single_image_seq_len,
-                q_k_v_input_dim=vision_hidden_size,
-                attention_hidden_size=self.config.resampler_n_heads * self.config.resampler_head_dim,
-                ff_exp_factor=cross_att_exp_factor,
-                count_backward=True,
-                use_grad_checkpointing=hparams.gradient_checkpointing,
-            )
-            flop_count = (
-                language_tflops_per_batch_per_gpu
-                + cross_attention_tflops_per_batch_per_gpu
-                + vision_tflops_per_batch_per_gpu
-                + perceiver_tflops_per_batch_per_gpu
-            )
-        else:
-            flop_count = (
-                language_tflops_per_batch_per_gpu
-                + cross_attention_tflops_per_batch_per_gpu
-                + vision_tflops_per_batch_per_gpu
-            )
-        return flop_count
diff --git a/m4/models/vllama/configuration_vllama.py b/m4/models/vllama/configuration_vllama.py
deleted file mode 100644
index ee358f4176a017ca05106c49aa55d8bfebe73e12..0000000000000000000000000000000000000000
--- a/m4/models/vllama/configuration_vllama.py
+++ /dev/null
@@ -1,204 +0,0 @@
-# coding=utf-8
-# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
-#
-# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
-# and OPT implementations in this library. It has been modified from its
-# original forms to accommodate minor architectural differences compared
-# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" LLaMA model configuration"""
-import os
-from typing import Tuple, Union
-
-from transformers import AutoConfig
-from transformers.configuration_utils import PretrainedConfig
-from transformers.utils import logging
-
-
-logger = logging.get_logger(__name__)
-
-LLAMA_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
-
-
-class VLlamaConfig(PretrainedConfig):
-    r"""
-    TODO: update docstring with respect to new arguments
-
-    This is the configuration class to store the configuration of a [`~LlamaModel`]. It is used to instantiate an LLaMA
-    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
-    defaults will yield a similar configuration to that of the LLaMA-7B.
-
-    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
-    documentation from [`PretrainedConfig`] for more information.
-
-
-    Args:
-        vocab_size (`int`, *optional*, defaults to 32000):
-            Vocabulary size of the LLaMA model. Defines the number of different tokens that can be represented by the
-            `inputs_ids` passed when calling [`~LlamaModel`]
-        hidden_size (`int`, *optional*, defaults to 4096):
-            Dimension of the hidden representations.
-        intermediate_size (`int`, *optional*, defaults to 11008):
-            Dimension of the MLP representations.
-        num_hidden_layers (`int`, *optional*, defaults to 32):
-            Number of hidden layers in the Transformer encoder.
-        num_attention_heads (`int`, *optional*, defaults to 32):
-            Number of attention heads for each attention layer in the Transformer encoder.
-        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
-            The non-linear activation function (function or string) in the decoder.
-        initializer_range (`float`, *optional*, defaults to 0.02):
-            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        rms_norm_eps (`float`, *optional*, defaults to 1e-12):
-            The epsilon used by the rms normalization layers.
-        use_cache (`bool`, *optional*, defaults to `True`):
-            Whether or not the model should return the last key/values attentions (not used by all models). Only
-            relevant if `config.is_decoder=True`.
-        tie_word_embeddings(`bool`, *optional*, defaults to `False`):
-            Whether to tie weight embeddings
-        Example:
-
-    ```python
-    >>> from transformers import LlamaModel, LlamaConfig
-
-    >>> # Initializing a LLaMA llama-7b style configuration
-    >>> configuration = LlamaConfig()
-
-    >>> # Initializing a model from the llama-7b style configuration
-    >>> model = LlamaModel(configuration)
-
-    >>> # Accessing the model configuration
-    >>> configuration = model.config
-    ```"""
-    model_type = "vllama"
-
-    def __init__(
-        self,
-        vocab_size=32000,
-        additional_vocab_size=0,
-        hidden_size=4096,
-        intermediate_size=11008,
-        num_hidden_layers=32,
-        num_attention_heads=32,
-        dropout=0.0,
-        hidden_act="silu",
-        initializer_range=0.02,
-        alpha_initializer="ones",
-        alphas_initializer_range=0.0,
-        alpha_type="vector",
-        rms_norm_eps=1e-6,
-        use_cache=True,
-        pad_token_id=0,
-        bos_token_id=1,
-        eos_token_id=2,
-        tie_word_embeddings=False,
-        cross_layer_interval=1,
-        cross_layer_activation_function="swiglu",
-        qk_layer_norms=False,
-        qk_layer_norms_perceiver=False,
-        freeze_text_layers=True,
-        freeze_text_module_exceptions=[],
-        freeze_lm_head=False,
-        freeze_vision_layers=True,
-        freeze_vision_module_exceptions=[],
-        vision_model_name="google/vit-base-patch16-224",
-        vision_model_params="{}",
-        vision_embed_dim=768,
-        vision_image_size=224,
-        use_resampler=False,
-        resampler_n_latents=64,
-        resampler_depth=6,
-        resampler_n_heads=16,
-        resampler_head_dim=96,
-        **kwargs,
-    ):
-        self.vocab_size = vocab_size
-        self.additional_vocab_size = additional_vocab_size
-        self.hidden_size = hidden_size
-        self.intermediate_size = intermediate_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.dropout = dropout
-        self.hidden_act = hidden_act
-        self.initializer_range = initializer_range
-        self.alpha_initializer = alpha_initializer
-        self.alphas_initializer_range = alphas_initializer_range
-        self.alpha_type = alpha_type
-        self.rms_norm_eps = rms_norm_eps
-        self.use_cache = use_cache
-        super().__init__(
-            pad_token_id=pad_token_id,
-            bos_token_id=bos_token_id,
-            eos_token_id=eos_token_id,
-            tie_word_embeddings=tie_word_embeddings,
-            **kwargs,
-        )
-
-        self.cross_layer_interval = cross_layer_interval
-        self.cross_layer_activation_function = cross_layer_activation_function
-        self.qk_layer_norms = qk_layer_norms
-        self.qk_layer_norms_perceiver = qk_layer_norms_perceiver
-        self.freeze_vision_layers = freeze_vision_layers
-        self.vision_model_name = vision_model_name
-        self.vision_model_params = vision_model_params
-
-        self.freeze_text_layers = freeze_text_layers
-        self.freeze_text_module_exceptions = freeze_text_module_exceptions
-        self.freeze_vision_module_exceptions = freeze_vision_module_exceptions
-        self.freeze_lm_head = freeze_lm_head
-
-        self.vision_embed_dim = vision_embed_dim
-        self.vision_image_size = vision_image_size
-
-        # Resampler params
-        self.use_resampler = use_resampler
-        self.resampler_n_latents = resampler_n_latents
-        self.resampler_depth = resampler_depth
-        self.resampler_n_heads = resampler_n_heads
-        self.resampler_head_dim = resampler_head_dim
-
-        # IMPORTANT: Do not do any __init__ args-based checks in the constructor, since
-        # PretrainedConfig.from_dict first instantiates the class with the config dict and only then
-        # updates the config object with `kwargs` from from_pretrained, so during the instantiation
-        # of this object many attributes have default values and haven't yet been overridden.
-        # Do any required checks inside `from_pretrained` once the superclass' `from_pretrained` was run.
-
-    def check_compatibilities(self):
-        vision_model_params = eval(self.vision_model_params)
-        config = AutoConfig.from_pretrained(self.vision_model_name, **vision_model_params)
-        if hasattr(config, "vision_config"):
-            vision_config = config.vision_config
-        else:
-            vision_config = config
-        vision_embed_dim = vision_config.hidden_size
-        if self.vision_embed_dim != vision_embed_dim:
-            raise ValueError(
-                f"vision_embed_dim ({self.vision_embed_dim}) must match the hidden size of the vision model"
-                f" ({vision_embed_dim})"
-            )
-        vision_image_size = vision_config.image_size
-        if self.vision_image_size != vision_image_size:
-            raise ValueError(
-                f"vision_image_size ({self.vision_image_size}) must match the hidden size of the vision model"
-                f" ({vision_image_size})"
-            )
-
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
-        outputs = super(VLlamaConfig, cls).from_pretrained(pretrained_model_name_or_path, **kwargs)
-        if isinstance(outputs, Tuple):
-            # When called with return_unused_kwargs=True, the first item will be the config
-            outputs[0].check_compatibilities()
-        else:
-            outputs.check_compatibilities()
-        return outputs
diff --git a/m4/models/vllama/make_tiny_llama.py b/m4/models/vllama/make_tiny_llama.py
deleted file mode 100644
index 82ec3dfe5ce42236bdf19e8178c2e15c44c49d40..0000000000000000000000000000000000000000
--- a/m4/models/vllama/make_tiny_llama.py
+++ /dev/null
@@ -1,51 +0,0 @@
-#!/usr/bin/env python
-
-# This script creates a super tiny model that is useful inside tests, when we just want to test that
-# the machinery works, without needing to check the quality of the outcomes.
-#
-# usage: adjust the configs if wanted, but otherwise just run the script
-
-from pathlib import Path
-
-from transformers import LlamaConfig, LlamaForCausalLM, LlamaTokenizer
-
-
-mname_tiny = "tiny-random-LlamaForCausalLM"
-
-path = Path(mname_tiny)
-path.mkdir(parents=True, exist_ok=True)
-
-config = LlamaConfig()
-config.update(
-    dict(
-        vocab_size=32000,
-        hidden_size=16,
-        intermediate_size=16 * 4,
-        num_hidden_layers=2,
-        num_attention_heads=4,
-    )
-)
-model = LlamaForCausalLM(config)
-tokenizer = LlamaTokenizer.from_pretrained("path_to_llama_7b")
-
-# Test w/ one text
-query = "This is a test"
-query_tokens = tokenizer(query, return_tensors="pt")
-
-input = {
-    "input_ids": query_tokens["input_ids"],
-    "attention_mask": query_tokens["attention_mask"],
-}
-
-out_gen = model.generate(**input)
-text = tokenizer.batch_decode(out_gen)
-
-# Save model + config + tokenizer
-model.half()  # makes it smaller
-model.save_pretrained(path)
-tokenizer.save_pretrained(path)
-
-# test we can load it back
-model = LlamaForCausalLM.from_pretrained(path)
-
-print(f"Generated {mname_tiny} - Upload the generated folder to the hub")
diff --git a/m4/models/vllama/make_tiny_model.py b/m4/models/vllama/make_tiny_model.py
deleted file mode 100644
index aad2bd78421e09c32bf4cb085abbf038c1541726..0000000000000000000000000000000000000000
--- a/m4/models/vllama/make_tiny_model.py
+++ /dev/null
@@ -1,114 +0,0 @@
-#!/usr/bin/env python
-
-# This script creates a super tiny model that is useful inside tests, when we just want to test that
-# the machinery works, without needing to check the quality of the outcomes.
-#
-# usage: adjust the configs if wanted, but otherwise just run the script
-
-from pathlib import Path
-from types import SimpleNamespace
-
-import torchvision.transforms as transforms
-from PIL import Image
-
-from m4.models.vllama.modeling_vllama import VLlamaConfig, VLlamaForCausalLM
-from m4.training.packing import image_attention_mask_for_packed_input_ids, incremental_to_binary_attention_mask
-from m4.training.utils import get_tokenizer
-
-
-mname_tiny = "tiny-random-vllama-clip"
-
-path = Path(mname_tiny)
-path.mkdir(parents=True, exist_ok=True)
-
-# from the hardcoded https://github.com/huggingface/m4/blob/adf102f0000cb2632cd8a3ebb87398c65e448a97/m4/training/main.py#L80
-additional_vocab_size = 2
-
-config = VLlamaConfig()
-config.update(
-    dict(
-        ffn_dim=64,
-        hidden_size=16,
-        max_position_embeddings=128,
-        num_attention_heads=4,
-        num_hidden_layers=2,
-        word_embed_proj_dim=16,
-        max_new_tokens=100,
-        use_resampler=True,
-        resampler_depth=2,
-        resampler_head_dim=8,
-        resampler_n_heads=2,
-        resampler_n_latents=16,
-        vision_embed_dim=32,
-        vision_image_size=30,
-        vision_model_name="hf-internal-testing/tiny-random-clip",
-        vision_model_params="{}",
-        vocab_size=32000,
-        additional_vocab_size=additional_vocab_size,
-    )
-)
-
-# print(config)
-# can now modify config to say tiny values
-
-model = VLlamaForCausalLM.from_config(config)
-# print(model.config)
-# print(model)
-
-tokenizer_config = dict(
-    tokenizer_add_special_tokens="{}",
-    tokenizer_add_tokens=(
-        '[AddedToken("<fake_token_around_image>", rstrip=False, lstrip=False), AddedToken("<image>", rstrip=False,'
-        " lstrip=False)]"
-    ),
-    tokenizer_name="HuggingFaceM4/huggy-llama-tokenizer-7b",
-    tokenizer_params='{"use_fast": True}',
-)
-tokenizer_config = SimpleNamespace(**tokenizer_config)
-# print(tokenizer_config)
-
-tokenizer = get_tokenizer(
-    tokenizer_name=tokenizer_config.tokenizer_name,
-    tokenizer_add_tokens=tokenizer_config.tokenizer_add_tokens,
-    tokenizer_add_special_tokens=tokenizer_config.tokenizer_add_special_tokens,
-    tokenizer_params=tokenizer_config.tokenizer_params,
-    additional_vocab_size=model.config.additional_vocab_size,
-    model_vocab_size=model.config.vocab_size,
-)
-assert "<image>" in tokenizer.get_vocab()
-
-# Test w/ one image and one text
-query = "<fake_token_around_image><image><fake_token_around_image>This is a picture of a cat."
-query_tokens = tokenizer(query, return_tensors="pt")
-
-num_images_per_ex = 1
-pixel_values = transforms.ToTensor()(Image.new("RGB", (30, 30))).repeat(1, 1, 1, 1).unsqueeze(0)
-image_attention_mask, _ = image_attention_mask_for_packed_input_ids(query_tokens["input_ids"], tokenizer)
-image_attention_mask = incremental_to_binary_attention_mask(image_attention_mask, num_classes=num_images_per_ex)
-
-input = {
-    "input_ids": query_tokens["input_ids"],
-    "attention_mask": query_tokens["attention_mask"],
-    "pixel_values": pixel_values,
-    "pixel_values": pixel_values,
-    "image_attention_mask": image_attention_mask,
-}
-# debug shapes
-# print(query_tokens["input_ids"].shape)
-# print(query_tokens["attention_mask"].shape)
-# print(pixel_values.shape)
-# print(image_attention_mask.shape)
-
-out_gen = model.generate(**input)
-text = tokenizer.batch_decode(out_gen)
-# print(text)
-
-# Save model + config + tokenizer
-model.half()  # makes it smaller
-model.save_pretrained(path)
-tokenizer.save_pretrained(path)
-
-# test we can load it back
-model = VLlamaForCausalLM.from_pretrained(path)
-
-print(f"Generated {mname_tiny} - Upload the generated folder to the hub")
diff --git a/m4/models/vllama/modeling_vllama.py b/m4/models/vllama/modeling_vllama.py
deleted file mode 100644
index 90b01ac23cea023e32fd3065f362fbe568da2217..0000000000000000000000000000000000000000
--- a/m4/models/vllama/modeling_vllama.py
+++ /dev/null
@@ -1,1260 +0,0 @@
-# coding=utf-8
-# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
-#
-# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
-# and OPT implementations in this library. It has been modified from its
-# original forms to accommodate minor architectural differences compared
-# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" PyTorch LLaMA model."""
-from typing import List, Optional, Tuple, Union
-
-import torch
-import torch.utils.checkpoint
-from torch import nn
-from torch.nn import CrossEntropyLoss
-from transformers.activations import ACT2FN
-from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
-from transformers.modeling_utils import PretrainedConfig
-from transformers.utils import (
-    ContextManagers,
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-    replace_return_docstrings,
-)
-
-from m4.models import DecoupledEmbedding, DecoupledLinear
-from m4.models.common import (
-    expand_inputs_for_generation,
-    prepare_inputs_for_generation,
-    update_model_kwargs_for_generation,
-)
-from m4.models.custom_modules import VLOOMPreTrainedModelBase
-from m4.models.perceiver.perceiver import PerceiverResampler
-from m4.models.vllama.configuration_vllama import VLlamaConfig
-from m4.training.utils import (
-    compute_perceiver_tflops_per_batch_per_gpu,
-    compute_tflops_per_batch_per_gpu,
-    deepspeed_gathered_parameters_context_manager,
-    freeze_model,
-)
-from m4.utils import logging
-
-
-logger = logging.get_logger(__name__)
-
-_CONFIG_FOR_DOC = "VLlamaConfig"
-
-
-def _make_causal_mask(input_ids_shape: torch.Size, dtype: torch.dtype, past_key_values_length: int = 0):
-    """
-    Make causal mask used for bi-directional self-attention.
-    """
-    bsz, tgt_len = input_ids_shape
-    mask = torch.full((tgt_len, tgt_len), torch.tensor(torch.finfo(dtype).min))
-    mask_cond = torch.arange(mask.size(-1))
-    mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
-    mask = mask.to(dtype)
-
-    if past_key_values_length > 0:
-        mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype), mask], dim=-1)
-    return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
-
-
-def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
-    """
-    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
-    """
-    bsz, src_len = mask.size()
-    tgt_len = tgt_len if tgt_len is not None else src_len
-
-    expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
-
-    inverted_mask = 1.0 - expanded_mask
-
-    return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
-
-
-class LlamaRMSNorm(nn.Module):
-    def __init__(self, hidden_size, eps=1e-6):
-        """
-        LlamaRMSNorm is equivalent to T5LayerNorm
-        """
-        super().__init__()
-        self.weight = nn.Parameter(torch.ones(hidden_size))
-        self.variance_epsilon = eps
-
-    def forward(self, hidden_states):
-        variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
-        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
-
-        # convert into half-precision if necessary
-        if self.weight.dtype in [torch.float16, torch.bfloat16]:
-            hidden_states = hidden_states.to(self.weight.dtype)
-
-        return self.weight * hidden_states
-
-
-class LlamaRotaryEmbedding(torch.nn.Module):
-    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
-        super().__init__()
-        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float().to(device) / dim))
-        self.register_buffer("inv_freq", inv_freq)
-
-        # Build here to make `torch.jit.trace` work.
-        self.max_seq_len_cached = max_position_embeddings
-        t = torch.arange(self.max_seq_len_cached, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
-        freqs = torch.einsum("i,j->ij", t, self.inv_freq)
-        # Different from paper, but it uses a different permutation in order to obtain the same calculation
-        emb = torch.cat((freqs, freqs), dim=-1)
-        self.register_buffer("cos_cached", emb.cos()[None, None, :, :], persistent=False)
-        self.register_buffer("sin_cached", emb.sin()[None, None, :, :], persistent=False)
-
-    def forward(self, x, seq_len=None):
-        # x: [bs, num_attention_heads, seq_len, head_size]
-        # This `if` block is unlikely to be run after we build sin/cos in `__init__`. Keep the logic here just in case.
-        if seq_len > self.max_seq_len_cached:
-            self.max_seq_len_cached = seq_len
-            t = torch.arange(self.max_seq_len_cached, device=x.device, dtype=self.inv_freq.dtype)
-            freqs = torch.einsum("i,j->ij", t, self.inv_freq)
-            # Different from paper, but it uses a different permutation in order to obtain the same calculation
-            emb = torch.cat((freqs, freqs), dim=-1).to(x.device)
-            self.register_buffer("cos_cached", emb.cos()[None, None, :, :], persistent=False)
-            self.register_buffer("sin_cached", emb.sin()[None, None, :, :], persistent=False)
-        return (
-            self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
-            self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
-        )
-
-
-def rotate_half(x):
-    """Rotates half the hidden dims of the input."""
-    x1 = x[..., : x.shape[-1] // 2]
-    x2 = x[..., x.shape[-1] // 2 :]
-    return torch.cat((-x2, x1), dim=-1)
-
-
-def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
-    gather_indices = position_ids[:, None, :, None]  # [bs, 1, seq_len, 1]
-    gather_indices = gather_indices.repeat(1, cos.shape[1], 1, cos.shape[3])
-    cos = torch.gather(cos.repeat(gather_indices.shape[0], 1, 1, 1), 2, gather_indices)
-    sin = torch.gather(sin.repeat(gather_indices.shape[0], 1, 1, 1), 2, gather_indices)
-    q_embed = (q * cos) + (rotate_half(q) * sin)
-    k_embed = (k * cos) + (rotate_half(k) * sin)
-    return q_embed, k_embed
-
-
-class LlamaMLP(nn.Module):
-    def __init__(
-        self,
-        hidden_size: int,
-        intermediate_size: int,
-        hidden_act: str,
-    ):
-        super().__init__()
-        self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
-        self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False)
-        self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
-        self.act_fn = ACT2FN[hidden_act]
-
-    def forward(self, x):
-        return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
-
-
-class LlamaAttention(nn.Module):
-    """Multi-headed attention from 'Attention Is All You Need' paper"""
-
-    def __init__(
-        self,
-        hidden_size: int,
-        num_heads: int,
-        dropout: float = 0.0,
-        is_cross_attention: bool = False,
-        config: PretrainedConfig = None,
-        qk_layer_norms: bool = False,
-    ):
-        super().__init__()
-        self.hidden_size = hidden_size
-        self.num_heads = num_heads
-        self.head_dim = hidden_size // num_heads
-        self.dropout = dropout
-
-        if (self.head_dim * num_heads) != self.hidden_size:
-            raise ValueError(
-                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
-                f" and `num_heads`: {num_heads})."
-            )
-
-        self.is_cross_attention = is_cross_attention
-
-        if self.is_cross_attention:
-            kv_input_dim = self.hidden_size if not hasattr(config, "vision_embed_dim") else config.vision_embed_dim
-            self.q_proj = nn.Linear(
-                self.hidden_size,
-                num_heads * self.head_dim,
-                bias=False,
-            )
-            self.k_proj = nn.Linear(kv_input_dim, num_heads * self.head_dim, bias=False)
-            self.v_proj = nn.Linear(
-                kv_input_dim,
-                num_heads * self.head_dim,
-                bias=False,
-            )
-        else:
-            self.q_proj = nn.Linear(
-                self.hidden_size,
-                num_heads * self.head_dim,
-                bias=False,
-            )
-            self.k_proj = nn.Linear(
-                self.hidden_size,
-                num_heads * self.head_dim,
-                bias=False,
-            )
-            self.v_proj = nn.Linear(
-                self.hidden_size,
-                num_heads * self.head_dim,
-                bias=False,
-            )
-        self.o_proj = nn.Linear(
-            num_heads * self.head_dim,
-            hidden_size,
-            bias=False,
-        )
-        self.rotary_emb = LlamaRotaryEmbedding(self.head_dim)
-
-        self.qk_layer_norms = qk_layer_norms
-        if self.qk_layer_norms:
-            self.q_layer_norm = LlamaRMSNorm(self.head_dim, eps=config.rms_norm_eps)
-            self.k_layer_norm = LlamaRMSNorm(self.head_dim, eps=config.rms_norm_eps)
-
-    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
-        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        key_value_states: Optional[torch.Tensor] = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_value: Optional[Tuple[torch.Tensor]] = None,
-        output_attentions: bool = False,
-        use_cache: bool = False,
-    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
-        # if key_value_states are provided this layer is used as a cross-attention layer
-        is_cross_attention = self.is_cross_attention or key_value_states is not None
-
-        bsz, q_len, _ = hidden_states.size()
-
-        query_states = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
-        if not is_cross_attention:
-            key_states = self.k_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
-            value_states = self.v_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
-        else:
-            _, kv_len, _ = key_value_states.size()  # Note that, in this case, `kv_len` == `kv_seq_len`
-            key_states = self.k_proj(key_value_states).view(bsz, kv_len, self.num_heads, self.head_dim).transpose(1, 2)
-            value_states = (
-                self.v_proj(key_value_states).view(bsz, kv_len, self.num_heads, self.head_dim).transpose(1, 2)
-            )
-
-        kv_seq_len = key_states.shape[-2]
-        if past_key_value is not None:
-            kv_seq_len += past_key_value[0].shape[-2]
-        if not is_cross_attention:
-            cos, sin = self.rotary_emb(value_states, seq_len=max(kv_seq_len, q_len))
-            query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
-        # [bsz, nh, t, hd]
-
-        if past_key_value is not None:
-            # reuse k, v, self_attention
-            key_states = torch.cat([past_key_value[0], key_states], dim=2)
-            value_states = torch.cat([past_key_value[1], value_states], dim=2)
-
-        past_key_value = (key_states, value_states) if use_cache else None
-
-        if self.qk_layer_norms:
-            query_states = self.q_layer_norm(query_states)
-            key_states = self.k_layer_norm(key_states)
-
-        if attention_mask is not None:
-            if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
-                raise ValueError(
-                    f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
-                )
-
-        attn_output = nn.functional.scaled_dot_product_attention(
-            query_states,
-            key_states,
-            value_states,
-            attn_mask=attention_mask,
-            dropout_p=self.dropout,
-        )
-
-        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
-            raise ValueError(
-                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
-                f" {attn_output.size()}"
-            )
-
-        attn_output = attn_output.transpose(1, 2)
-        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
-
-        attn_output = self.o_proj(attn_output)
-
-        attn_weights = None
-        logger.warning_once(
-            "attn_weights are not extracted in scaled_dot_product_attention. The model returns None instead"
-        )
-
-        return attn_output, attn_weights, past_key_value
-
-
-class LlamaDecoderLayer(nn.Module):
-    def __init__(self, config: VLlamaConfig):
-        super().__init__()
-        self.hidden_size = config.hidden_size
-        self.self_attn = LlamaAttention(
-            hidden_size=self.hidden_size,
-            num_heads=config.num_attention_heads,
-            dropout=config.dropout,
-            config=config,
-        )
-        self.mlp = LlamaMLP(
-            hidden_size=self.hidden_size,
-            intermediate_size=config.intermediate_size,
-            hidden_act=config.hidden_act,
-        )
-        self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-        self.post_attention_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-        self.dropout = config.dropout
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        attention_mask: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_value: Optional[Tuple[torch.Tensor]] = None,
-        output_attentions: Optional[bool] = False,
-        use_cache: Optional[bool] = False,
-    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
-        """
-        Args:
-            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
-            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
-                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
-            output_attentions (`bool`, *optional*):
-                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
-                returned tensors for more detail.
-            use_cache (`bool`, *optional*):
-                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
-                (see `past_key_values`).
-            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
-        """
-
-        residual = hidden_states
-
-        hidden_states = self.input_layernorm(hidden_states)
-
-        # Self Attention
-        hidden_states, self_attn_weights, present_key_value = self.self_attn(
-            hidden_states=hidden_states,
-            attention_mask=attention_mask,
-            position_ids=position_ids,
-            past_key_value=past_key_value,
-            output_attentions=output_attentions,
-            use_cache=use_cache,
-        )
-        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
-        hidden_states = residual + hidden_states
-
-        # Fully Connected
-        residual = hidden_states
-        hidden_states = self.post_attention_layernorm(hidden_states)
-        hidden_states = self.mlp(hidden_states)
-        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
-        hidden_states = residual + hidden_states
-
-        outputs = (hidden_states,)
-
-        if output_attentions:
-            outputs += (self_attn_weights,)
-
-        if use_cache:
-            outputs += (present_key_value,)
-
-        return outputs
-
-
-class VLlamaGatedCrossAttentionLayer(nn.Module):
-    def __init__(self, config: VLlamaConfig):
-        super().__init__()
-        self.hidden_size = config.hidden_size
-        self.cross_attn = LlamaAttention(
-            hidden_size=self.hidden_size,
-            num_heads=config.num_attention_heads,
-            is_cross_attention=True,
-            dropout=config.dropout,
-            config=config,
-            qk_layer_norms=config.qk_layer_norms,
-        )
-        self.mlp = LlamaMLP(
-            hidden_size=self.hidden_size,
-            intermediate_size=config.intermediate_size,
-            hidden_act=config.hidden_act,
-        )
-        self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-        self.post_attention_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-        self.config = config.dropout
-
-        self.act_cross_attn = nn.Tanh()
-        self.act_dense = nn.Tanh()
-
-        if config.alpha_initializer == "zeros":
-            if config.alpha_type == "vector":
-                self.alpha_cross_attn = nn.Parameter(torch.zeros(1, 1, self.hidden_size))
-                self.alpha_dense = nn.Parameter(torch.zeros(1, 1, self.hidden_size))
-            elif config.alpha_type == "float":
-                self.alpha_cross_attn = nn.Parameter(torch.zeros(1))
-                self.alpha_dense = nn.Parameter(torch.zeros(1))
-            else:
-                raise ValueError(f"Unknown value for `alpha_type` ({config.alpha_type})")
-
-        elif config.alpha_initializer == "ones":
-            if config.alpha_type == "vector":
-                self.alpha_cross_attn = nn.Parameter(torch.ones(1, 1, self.hidden_size))
-                self.alpha_dense = nn.Parameter(torch.ones(1, 1, self.hidden_size))
-            elif config.alpha_type == "float":
-                self.alpha_cross_attn = nn.Parameter(torch.ones(1))
-                self.alpha_dense = nn.Parameter(torch.ones(1))
-            else:
-                raise ValueError(f"Unknown value for `alpha_type` ({config.alpha_type})")
-
-        elif config.alpha_initializer in {"normal", "gaussian", "random"}:
-            if config.alpha_type == "vector":
-                self.alpha_cross_attn = nn.Parameter(
-                    torch.normal(mean=0.0, std=config.alphas_initializer_range, size=(1, 1, self.hidden_size))
-                )
-                self.alpha_dense = nn.Parameter(
-                    torch.normal(mean=0.0, std=config.alphas_initializer_range, size=(1, 1, self.hidden_size))
-                )
-            elif config.alpha_type == "float":
-                self.alpha_cross_attn = nn.Parameter(
-                    torch.normal(mean=0.0, std=config.alphas_initializer_range, size=(1))
-                )
-                self.alpha_dense = nn.Parameter(torch.normal(mean=0.0, std=config.alphas_initializer_range, size=(1)))
-            else:
-                raise ValueError(f"Unknown value for `alpha_type` ({config.alpha_type})")
-
-        else:
-            raise NotImplementedError(f"Alpha initialization scheme {config.alpha_initializer} not yet implemented!")
-
-        if not (hasattr(self, "alpha_cross_attn") and hasattr(self, "alpha_dense")):
-            raise ValueError("Alpha parameters not initialized correctly!")
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        attention_mask: Optional[torch.Tensor] = None,
-        image_hidden_states: Optional[torch.Tensor] = None,
-        image_attention_mask: Optional[torch.Tensor] = None,
-        output_attentions: Optional[bool] = False,
-        use_cache: Optional[bool] = False,
-        past_key_value: Optional[Tuple[torch.Tensor]] = None,
-    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
-        """
-        Args:
-            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
-            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
-                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
-            output_attentions (`bool`, *optional*):
-                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
-                returned tensors for more detail.
-            use_cache (`bool`, *optional*):
-                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
-                (see `past_key_values`).
-            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
-        """
-        if image_hidden_states is None:
-            raise ValueError(
-                "`image_hidden_states` is required for VLlama cross attention module which are visual features to be"
-                " conditioned on."
-            )
-
-        if past_key_value is not None:
-            raise NotImplementedError("Past key value states are not implemented for VLlama cross attention module.")
-
-        residual = hidden_states
-
-        hidden_states = self.input_layernorm(hidden_states)
-
-        # Self Attention
-        hidden_states, self_attn_weights, present_key_value = self.cross_attn(
-            hidden_states=hidden_states,
-            key_value_states=image_hidden_states,
-            attention_mask=image_attention_mask,
-            output_attentions=output_attentions,
-        )
-        hidden_states = nn.functional.dropout(hidden_states, p=self.config, training=self.training)
-        hidden_states = residual + self.act_cross_attn(self.alpha_cross_attn) * hidden_states
-
-        # Fully Connected
-        residual = hidden_states
-        hidden_states = self.post_attention_layernorm(hidden_states)
-        hidden_states = self.mlp(hidden_states)
-        hidden_states = nn.functional.dropout(hidden_states, p=self.config, training=self.training)
-        hidden_states = residual + self.act_dense(self.alpha_dense) * hidden_states
-
-        outputs = (hidden_states,)
-
-        if output_attentions:
-            outputs += (self_attn_weights,)
-
-        if use_cache:
-            outputs += (present_key_value,)
-
-        return outputs
-
-
-LLAMA_START_DOCSTRING = r"""
-    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
-    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
-    etc.)
-
-    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
-    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
-    and behavior.
-
-    Parameters:
-        config ([`VLlamaConfig`]):
-            Model configuration class with all the parameters of the model. Initializing with a config file does not
-            load the weights associated with the model, only the configuration. Check out the
-            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
-"""
-
-
-@add_start_docstrings(
-    "The bare LLaMA Model outputting raw hidden-states without any specific head on top.",
-    LLAMA_START_DOCSTRING,
-)
-class VLlamaPreTrainedModel(VLOOMPreTrainedModelBase):
-    config_class = VLlamaConfig
-    base_model_prefix = "model"
-    supports_gradient_checkpointing = True
-    _no_split_modules = ["LlamaDecoderLayer", "VLlamaGatedCrossAttentionLayer"]
-    _keys_to_ignore_on_load_unexpected = [r"decoder\.version"]
-
-    def _init_weights(self, module):
-        def init_a_linear(module, mean=0.0, std=self.config.initializer_range):
-            with ContextManagers(deepspeed_gathered_parameters_context_manager(module.weight, modify=True)):
-                module.weight.data.normal_(mean=mean, std=std)
-                if module.bias is not None:
-                    with ContextManagers(deepspeed_gathered_parameters_context_manager(module.bias, modify=True)):
-                        module.bias.data.zero_()
-
-        if isinstance(module, VLlamaGatedCrossAttentionLayer):
-            for sub_module_name, sub_module in module.named_modules():
-                if isinstance(sub_module, nn.Linear):
-                    if "down_proj" in sub_module_name:
-                        factor = 2 * self.config.num_hidden_layers
-                    else:
-                        factor = 1.0
-                    init_a_linear(sub_module, std=(0.4 / (sub_module.in_features * factor)) ** 0.5)
-        elif isinstance(module, PerceiverResampler):
-            with ContextManagers(deepspeed_gathered_parameters_context_manager(module.latents, modify=True)):
-                module.latents.data.normal_(mean=0.0, std=(1.0 / self.config.vision_embed_dim) ** 0.5)
-            for sub_module_name, sub_module in module.named_modules():
-                if isinstance(sub_module, nn.Linear):
-                    if "c_proj" in sub_module_name:
-                        factor = 2 * self.config.num_hidden_layers
-                    else:
-                        factor = 1.0
-                    init_a_linear(sub_module, std=(0.4 / (self.config.vision_embed_dim * factor)) ** 0.5)
-        elif isinstance(module, nn.Embedding):
-            with ContextManagers(deepspeed_gathered_parameters_context_manager(module.weight, modify=True)):
-                module.weight.data.normal_(mean=0.0, std=(1.0 / self.config.hidden_size) ** 0.5)
-                if module.padding_idx is not None:
-                    module.weight.data[module.padding_idx].zero_()
-        elif isinstance(module, DecoupledLinear):
-            if hasattr(module, "additional_fc"):
-                init_a_linear(module.additional_fc, std=(1.0 / (module.additional_fc.in_features)) ** 0.5)
-
-    def _set_gradient_checkpointing(self, module, value=False):
-        if isinstance(module, VLlamaModel):
-            module.gradient_checkpointing = value
-
-    @classmethod
-    def override_vision_model_wrapper(cls, model, config, vision_model_name, vision_model_params, torch_dtype):
-        # this can be called via from_pretrained from a class w/ head or w/o head so we extract the beheaded model version
-        beheaded_model = model.model if hasattr(model, "model") else model
-        cls.override_vision_model(beheaded_model, vision_model_name, vision_model_params, torch_dtype)
-        beheaded_model.freeze_relevant_params(config)
-
-
-LLAMA_INPUTS_DOCSTRING = r"""
-    Args:
-        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
-            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
-            it.
-
-            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
-            [`PreTrainedTokenizer.__call__`] for details.
-
-            [What are input IDs?](../glossary#input-ids)
-        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
-
-            - 1 for tokens that are **not masked**,
-            - 0 for tokens that are **masked**.
-
-            [What are attention masks?](../glossary#attention-mask)
-
-            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
-            [`PreTrainedTokenizer.__call__`] for details.
-
-            If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
-            `past_key_values`).
-
-            If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
-            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
-            information on the default strategy.
-
-            - 1 indicates the head is **not masked**,
-            - 0 indicates the head is **masked**.
-        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
-            config.n_positions - 1]`.
-            [What are position IDs?](../glossary#position-ids)
-        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
-            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
-            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
-            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
-
-            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
-            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
-
-            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
-            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
-            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
-        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
-            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
-            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
-            model's internal embedding lookup matrix.
-        use_cache (`bool`, *optional*):
-            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
-            `past_key_values`).
-        output_attentions (`bool`, *optional*):
-            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
-            tensors for more detail.
-        output_hidden_states (`bool`, *optional*):
-            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
-            more detail.
-        return_dict (`bool`, *optional*):
-            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
-"""
-
-
-@add_start_docstrings(
-    "The bare LLaMA Model outputting raw hidden-states without any specific head on top.",
-    LLAMA_START_DOCSTRING,
-)
-class VLlamaModel(VLlamaPreTrainedModel):
-    """
-    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`LlamaDecoderLayer`]
-
-    Args:
-        config: VLlamaConfig
-    """
-
-    def __init__(self, config: VLlamaConfig, vision_model=None):
-        super().__init__(config)
-        self.config = config
-        self.padding_idx = config.pad_token_id
-        self.vocab_size = config.vocab_size
-
-        self.embed_tokens = DecoupledEmbedding(
-            num_embeddings=config.vocab_size,
-            num_additional_embeddings=config.additional_vocab_size,
-            embedding_dim=config.hidden_size,
-            partially_freeze=config.freeze_text_layers,
-            padding_idx=self.padding_idx,
-        )
-
-        # Load an uninitialized model and later in from_pretrained will load the pre-trained model -
-        # this solves the losing of weights in `from_pretrained` on the main model
-        self.vision_model = vision_model
-
-        # Perceiver Resampler
-        if config.use_resampler:
-            self.perceiver_resampler = PerceiverResampler(
-                self.config,
-                self.config.vision_embed_dim,
-                config.resampler_depth,
-                config.resampler_n_heads,
-                config.resampler_head_dim,
-                config.resampler_n_latents,
-            )
-
-        self.layers = nn.ModuleList([LlamaDecoderLayer(config) for _ in range(config.num_hidden_layers)])
-
-        self.cross_layer_interval = config.cross_layer_interval
-        num_cross_layers = config.num_hidden_layers // self.cross_layer_interval
-        self.gated_cross_attn_layers = nn.ModuleList(
-            [VLlamaGatedCrossAttentionLayer(config) for _ in range(num_cross_layers)]
-        )
-        self.gradient_checkpointing = False
-
-        self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-
-        self.gradient_checkpointing = False
-        # Initialize weights and apply final processing
-        self.post_init()
-
-        self.freeze_relevant_params(config)
-
-    def freeze_relevant_params(self, config=None):
-        if config is None:
-            config = self.config
-
-        if config.freeze_text_layers:
-            self.freeze_text_layers(config.freeze_text_module_exceptions)
-
-        if config.freeze_vision_layers:
-            freeze_model(self.vision_model, module_exceptions=config.freeze_vision_module_exceptions)
-
-    def freeze_text_layers(self, module_exceptions):
-        for module in [self.layers, self.norm]:
-            freeze_model(module, module_exceptions=module_exceptions)
-
-    def get_input_embeddings(self):
-        return self.embed_tokens
-
-    def set_input_embeddings(self, value):
-        self.embed_tokens = value
-
-    # Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask
-    def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
-        # create causal mask
-        # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
-        combined_attention_mask = None
-        if input_shape[-1] > 1:
-            combined_attention_mask = _make_causal_mask(
-                input_shape, inputs_embeds.dtype, past_key_values_length=past_key_values_length
-            ).to(inputs_embeds.device)
-
-        if attention_mask is not None:
-            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
-            expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to(
-                inputs_embeds.device
-            )
-            combined_attention_mask = (
-                expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
-            )
-
-        return combined_attention_mask
-
-    @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
-    def forward(
-        self,
-        input_ids: torch.LongTensor = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        pixel_values: Optional[torch.FloatTensor] = None,
-        image_embeddings: Optional[torch.FloatTensor] = None,
-        image_attention_mask: Optional[torch.Tensor] = None,
-        use_cache: Optional[bool] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-    ) -> Union[Tuple, BaseModelOutputWithPast]:
-        device = input_ids.device if input_ids is not None else inputs_embeds.device
-
-        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
-        output_hidden_states = (
-            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
-        )
-        use_cache = use_cache if use_cache is not None else self.config.use_cache
-
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
-        # retrieve input_ids and inputs_embeds
-        if input_ids is not None and inputs_embeds is not None:
-            raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
-        elif input_ids is not None:
-            batch_size, seq_length = input_ids.shape
-        elif inputs_embeds is not None:
-            batch_size, seq_length, _ = inputs_embeds.shape
-        else:
-            raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
-
-        seq_length_with_past = seq_length
-        past_key_values_length = 0
-
-        if past_key_values is not None:
-            past_key_values_length = past_key_values[0][0].shape[2]
-            seq_length_with_past = seq_length_with_past + past_key_values_length
-
-        if attention_mask is not None and position_ids is None:
-            # create position_ids on the fly for batch generation
-            position_ids = attention_mask.long().cumsum(-1) - 1
-            position_ids.masked_fill_(attention_mask == 0, 1)
-        elif position_ids is None:
-            device = input_ids.device if input_ids is not None else inputs_embeds.device
-            position_ids = torch.arange(
-                past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
-            )
-            position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
-        else:
-            position_ids = position_ids.view(-1, seq_length).long()
-
-        if pixel_values is not None and image_embeddings is not None:
-            raise ValueError("You cannot specify both pixel_values and image_embeddings at the same time")
-        elif pixel_values is not None:
-            pixel_values = pixel_values.to(dtype=self.dtype, device=input_ids.device)  # fp16 compatibility
-            batch_size, num_images = pixel_values.size(0), pixel_values.size(1)
-            pixel_values = pixel_values.contiguous().view(batch_size * num_images, *pixel_values.shape[2:])
-            # Get sequence from the vision encoder
-            image_hidden_states = self.vision_model(pixel_values=pixel_values).last_hidden_state
-        elif image_embeddings is not None:
-            batch_size, num_images, image_seq_len, image_hidden_size = image_embeddings.size()
-            image_hidden_states = image_embeddings.to(dtype=self.dtype, device=input_ids.device)
-            image_hidden_states = image_hidden_states.view(batch_size * num_images, image_seq_len, image_hidden_size)
-
-        if self.config.use_resampler:
-            image_hidden_states = self.perceiver_resampler(image_hidden_states)
-        image_seq_len, image_hidden_size = image_hidden_states.size(1), image_hidden_states.size(2)
-        image_hidden_states = image_hidden_states.view(batch_size, num_images * image_seq_len, image_hidden_size)
-        # Make image_attention_mask compatible with hidden states
-        text_seq_len = image_attention_mask.size(1)
-        image_attention_mask = image_attention_mask.unsqueeze(-1)
-        image_attention_mask = image_attention_mask.repeat(1, 1, 1, image_seq_len)
-        image_attention_mask = image_attention_mask.view(batch_size, text_seq_len, num_images * image_seq_len)
-
-        if image_hidden_states is not None:
-            image_batch_size, image_sequence_length, _ = image_hidden_states.size()
-            image_hidden_shape = (image_batch_size, image_sequence_length)
-            if image_attention_mask is None:
-                image_attention_mask = torch.ones(image_hidden_shape, device=device)
-            image_attention_mask = self.invert_attention_mask(image_attention_mask)
-        else:
-            image_attention_mask = None
-
-        if inputs_embeds is None:
-            inputs_embeds = self.embed_tokens(input_ids)
-        # embed positions
-        if attention_mask is None:
-            attention_mask = torch.ones(
-                (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device
-            )
-        attention_mask = self._prepare_decoder_attention_mask(
-            attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
-        )
-
-        hidden_states = inputs_embeds
-
-        if self.gradient_checkpointing and self.training:
-            if use_cache:
-                logger.warning_once(
-                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
-                )
-                use_cache = False
-
-        # decoder layers
-        all_hidden_states = () if output_hidden_states else None
-        all_self_attns = () if output_attentions else None
-        next_decoder_cache = () if use_cache else None
-
-        for idx, decoder_layer in enumerate(self.layers):
-            if output_hidden_states:
-                all_hidden_states += (hidden_states,)
-
-            past_key_value = past_key_values[idx] if past_key_values is not None else None
-
-            def vblock(
-                main_block,
-                hidden_states,
-                attention_mask,
-                position_ids,
-                past_key_value,
-                image_hidden_states,
-                image_attention_mask,
-                output_attentions,
-                use_cache,
-                layer_idx,
-                cross_layer_interval,
-                gated_cross_attn_layers,
-            ):
-                # TODO(ls): Add cross attention values to respective lists
-                if layer_idx % cross_layer_interval == 0:
-                    xblock = gated_cross_attn_layers[layer_idx // cross_layer_interval]
-                    outputs = xblock(
-                        hidden_states,
-                        attention_mask=attention_mask,
-                        image_hidden_states=image_hidden_states,
-                        image_attention_mask=image_attention_mask,
-                        output_attentions=output_attentions,
-                        use_cache=use_cache,
-                        past_key_value=None,  # not implemented
-                    )
-                    hidden_states = outputs[0]
-
-                layer_outputs = main_block(
-                    hidden_states,
-                    attention_mask=attention_mask,
-                    position_ids=position_ids,
-                    past_key_value=past_key_value,
-                    output_attentions=output_attentions,
-                    use_cache=use_cache,
-                )
-
-                return layer_outputs
-
-            if self.gradient_checkpointing and self.training:
-                past_key_value = None
-                if use_cache:
-                    logger.warning_once(
-                        "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
-                    )
-                    use_cache = False
-
-                layer_outputs = torch.utils.checkpoint.checkpoint(
-                    vblock,
-                    decoder_layer,
-                    hidden_states,
-                    attention_mask,
-                    position_ids,
-                    past_key_value,
-                    image_hidden_states,
-                    image_attention_mask,
-                    output_attentions,
-                    use_cache,
-                    idx,
-                    self.cross_layer_interval,
-                    self.gated_cross_attn_layers,
-                )
-            else:
-                layer_outputs = vblock(
-                    decoder_layer,
-                    hidden_states,
-                    attention_mask=attention_mask,
-                    position_ids=position_ids,
-                    past_key_value=past_key_value,
-                    image_hidden_states=image_hidden_states,
-                    image_attention_mask=image_attention_mask,
-                    output_attentions=output_attentions,
-                    use_cache=use_cache,
-                    layer_idx=idx,
-                    cross_layer_interval=self.cross_layer_interval,
-                    gated_cross_attn_layers=self.gated_cross_attn_layers,
-                )
-
-            hidden_states = layer_outputs[0]
-
-            if use_cache:
-                next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)
-
-            if output_attentions:
-                all_self_attns += (layer_outputs[1],)
-
-        hidden_states = self.norm(hidden_states)
-
-        # add hidden states from the last decoder layer
-        if output_hidden_states:
-            all_hidden_states += (hidden_states,)
-
-        next_cache = next_decoder_cache if use_cache else None
-        if not return_dict:
-            return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
-        return BaseModelOutputWithPast(
-            last_hidden_state=hidden_states,
-            past_key_values=next_cache,
-            hidden_states=all_hidden_states,
-            attentions=all_self_attns,
-        )
-
-
-class VLlamaForCausalLM(VLlamaPreTrainedModel):
-    _keys_to_ignore_on_load_missing = [r"lm_head.weight"]
-
-    def __init__(self, config, vision_model=None):
-        super().__init__(config)
-        self.model = VLlamaModel(config, vision_model=vision_model)
-
-        self.lm_head = DecoupledLinear(
-            in_features=config.hidden_size,
-            out_features=config.vocab_size,
-            out_additional_features=config.additional_vocab_size,
-            bias=False,
-            partially_freeze=config.freeze_lm_head,
-        )
-
-        # Initialize weights and apply final processing
-        self.post_init()
-
-    def get_input_embeddings(self):
-        return self.model.embed_tokens
-
-    def set_input_embeddings(self, value):
-        self.model.embed_tokens = value
-
-    def get_output_embeddings(self):
-        return self.lm_head
-
-    def set_output_embeddings(self, new_embeddings):
-        self.lm_head = new_embeddings
-
-    def set_decoder(self, decoder):
-        self.model = decoder
-
-    def get_decoder(self):
-        return self.model
-
-    def tie_weights(self):
-        """
-        Overwrite `transformers.modeling_utils.PreTrainedModel.tie_weights` to handle the case of DecoupledLinear and DecoupledEmbedding.
-        """
-        output_embeddings = self.get_output_embeddings()
-        input_embeddings = self.get_input_embeddings()
-
-        if getattr(self.config, "tie_word_embeddings", True):
-            output_embeddings.weight = input_embeddings.weight
-            if input_embeddings.num_additional_embeddings > 0:
-                assert output_embeddings.out_additional_features == input_embeddings.num_additional_embeddings
-                output_embeddings.additional_fc.weight = input_embeddings.additional_embedding.weight
-
-        if hasattr(output_embeddings, "out_features") and hasattr(input_embeddings, "num_embeddings"):
-            output_embeddings.out_features = input_embeddings.num_embeddings
-            if hasattr(output_embeddings, "out_additional_features") and hasattr(
-                input_embeddings, "num_additional_embeddings"
-            ):
-                output_embeddings.out_additional_features = input_embeddings.num_additional_embeddings
-
-    @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
-    @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
-    def forward(
-        self,
-        input_ids: torch.LongTensor = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        pixel_values: Optional[torch.FloatTensor] = None,
-        image_embeddings: Optional[torch.FloatTensor] = None,
-        image_attention_mask: Optional[torch.Tensor] = None,
-        labels: Optional[torch.LongTensor] = None,
-        use_cache: Optional[bool] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-    ) -> Union[Tuple, CausalLMOutputWithPast]:
-        r"""
-        Args:
-            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
-                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
-                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
-                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
-
-        Returns:
-
-        Example:
-
-        ```python
-        >>> from transformers import AutoTokenizer, LlamaForCausalLM
-
-        >>> model = LlamaForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
-        >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
-
-        >>> prompt = "Hey, are you consciours? Can you talk to me?"
-        >>> inputs = tokenizer(prompt, return_tensors="pt")
-
-        >>> # Generate
-        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
-        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
-        "Hey, are you consciours? Can you talk to me?\nI'm not consciours, but I can talk to you."
-        ```"""
-
-        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
-        output_hidden_states = (
-            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
-        )
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
-        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
-        outputs = self.model(
-            input_ids=input_ids,
-            attention_mask=attention_mask,
-            position_ids=position_ids,
-            past_key_values=past_key_values,
-            inputs_embeds=inputs_embeds,
-            pixel_values=pixel_values,
-            image_embeddings=image_embeddings,
-            image_attention_mask=image_attention_mask,
-            use_cache=use_cache,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
-        )
-
-        hidden_states = outputs[0]
-        logits = self.lm_head(hidden_states)
-
-        loss = None
-        if labels is not None:
-            # Shift so that tokens < n predict n
-            if attention_mask is not None:
-                shift_attention_mask = attention_mask[..., 1:]
-                shift_logits = logits[..., :-1, :][shift_attention_mask != 0].contiguous()
-                shift_labels = labels[..., 1:][shift_attention_mask != 0].contiguous()
-            else:
-                shift_logits = logits[..., :-1, :].contiguous()
-                shift_labels = labels[..., 1:].contiguous()
-            # Flatten the tokens
-            loss_fct = CrossEntropyLoss()
-            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
-
-        if not return_dict:
-            output = (logits,) + outputs[1:]
-            return (loss,) + output if loss is not None else output
-
-        return CausalLMOutputWithPast(
-            loss=loss,
-            logits=logits,
-            past_key_values=outputs.past_key_values,
-            hidden_states=outputs.hidden_states,
-            attentions=outputs.attentions,
-        )
-
-    def prepare_inputs_for_generation(self, input_ids, past=None, **kwargs):
-        inputs = prepare_inputs_for_generation(input_ids, past=past, **kwargs)
-        unwanted_kwargs = ["token_type_ids"]
-        for kwarg in unwanted_kwargs:
-            inputs.pop(kwarg, None)
-        return inputs
-
-    @staticmethod
-    def _expand_inputs_for_generation(
-        *args,
-        **model_kwargs,
-    ):
-        return expand_inputs_for_generation(*args, **model_kwargs)
-
-    @staticmethod
-    def _update_model_kwargs_for_generation(outputs, model_kwargs, is_encoder_decoder=False):
-        return update_model_kwargs_for_generation(outputs, model_kwargs, is_encoder_decoder=is_encoder_decoder)
-
-    @staticmethod
-    def _reorder_cache(past, beam_idx):
-        reordered_past = ()
-        for layer_past in past:
-            reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),)
-        return reordered_past
-
-    def get_model_tflops_per_batch_per_gpu(self, hparams, data_param, tokenizer, max_num_images):
-        config_vl_model = self.config
-
-        language_embed_size = config_vl_model.hidden_size
-        num_language_layers = config_vl_model.num_hidden_layers
-        ffn_inner_size = config_vl_model.intermediate_size
-
-        vision_config = self.model.vision_model.config
-        if hasattr(vision_config, "vision_config"):
-            vision_config = vision_config.vision_config
-
-        # Get vision model blocks infos
-        vision_patch_size = vision_config.patch_size
-        vision_hidden_size = vision_config.hidden_size
-        num_vision_layers = vision_config.num_hidden_layers
-        # The +1 is for the CLS token
-        single_image_seq_len = (vision_config.image_size // vision_patch_size) ** 2 + 1
-        vision_exp_factor = vision_config.intermediate_size // vision_hidden_size
-
-        # Get language and cross-att blocks infos
-        num_cross_attn_layers = num_language_layers // config_vl_model.cross_layer_interval
-        language_seq_len = data_param.max_seq_len
-        language_exp_factor = (ffn_inner_size // language_embed_size) if ffn_inner_size is not None else 4
-        cross_att_exp_factor = (ffn_inner_size // language_embed_size) if ffn_inner_size is not None else 4
-        k_v_cross_attn_seq_len = (
-            (self.config.resampler_n_latents * max_num_images)
-            if self.config.use_resampler
-            else (single_image_seq_len * max_num_images)
-        )
-
-        language_tflops_per_batch_per_gpu = compute_tflops_per_batch_per_gpu(
-            num_layers=num_language_layers,
-            batch_size=hparams.batch_size_per_gpu,
-            q_seq_len=language_seq_len,
-            k_seq_len=language_seq_len,
-            hidden_size=language_embed_size,
-            kv_in_dim=language_embed_size,
-            ff_exp_factor=language_exp_factor,
-            grad_acc_size=hparams.grad_acc_size,
-            swiglu=True,
-            vocab_size=tokenizer.vocab_size,
-            count_backward=True,  # Always True regardless of freezing, because gradients are computed for cross-attentions
-            use_grad_checkpointing=hparams.gradient_checkpointing,
-        )
-        cross_attention_tflops_per_batch_per_gpu = compute_tflops_per_batch_per_gpu(
-            num_layers=num_cross_attn_layers,
-            batch_size=hparams.batch_size_per_gpu,
-            q_seq_len=language_seq_len,
-            k_seq_len=k_v_cross_attn_seq_len,
-            hidden_size=language_embed_size,
-            kv_in_dim=vision_hidden_size,
-            ff_exp_factor=cross_att_exp_factor,
-            grad_acc_size=hparams.grad_acc_size,
-            swiglu=True,
-            vocab_size=None,
-            count_backward=True,
-            use_grad_checkpointing=hparams.gradient_checkpointing,
-        )
-        vision_tflops_per_batch_per_gpu = compute_tflops_per_batch_per_gpu(
-            num_layers=num_vision_layers,
-            batch_size=hparams.batch_size_per_gpu * max_num_images,
-            q_seq_len=single_image_seq_len,
-            k_seq_len=single_image_seq_len,
-            hidden_size=vision_hidden_size,
-            kv_in_dim=vision_hidden_size,
-            ff_exp_factor=vision_exp_factor,
-            grad_acc_size=hparams.grad_acc_size,
-            swiglu=False,
-            vocab_size=None,
-            count_backward=not hparams.model_params["freeze_vision_layers"],
-            use_grad_checkpointing=hparams.gradient_checkpointing,
-        )
-        if self.config.use_resampler:
-            perceiver_tflops_per_batch_per_gpu = compute_perceiver_tflops_per_batch_per_gpu(
-                num_layers=self.config.resampler_depth,
-                batch_size=hparams.batch_size_per_gpu * max_num_images,
-                q_seq_len=self.config.resampler_n_latents,
-                vision_embed_seq_len=single_image_seq_len,
-                q_k_v_input_dim=vision_hidden_size,
-                attention_hidden_size=self.config.resampler_n_heads * self.config.resampler_head_dim,
-                ff_exp_factor=cross_att_exp_factor,
-                count_backward=True,
-                use_grad_checkpointing=hparams.gradient_checkpointing,
-            )
-            flop_count = (
-                language_tflops_per_batch_per_gpu
-                + cross_attention_tflops_per_batch_per_gpu
-                + vision_tflops_per_batch_per_gpu
-                + perceiver_tflops_per_batch_per_gpu
-            )
-        else:
-            flop_count = (
-                language_tflops_per_batch_per_gpu
-                + cross_attention_tflops_per_batch_per_gpu
-                + vision_tflops_per_batch_per_gpu
-            )
-        return flop_count
diff --git a/m4/models/vopt/__init__.py b/m4/models/vopt/__init__.py
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/m4/models/vopt/configuration_vopt.py b/m4/models/vopt/configuration_vopt.py
deleted file mode 100644
index 83783c5f12399d6a7e1086cebcc0cb692abe6637..0000000000000000000000000000000000000000
--- a/m4/models/vopt/configuration_vopt.py
+++ /dev/null
@@ -1,250 +0,0 @@
-# coding=utf-8
-# Copyright 2022 The Metaseq Authors and The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" OPT model configuration"""
-import os
-from typing import Tuple, Union
-
-from transformers import AutoConfig
-from transformers.configuration_utils import PretrainedConfig
-from transformers.utils import logging
-
-
-logger = logging.get_logger(__name__)
-
-OPT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
-    "facebook/opt-125m": "https://huggingface.co/facebook/opt-125m/blob/main/config.json",
-    "facebook/opt-350m": "https://huggingface.co/facebook/opt-350m/blob/main/config.json",
-    "facebook/opt-1.3b": "https://huggingface.co/facebook/opt-1.3b/blob/main/config.json",
-    "facebook/opt-2.7b": "https://huggingface.co/facebook/opt-2.7b/blob/main/config.json",
-    "facebook/opt-6.7b": "https://huggingface.co/facebook/opt-6.7b/blob/main/config.json",
-    "facebook/opt-13b": "https://huggingface.co/facebook/opt-13b/blob/main/config.json",
-}
-
-
-class VOPTConfig(PretrainedConfig):
-    r"""
-    This is the configuration class to store the configuration of a [`OPTModel`]. It is used to instantiate a OPT model
-    according to the specified arguments, defining the model architecture. Instantiating a configuration with the
-    defaults will yield a similar configuration to that of the OPT
-    [facebook/opt-350m](https://huggingface.co/facebook/opt-350m) architecture.
-
-    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
-    documentation from [`PretrainedConfig`] for more information.
-
-    TODO: this doc is completely out of sync with the actual args
-
-    Args:
-        vocab_size (`int`, *optional*, defaults to 50272):
-            Vocabulary size of the OPT model. Defines the number of different tokens that can be represented by the
-            `inputs_ids` passed when calling [`OPTModel`]
-        additional_vocab_size (`int`, *optional`, defaults to 0):
-            Additional vocabulary size of the model, typically for the special "<img>" token. Additional vocab tokens
-            are always trainable whereas regular vocab tokens can be frozen or not.
-        hidden_size (`int`, *optional*, defaults to 768):
-            Dimensionality of the layers and the pooler layer.
-        num_hidden_layers (`int`, *optional*, defaults to 12):
-            Number of decoder layers.
-        ffn_dim (`int`, *optional*, defaults to 3072):
-            Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
-        num_attention_heads (`int`, *optional*, defaults to 12):
-            Number of attention heads for each attention layer in the Transformer decoder.
-        activation_function (`str` or `function`, *optional*, defaults to `"relu"`):
-            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
-            `"relu"`, `"silu"` and `"gelu_new"` are supported.
-        max_position_embeddings (`int`, *optional*, defaults to 2048):
-            The maximum sequence length that this model might ever be used with. Typically set this to something large
-            just in case (e.g., 512 or 1024 or 2048).
-        do_layer_norm_before (`bool`, *optional*, defaults to `True`):
-            Whether to perform layer normalization before the attention block.
-        word_embed_proj_dim (`int`, *optional*):
-            `word_embed_proj_dim` can be set to down-project word embeddings, *e.g.* `opt-350m`. Defaults to
-            `hidden_size`.
-        dropout (`float`, *optional*, defaults to 0.1):
-            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-        attention_dropout (`float`, *optional*, defaults to 0.0):
-            The dropout ratio for the attention probabilities.
-        layerdrop: (`float`, *optional*, defaults to 0.0):
-            The LayerDrop probability. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556) for more
-            details.
-        init_std (`float`, *optional*, defaults to 0.02):
-            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        alpha_initializer (`str`, *optional*, defaults to `"ones"`):
-            Initialization type for the alphas.
-        alphas_initializer_range (`float`, *optional*, defaults to 0.0):
-            The standard deviation of the truncated_normal_initializer for initializing the alphas in the Gated Cross Attention.
-        alpha_type (`str`, *optional*, defaults to `"vector"`):
-            Whether the gating alphas should be vectors or single floats.
-        use_cache (`bool`, *optional*, defaults to `True`):
-            Whether or not the model should return the last key/values attentions (not used by all models).
-        cross_layer_interval (`int`, *optional*, default to 1)
-            Interval for cross attention (from text to image) layers.
-    Example:
-
-    ```python
-    >>> from transformers import OPTModel, OPTConfig
-
-    >>> # Initializing a OPT facebook/opt-large style configuration
-    >>> configuration = OPTConfig()
-
-    >>> # Initializing a model from the facebook/opt-large style configuration
-    >>> model = OPTModel(configuration)
-
-    >>> # Accessing the model configuration
-    >>> configuration = model.config
-    ```"""
-    model_type = "vopt"
-    keys_to_ignore_at_inference = ["past_key_values"]
-
-    def __init__(
-        self,
-        vocab_size=50272,
-        additional_vocab_size=0,
-        hidden_size=768,
-        num_hidden_layers=12,
-        ffn_dim=3072,
-        max_position_embeddings=2048,
-        do_layer_norm_before=True,
-        _remove_final_layer_norm=False,
-        word_embed_proj_dim=None,
-        dropout=0.1,
-        attention_dropout=0.0,
-        num_attention_heads=12,
-        activation_function="relu",
-        layerdrop=0.0,
-        init_std=0.02,
-        alpha_initializer="ones",
-        alphas_initializer_range=0.0,
-        alpha_type="vector",
-        use_cache=True,
-        pad_token_id=1,
-        bos_token_id=2,
-        eos_token_id=2,
-        cross_layer_interval=1,
-        cross_layer_activation_function="swiglu",
-        normformer_layer_norms=False,
-        qk_layer_norms=False,
-        rms_norm=False,
-        qk_layer_norms_perceiver=False,
-        tie_word_embeddings=False,
-        freeze_text_layers=True,
-        freeze_text_module_exceptions=[],
-        freeze_lm_head=False,
-        freeze_vision_layers=True,
-        freeze_vision_module_exceptions=[],
-        vision_model_name="google/vit-base-patch16-224",
-        vision_model_params="{}",
-        vision_embed_dim=768,
-        vision_image_size=224,
-        image_token_index=50257,  # TODO: change this to right value
-        use_resampler=False,
-        resampler_n_latents=64,
-        resampler_depth=6,
-        resampler_n_heads=16,
-        resampler_head_dim=96,
-        **kwargs,
-    ):
-        super().__init__(
-            pad_token_id=pad_token_id,
-            bos_token_id=bos_token_id,
-            eos_token_id=eos_token_id,
-            tie_word_embeddings=tie_word_embeddings,
-            **kwargs,
-        )
-        self.vocab_size = vocab_size
-        self.additional_vocab_size = additional_vocab_size
-        self.max_position_embeddings = max_position_embeddings
-        self.num_attention_heads = num_attention_heads
-        self.word_embed_proj_dim = word_embed_proj_dim if word_embed_proj_dim is not None else hidden_size
-        self.ffn_dim = ffn_dim
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.dropout = dropout
-        self.attention_dropout = attention_dropout
-        self.activation_function = activation_function
-        self.init_std = init_std
-        self.alpha_initializer = alpha_initializer
-        self.alphas_initializer_range = alphas_initializer_range
-        self.alpha_type = alpha_type
-        self.layerdrop = layerdrop
-        self.use_cache = use_cache
-        self.do_layer_norm_before = do_layer_norm_before
-
-        # Note that the only purpose of `_remove_final_layer_norm` is to keep backward compatibility
-        # with checkpoints that have been fine-tuned before transformers v4.20.1
-        # see https://github.com/facebookresearch/metaseq/pull/164
-        self._remove_final_layer_norm = _remove_final_layer_norm
-
-        self.cross_layer_interval = cross_layer_interval
-        self.cross_layer_activation_function = cross_layer_activation_function
-        self.normformer_layer_norms = normformer_layer_norms
-        self.qk_layer_norms = qk_layer_norms
-        self.rms_norm = rms_norm
-        self.qk_layer_norms_perceiver = qk_layer_norms_perceiver
-        self.freeze_vision_layers = freeze_vision_layers
-        self.vision_model_name = vision_model_name
-        self.vision_model_params = vision_model_params
-
-        self.tie_word_embeddings = tie_word_embeddings
-        self.freeze_text_layers = freeze_text_layers
-        self.freeze_text_module_exceptions = freeze_text_module_exceptions
-        self.freeze_vision_module_exceptions = freeze_vision_module_exceptions
-        self.freeze_lm_head = freeze_lm_head
-        self.image_token_index = image_token_index
-
-        self.vision_embed_dim = vision_embed_dim
-        self.vision_image_size = vision_image_size
-
-        # Resampler params
-        self.use_resampler = use_resampler
-        self.resampler_n_latents = resampler_n_latents
-        self.resampler_depth = resampler_depth
-        self.resampler_n_heads = resampler_n_heads
-        self.resampler_head_dim = resampler_head_dim
-
-        # IMPORTANT: Do not do any __init__ args-based checks in the constructor, since
-        # PretrainedConfig.from_dict first instantiates the class with the config dict and only then
-        # updates the config object with `kwargs` from from_pretrained, so during the instantiation
-        # of this object many attributes have default values and haven't yet been overridden.
-        # Do any required checks inside `from_pretrained` once the superclass' `from_pretrained` was run.
-
-    def check_compatibilities(self):
-        vision_model_params = eval(self.vision_model_params)
-        config = AutoConfig.from_pretrained(self.vision_model_name, **vision_model_params)
-        if hasattr(config, "vision_config"):
-            vision_config = config.vision_config
-        else:
-            vision_config = config
-        vision_embed_dim = vision_config.hidden_size
-        if self.vision_embed_dim != vision_embed_dim:
-            raise ValueError(
-                f"vision_embed_dim ({self.vision_embed_dim}) must match the hidden size of the vision model"
-                f" ({vision_embed_dim})"
-            )
-        vision_image_size = vision_config.image_size
-        if self.vision_image_size != vision_image_size:
-            raise ValueError(
-                f"vision_image_size ({self.vision_image_size}) must match the hidden size of the vision model"
-                f" ({vision_image_size})"
-            )
-
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
-        outputs = super(VOPTConfig, cls).from_pretrained(pretrained_model_name_or_path, **kwargs)
-        if isinstance(outputs, Tuple):
-            # When called with return_unused_kwargs=True, the first item will be the config
-            outputs[0].check_compatibilities()
-        else:
-            outputs.check_compatibilities()
-        return outputs
diff --git a/m4/models/vopt/make_tiny_model.py b/m4/models/vopt/make_tiny_model.py
deleted file mode 100644
index 8b071a2c85dc1eacd92d9356d502074829228d72..0000000000000000000000000000000000000000
--- a/m4/models/vopt/make_tiny_model.py
+++ /dev/null
@@ -1,114 +0,0 @@
-#!/usr/bin/env python
-
-# This script creates a super tiny model that is useful inside tests, when we just want to test that
-# the machinery works, without needing to check the quality of the outcomes.
-#
-# usage: adjust the configs if wanted, but otherwise just run the script
-
-from pathlib import Path
-from types import SimpleNamespace
-
-import torchvision.transforms as transforms
-from PIL import Image
-
-from m4.models.vopt.modeling_vopt import VOPTConfig, VOPTForCausalLM
-from m4.training.packing import image_attention_mask_for_packed_input_ids, incremental_to_binary_attention_mask
-from m4.training.utils import get_tokenizer
-
-
-mname_tiny = "tiny-random-vopt-clip"
-
-path = Path(mname_tiny)
-path.mkdir(parents=True, exist_ok=True)
-
-# from the hardcoded https://github.com/huggingface/m4/blob/adf102f0000cb2632cd8a3ebb87398c65e448a97/m4/training/main.py#L80
-additional_vocab_size = 2
-
-config = VOPTConfig()
-config.update(
-    dict(
-        ffn_dim=64,
-        hidden_size=16,
-        max_position_embeddings=128,
-        num_attention_heads=4,
-        num_hidden_layers=2,
-        word_embed_proj_dim=16,
-        max_new_tokens=100,
-        use_resampler=True,
-        resampler_depth=2,
-        resampler_head_dim=8,
-        resampler_n_heads=2,
-        resampler_n_latents=16,
-        vision_embed_dim=32,
-        vision_image_size=30,
-        vision_model_name="hf-internal-testing/tiny-random-clip",
-        vision_model_params="{}",
-        vocab_size=50265,
-        additional_vocab_size=additional_vocab_size,
-    )
-)
-
-# print(config)
-# can now modify config to say tiny values
-
-model = VOPTForCausalLM.from_config(config)
-# print(model.config)
-# print(model)
-
-tokenizer_config = dict(
-    tokenizer_add_special_tokens="{}",
-    tokenizer_add_tokens=(
-        '[AddedToken("<fake_token_around_image>", rstrip=False, lstrip=False), AddedToken("<image>", rstrip=False,'
-        " lstrip=False)]"
-    ),
-    tokenizer_name="facebook/opt-13b",
-    tokenizer_params='{"use_fast":True}',
-)
-tokenizer_config = SimpleNamespace(**tokenizer_config)
-# print(tokenizer_config)
-
-tokenizer = get_tokenizer(
-    tokenizer_name=tokenizer_config.tokenizer_name,
-    tokenizer_add_tokens=tokenizer_config.tokenizer_add_tokens,
-    tokenizer_add_special_tokens=tokenizer_config.tokenizer_add_special_tokens,
-    tokenizer_params=tokenizer_config.tokenizer_params,
-    additional_vocab_size=model.config.additional_vocab_size,
-    model_vocab_size=model.config.vocab_size,
-)
-assert "<image>" in tokenizer.get_vocab()
-
-# Test w/ one image and one text
-query = "<fake_token_around_image><image><fake_token_around_image>This is a picture of a cat."
-query_tokens = tokenizer(query, return_tensors="pt")
-
-num_images_per_ex = 1
-pixel_values = transforms.ToTensor()(Image.new("RGB", (30, 30))).repeat(1, 1, 1, 1).unsqueeze(0)
-image_attention_mask, _ = image_attention_mask_for_packed_input_ids(query_tokens["input_ids"], tokenizer)
-image_attention_mask = incremental_to_binary_attention_mask(image_attention_mask, num_classes=num_images_per_ex)
-
-input = {
-    "input_ids": query_tokens["input_ids"],
-    "attention_mask": query_tokens["attention_mask"],
-    "pixel_values": pixel_values,
-    "pixel_values": pixel_values,
-    "image_attention_mask": image_attention_mask,
-}
-# debug shapes
-# print(query_tokens["input_ids"].shape)
-# print(query_tokens["attention_mask"].shape)
-# print(pixel_values.shape)
-# print(image_attention_mask.shape)
-
-out_gen = model.generate(**input)
-text = tokenizer.batch_decode(out_gen)
-# print(text)
-
-# Save model + config + tokenizer
-model.half()  # makes it smaller
-model.save_pretrained(path)
-tokenizer.save_pretrained(path)
-
-# test we can load it back
-model = VOPTForCausalLM.from_pretrained(path)
-
-print(f"Generated {mname_tiny} - Upload the generated folder to the hub")
diff --git a/m4/models/vopt/modeling_vopt.py b/m4/models/vopt/modeling_vopt.py
deleted file mode 100644
index 09f1c5a11218215ea438edef19d177199132c9ee..0000000000000000000000000000000000000000
--- a/m4/models/vopt/modeling_vopt.py
+++ /dev/null
@@ -1,1513 +0,0 @@
-# coding=utf-8
-# Copyright 2022 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" PyTorch OPT model."""
-import random
-from typing import List, Optional, Tuple, Union
-
-import torch
-import torch.utils.checkpoint
-from torch import nn
-from torch.nn import CrossEntropyLoss
-from transformers.activations import ACT2FN
-from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
-from transformers.utils import (
-    ContextManagers,
-    add_code_sample_docstrings,
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-    replace_return_docstrings,
-)
-
-from m4.models import DecoupledEmbedding, DecoupledLinear
-from m4.models.common import (
-    expand_inputs_for_generation,
-    prepare_inputs_for_generation,
-    update_model_kwargs_for_generation,
-)
-from m4.models.custom_modules import VLOOMPreTrainedModelBase
-from m4.models.perceiver.perceiver import PerceiverResampler
-from m4.models.vopt.configuration_vopt import VOPTConfig
-from m4.training.utils import (
-    compute_perceiver_tflops_per_batch_per_gpu,
-    compute_tflops_per_batch_per_gpu,
-    deepspeed_gathered_parameters_context_manager,
-    freeze_model,
-)
-from m4.utils import logging
-
-
-logger = logging.get_logger(__name__)
-
-_CHECKPOINT_FOR_DOC = "facebook/opt-350m"
-_CONFIG_FOR_DOC = "VOPTConfig"
-_TOKENIZER_FOR_DOC = "GPT2Tokenizer"
-
-# Base model docstring
-_EXPECTED_OUTPUT_SHAPE = [1, 8, 1024]
-
-# SequenceClassification docstring
-_CHECKPOINT_FOR_SEQUENCE_CLASSIFICATION = "ArthurZ/opt-350m-dummy-sc"
-_SEQ_CLASS_EXPECTED_LOSS = 1.71
-_SEQ_CLASS_EXPECTED_OUTPUT = "'LABEL_0'"
-
-
-OPT_PRETRAINED_MODEL_ARCHIVE_LIST = [
-    "facebook/opt-125m",
-    "facebook/opt-350m",
-    "facebook/opt-1.3b",
-    "facebook/opt-2.7b",
-    "facebook/opt-6.7b",
-    "facebook/opt-13b",
-    "facebook/opt-30b",
-    # See all OPT models at https://huggingface.co/models?filter=opt
-]
-
-
-class SwiGLUActivation(nn.Module):
-    def __init__(self, in_features: int, out_features: int):
-        super().__init__()
-        self.gate = nn.Linear(in_features, out_features, bias=False)
-
-    def forward(self, hidden_states_to_gate, hidden_states):
-        gate = self.gate(hidden_states)
-        return nn.functional.silu(gate) * hidden_states_to_gate
-
-
-# Taken from LLaMA codebase
-class RMSNorm(torch.nn.Module):
-    def __init__(self, dim: int, eps: float = 1e-6):
-        super().__init__()
-        self.eps = eps
-        self.weight = nn.Parameter(torch.ones(dim))
-
-    def _norm(self, x):
-        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
-
-    def forward(self, x):
-        output = self._norm(x.float()).type_as(x)
-        return output * self.weight
-
-
-def _make_causal_mask(input_ids_shape: torch.Size, dtype: torch.dtype, past_key_values_length: int = 0):
-    """
-    Make causal mask used for bi-directional self-attention.
-    """
-    bsz, tgt_len = input_ids_shape
-    mask = torch.full((tgt_len, tgt_len), torch.tensor(torch.finfo(dtype).min))
-    mask_cond = torch.arange(mask.size(-1))
-    mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
-    mask = mask.to(dtype)
-
-    if past_key_values_length > 0:
-        mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype), mask], dim=-1)
-    return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
-
-
-def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
-    """
-    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
-    """
-    bsz, src_len = mask.size()
-    tgt_len = tgt_len if tgt_len is not None else src_len
-
-    expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
-
-    inverted_mask = 1.0 - expanded_mask
-
-    return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
-
-
-class OPTLearnedPositionalEmbedding(nn.Embedding):
-    """
-    This module learns positional embeddings up to a fixed maximum size.
-    """
-
-    def __init__(self, num_embeddings: int, embedding_dim: int):
-        # OPT is set up so that if padding_idx is specified then offset the embedding ids by 2
-        # and adjust num_embeddings appropriately. Other models don't have this hack
-        self.offset = 2
-        super().__init__(num_embeddings + self.offset, embedding_dim)
-
-    def forward(self, attention_mask: torch.LongTensor, past_key_values_length: int = 0):
-        """`input_ids_shape` is expected to be [bsz x seqlen]."""
-        attention_mask = attention_mask.long()
-
-        # create positions depending on attention_mask
-        positions = (torch.cumsum(attention_mask, dim=1).type_as(attention_mask) * attention_mask).long() - 1
-
-        # cut positions if `past_key_values_length` is > 0
-        positions = positions[:, past_key_values_length:]
-
-        return super().forward(positions + self.offset)
-
-
-class OPTAttention(nn.Module):
-    """Multi-headed attention from 'Attention Is All You Need' paper"""
-
-    def __init__(
-        self,
-        embed_dim: int,
-        num_heads: int,
-        dropout: float = 0.0,
-        is_decoder: bool = False,
-        bias: bool = True,
-        is_cross_attention=False,
-        config=None,
-        qk_layer_norms=False,
-    ):
-        super().__init__()
-        self.embed_dim = embed_dim
-        self.num_heads = num_heads
-        self.dropout = dropout
-        self.head_dim = embed_dim // num_heads
-
-        if (self.head_dim * num_heads) != self.embed_dim:
-            raise ValueError(
-                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
-                f" and `num_heads`: {num_heads})."
-            )
-        self.scaling = self.head_dim**-0.5
-        self.is_decoder = is_decoder
-
-        self.is_cross_attention = is_cross_attention
-
-        if self.is_cross_attention:
-            kv_input_dim = self.hidden_size if not hasattr(config, "vision_embed_dim") else config.vision_embed_dim
-            self.k_proj = nn.Linear(kv_input_dim, embed_dim, bias=bias)
-            self.v_proj = nn.Linear(kv_input_dim, embed_dim, bias=bias)
-
-            self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
-        else:
-            self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
-            self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
-            self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
-
-        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
-
-        self.qk_layer_norms = qk_layer_norms
-        if self.qk_layer_norms and config.rms_norm:
-            self.q_layer_norm = RMSNorm(self.head_dim, eps=1e-6)
-            self.k_layer_norm = RMSNorm(self.head_dim, eps=1e-6)
-        elif self.qk_layer_norms:
-            self.q_layer_norm = nn.LayerNorm(self.head_dim)
-            self.k_layer_norm = nn.LayerNorm(self.head_dim)
-
-    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
-        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        key_value_states: Optional[torch.Tensor] = None,
-        past_key_value: Optional[Tuple[torch.Tensor]] = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        layer_head_mask: Optional[torch.Tensor] = None,
-        output_attentions: bool = False,
-    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
-        """Input shape: Batch x Time x Channel"""
-
-        # if key_value_states are provided this layer is used as a cross-attention layer
-        # for the decoder
-        is_cross_attention = self.is_cross_attention or key_value_states is not None
-
-        bsz, tgt_len, _ = hidden_states.size()
-
-        # get query proj
-        query_states = self._shape(self.q_proj(hidden_states), -1, bsz)
-        # get key, value proj
-        if is_cross_attention and past_key_value is not None:
-            # reuse k,v, cross_attentions
-            key_states = past_key_value[0]
-            value_states = past_key_value[1]
-        elif is_cross_attention:
-            # cross_attentions
-            key_states = self._shape(self.k_proj(key_value_states), -1, bsz)
-            value_states = self._shape(self.v_proj(key_value_states), -1, bsz)
-        elif past_key_value is not None:
-            # reuse k, v, self_attention
-            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
-            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
-            key_states = torch.cat([past_key_value[0], key_states], dim=2)
-            value_states = torch.cat([past_key_value[1], value_states], dim=2)
-        else:
-            # self_attention
-            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
-            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
-
-        if self.is_decoder:
-            # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
-            # Further calls to cross_attention layer can then reuse all cross-attention
-            # key/value_states (first "if" case)
-            # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
-            # all previous decoder key/value_states. Further calls to uni-directional self-attention
-            # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
-            # if encoder bi-directional self-attention `past_key_value` is always `None`
-            past_key_value = (key_states, value_states)
-
-        if self.qk_layer_norms:
-            query_states = self.q_layer_norm(query_states)
-            key_states = self.k_layer_norm(key_states)
-
-        src_len = key_states.size(2)
-
-        if attention_mask is not None:
-            if attention_mask.size() != (bsz, 1, tgt_len, src_len):
-                raise ValueError(
-                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
-                )
-        if layer_head_mask is not None:
-            if layer_head_mask.size() != (self.num_heads,):
-                raise ValueError(
-                    f"Head mask for a single layer should be of size {(self.num_heads,)}, but is"
-                    f" {layer_head_mask.size()}"
-                )
-            attention_mask = attention_mask.expand(-1, self.num_heads, -1, -1)
-            attention_mask = attention_mask + layer_head_mask.view(1, -1, 1, 1)
-
-        attn_output = nn.functional.scaled_dot_product_attention(
-            query_states,
-            key_states,
-            value_states,
-            attn_mask=attention_mask,
-            dropout_p=self.dropout,
-        )
-
-        attn_weights_reshaped = None
-        logger.warning_once(
-            "attn_weights are not extracted in scaled_dot_product_attention. The model returns None instead"
-        )
-        attn_output = attn_output.transpose(1, 2)
-
-        # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
-        # partitioned aross GPUs when using tensor-parallelism.
-        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
-
-        attn_output = self.out_proj(attn_output)
-
-        return attn_output, attn_weights_reshaped, past_key_value
-
-
-class OPTDecoderLayer(nn.Module):
-    def __init__(self, config: VOPTConfig):
-        super().__init__()
-        self.embed_dim = config.hidden_size
-        self.self_attn = OPTAttention(
-            embed_dim=self.embed_dim,
-            num_heads=config.num_attention_heads,
-            dropout=config.attention_dropout,
-            is_decoder=True,
-            config=config,
-        )
-        self.do_layer_norm_before = config.do_layer_norm_before
-        self.dropout = config.dropout
-        self.activation_fn = ACT2FN[config.activation_function]
-
-        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
-        self.fc1 = nn.Linear(self.embed_dim, config.ffn_dim)
-        self.fc2 = nn.Linear(config.ffn_dim, self.embed_dim)
-        self.final_layer_norm = nn.LayerNorm(self.embed_dim)
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        attention_mask: Optional[torch.Tensor] = None,
-        layer_head_mask: Optional[torch.Tensor] = None,
-        output_attentions: Optional[bool] = False,
-        use_cache: Optional[bool] = False,
-        past_key_value: Optional[Tuple[torch.Tensor]] = None,
-    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
-        """
-        Args:
-            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
-            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
-                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
-            layer_head_mask (`torch.FloatTensor`, *optional*): mask for attention heads in a given layer of size
-                `(encoder_attention_heads,)`.
-            output_attentions (`bool`, *optional*):
-                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
-                returned tensors for more detail.
-            use_cache (`bool`, *optional*):
-                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
-                (see `past_key_values`).
-            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
-        """
-
-        residual = hidden_states
-
-        # 125m, 1.7B, ..., 175B applies layer norm BEFORE attention
-        if self.do_layer_norm_before:
-            hidden_states = self.self_attn_layer_norm(hidden_states)
-
-        # Self Attention
-        hidden_states, self_attn_weights, present_key_value = self.self_attn(
-            hidden_states=hidden_states,
-            past_key_value=past_key_value,
-            attention_mask=attention_mask,
-            layer_head_mask=layer_head_mask,
-            output_attentions=output_attentions,
-        )
-        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
-        hidden_states = residual + hidden_states
-
-        # 350m applies layer norm AFTER attention
-        if not self.do_layer_norm_before:
-            hidden_states = self.self_attn_layer_norm(hidden_states)
-
-        # Fully Connected
-        hidden_states_shape = hidden_states.shape
-        hidden_states = hidden_states.reshape(-1, hidden_states.size(-1))
-        residual = hidden_states
-
-        # 125m, 1.7B, ..., 175B applies layer norm BEFORE attention
-        if self.do_layer_norm_before:
-            hidden_states = self.final_layer_norm(hidden_states)
-
-        hidden_states = self.fc1(hidden_states)
-        hidden_states = self.activation_fn(hidden_states)
-
-        hidden_states = self.fc2(hidden_states)
-        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
-
-        hidden_states = (residual + hidden_states).view(hidden_states_shape)
-
-        # 350m applies layer norm AFTER attention
-        if not self.do_layer_norm_before:
-            hidden_states = self.final_layer_norm(hidden_states)
-
-        outputs = (hidden_states,)
-
-        if output_attentions:
-            outputs += (self_attn_weights,)
-
-        if use_cache:
-            outputs += (present_key_value,)
-
-        return outputs
-
-
-class VOPTGatedAttentionLayer(nn.Module):
-    def __init__(self, config: VOPTConfig):
-        """
-        Note: Based on `tr_101_cm401xPMD09_nobias`, setting the biases to False in all of the nn.Linear for the gated cross attention.
-        Provide a small stability gain at opt-13b scale.
-        """
-        super().__init__()
-        self.embed_dim = config.hidden_size
-        self.cross_attn = OPTAttention(
-            embed_dim=self.embed_dim,
-            num_heads=config.num_attention_heads,
-            dropout=config.attention_dropout,
-            is_decoder=True,
-            config=config,
-            is_cross_attention=True,
-            bias=False,
-            qk_layer_norms=config.qk_layer_norms,
-        )
-        self.do_layer_norm_before = config.do_layer_norm_before
-        self.normformer_layer_norms = config.normformer_layer_norms
-        self.dropout = config.dropout
-        if config.cross_layer_activation_function == "swiglu":
-            # We cannot put `SwiGLUActivation` in `ACT2FN` because it takes two arguments (`in_features` and
-            # `out_features`) that we don't know until entering this module.
-            self.activation_fn = SwiGLUActivation(self.embed_dim, config.ffn_dim)
-        else:
-            self.activation_fn = ACT2FN[config.cross_layer_activation_function]
-
-        if config.rms_norm:
-            self.self_attn_layer_norm = RMSNorm(self.embed_dim, eps=1e-6)
-        else:
-            self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
-
-        if self.normformer_layer_norms:
-            self.self_attn_post_layer_norm = nn.LayerNorm(self.embed_dim)
-        self.fc1 = nn.Linear(self.embed_dim, config.ffn_dim, bias=False)
-        self.fc2 = nn.Linear(config.ffn_dim, self.embed_dim, bias=False)
-
-        if config.rms_norm:
-            self.final_layer_norm = RMSNorm(self.embed_dim, eps=1e-6)
-        else:
-            self.final_layer_norm = nn.LayerNorm(self.embed_dim)
-
-        if self.normformer_layer_norms:
-            self.mlp_post_layer_norm = nn.LayerNorm(config.ffn_dim)
-
-        self.act_cross_attn = nn.Tanh()
-        self.act_dense = nn.Tanh()
-
-        if config.alpha_initializer == "zeros":
-            if config.alpha_type == "vector":
-                self.alpha_cross_attn = nn.Parameter(torch.zeros(1, 1, self.embed_dim))
-                self.alpha_dense = nn.Parameter(torch.zeros(1, 1, self.embed_dim))
-            elif config.alpha_type == "float":
-                self.alpha_cross_attn = nn.Parameter(torch.zeros(1))
-                self.alpha_dense = nn.Parameter(torch.zeros(1))
-            else:
-                raise ValueError(f"Unknown value for `alpha_type` ({config.alpha_type})")
-
-        elif config.alpha_initializer == "ones":
-            if config.alpha_type == "vector":
-                self.alpha_cross_attn = nn.Parameter(torch.ones(1, 1, self.embed_dim))
-                self.alpha_dense = nn.Parameter(torch.ones(1, 1, self.embed_dim))
-            elif config.alpha_type == "float":
-                self.alpha_cross_attn = nn.Parameter(torch.ones(1))
-                self.alpha_dense = nn.Parameter(torch.ones(1))
-            else:
-                raise ValueError(f"Unknown value for `alpha_type` ({config.alpha_type})")
-
-        elif config.alpha_initializer in {"normal", "gaussian", "random"}:
-            if config.alpha_type == "vector":
-                self.alpha_cross_attn = nn.Parameter(
-                    torch.normal(mean=0.0, std=config.alphas_initializer_range, size=(1, 1, self.embed_dim))
-                )
-                self.alpha_dense = nn.Parameter(
-                    torch.normal(mean=0.0, std=config.alphas_initializer_range, size=(1, 1, self.embed_dim))
-                )
-            elif config.alpha_type == "float":
-                self.alpha_cross_attn = nn.Parameter(
-                    torch.normal(mean=0.0, std=config.alphas_initializer_range, size=(1))
-                )
-                self.alpha_dense = nn.Parameter(torch.normal(mean=0.0, std=config.alphas_initializer_range, size=(1)))
-            else:
-                raise ValueError(f"Unknown value for `alpha_type` ({config.alpha_type})")
-
-        else:
-            raise NotImplementedError(f"Alpha initialization scheme {config.alpha_initializer} not yet implemented!")
-
-        assert hasattr(self, "alpha_cross_attn") and hasattr(self, "alpha_dense")
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        attention_mask: Optional[torch.Tensor] = None,
-        layer_head_mask: Optional[torch.Tensor] = None,
-        image_hidden_states: Optional[torch.Tensor] = None,
-        image_attention_mask: Optional[torch.FloatTensor] = None,
-        output_attentions: Optional[bool] = False,
-        use_cache: Optional[bool] = False,
-        past_key_value: Optional[Tuple[torch.Tensor]] = None,
-    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
-        """
-        Args:
-            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
-            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
-                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
-            layer_head_mask (`torch.FloatTensor`, *optional*): mask for attention heads in a given layer of size
-                `(encoder_attention_heads,)`.
-            output_attentions (`bool`, *optional*):
-                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
-                returned tensors for more detail.
-            use_cache (`bool`, *optional*):
-                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
-                (see `past_key_values`).
-            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
-        """
-        if image_hidden_states is None:
-            raise ValueError(
-                "`image_hidden_states` is required for VOPT cross attention module which are visual features to be"
-                " conditioned on."
-            )
-
-        if past_key_value is not None:
-            raise NotImplementedError("Past key value states are not implemented for VOPT cross attention module.")
-
-        residual = hidden_states
-
-        # 125m, 1.7B, ..., 175B applies layer norm BEFORE attention
-        if self.do_layer_norm_before:
-            hidden_states = self.self_attn_layer_norm(hidden_states)
-
-        # Self Attention
-        hidden_states, self_attn_weights, present_key_value = self.cross_attn(
-            hidden_states=hidden_states,
-            key_value_states=image_hidden_states,
-            attention_mask=image_attention_mask,
-            layer_head_mask=layer_head_mask,
-            output_attentions=output_attentions,
-        )
-        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
-        if self.normformer_layer_norms:
-            hidden_states = self.self_attn_post_layer_norm(hidden_states)
-        hidden_states = residual + self.act_cross_attn(self.alpha_cross_attn) * hidden_states
-
-        # 350m applies layer norm AFTER attention
-        if not self.do_layer_norm_before:
-            hidden_states = self.self_attn_layer_norm(hidden_states)
-
-        # Fully Connected
-        hidden_states_shape = hidden_states.shape
-        hidden_states = hidden_states.reshape(-1, hidden_states.size(-1))
-        residual = hidden_states
-
-        # 125m, 1.7B, ..., 175B applies layer norm BEFORE attention
-        if self.do_layer_norm_before:
-            hidden_states = self.final_layer_norm(hidden_states)
-
-        hidden_states_to_gate = self.fc1(hidden_states)
-        if isinstance(self.activation_fn, SwiGLUActivation):
-            hidden_states = self.activation_fn(hidden_states_to_gate, hidden_states)
-        else:
-            hidden_states = self.activation_fn(hidden_states_to_gate)
-
-        if self.normformer_layer_norms:
-            hidden_states = self.mlp_post_layer_norm(hidden_states)
-        hidden_states = self.fc2(hidden_states)
-        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
-
-        hidden_states = (residual + self.act_dense(self.alpha_dense) * hidden_states).view(hidden_states_shape)
-
-        # 350m applies layer norm AFTER attention
-        if not self.do_layer_norm_before:
-            hidden_states = self.final_layer_norm(hidden_states)
-
-        outputs = (hidden_states,)
-
-        if output_attentions:
-            outputs += (self_attn_weights,)
-
-        if use_cache:
-            outputs += (present_key_value,)
-
-        return outputs
-
-
-OPT_START_DOCSTRING = r"""
-    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
-    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
-    etc.)
-
-    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
-    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
-    and behavior.
-
-    Parameters:
-        config ([`VOPTConfig`]):
-            Model configuration class with all the parameters of the model. Initializing with a config file does not
-            load the weights associated with the model, only the configuration. Check out the
-            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
-"""
-
-
-@add_start_docstrings(
-    "The bare OPT Model outputting raw hidden-states without any specific head on top.",
-    OPT_START_DOCSTRING,
-)
-class VOPTPreTrainedModel(VLOOMPreTrainedModelBase):
-    config_class = VOPTConfig
-    base_model_prefix = "model"
-    supports_gradient_checkpointing = True
-    _no_split_modules = ["OPTDecoderLayer", "VOPTGatedAttentionLayer", "CLIPEncoderLayer"]
-    _keys_to_ignore_on_load_unexpected = [r"decoder\.version"]
-
-    def _init_weights(self, module):
-        def init_a_linear(module, mean=0.0, std=self.config.init_std):
-            with ContextManagers(deepspeed_gathered_parameters_context_manager(module.weight, modify=True)):
-                module.weight.data.normal_(mean=mean, std=std)
-                if module.bias is not None:
-                    with ContextManagers(deepspeed_gathered_parameters_context_manager(module.bias, modify=True)):
-                        module.bias.data.zero_()
-
-        if isinstance(module, VOPTGatedAttentionLayer):
-            for sub_module_name, sub_module in module.named_modules():
-                if isinstance(sub_module, nn.Linear):
-                    if "fc2" in sub_module_name:
-                        factor = 2 * self.config.num_hidden_layers
-                    else:
-                        factor = 1.0
-                    init_a_linear(sub_module, std=(0.4 / (sub_module.in_features * factor)) ** 0.5)
-        elif isinstance(module, PerceiverResampler):
-            with ContextManagers(deepspeed_gathered_parameters_context_manager(module.latents, modify=True)):
-                module.latents.data.normal_(mean=0.0, std=(1.0 / self.config.vision_embed_dim) ** 0.5)
-            for sub_module_name, sub_module in module.named_modules():
-                if isinstance(sub_module, nn.Linear):
-                    if "c_proj" in sub_module_name:
-                        factor = 2 * self.config.num_hidden_layers
-                    else:
-                        factor = 1.0
-                    init_a_linear(sub_module, std=(0.4 / (self.config.vision_embed_dim * factor)) ** 0.5)
-        elif isinstance(module, nn.Embedding):
-            with ContextManagers(deepspeed_gathered_parameters_context_manager(module.weight, modify=True)):
-                module.weight.data.normal_(mean=0.0, std=(1.0 / self.config.hidden_size) ** 0.5)
-                if module.padding_idx is not None:
-                    module.weight.data[module.padding_idx].zero_()
-        elif isinstance(module, DecoupledLinear):
-            if hasattr(module, "additional_fc"):
-                init_a_linear(module.additional_fc, std=(1.0 / (module.additional_fc.in_features)) ** 0.5)
-
-    def _set_gradient_checkpointing(self, module, value=False):
-        if isinstance(module, (VOPTDecoder)):
-            module.gradient_checkpointing = value
-
-    @classmethod
-    def override_vision_model_wrapper(cls, model, config, vision_model_name, vision_model_params, torch_dtype):
-        # this can be called via from_pretrained from a class w/ head or w/o head so we extract the beheaded model version
-        beheaded_model = model.model if hasattr(model, "model") else model
-        cls.override_vision_model(beheaded_model.decoder, vision_model_name, vision_model_params, torch_dtype)
-        beheaded_model.freeze_relevant_params(config)
-
-
-OPT_INPUTS_DOCSTRING = r"""
-    Args:
-        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
-            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
-            it.
-
-            Indices can be obtained using [`GPT2Tokenizer`]. See [`PreTrainedTokenizer.encode`] and
-            [`PreTrainedTokenizer.__call__`] for details.
-
-            [What are input IDs?](../glossary#input-ids)
-        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
-
-            - 1 for tokens that are **not masked**,
-            - 0 for tokens that are **masked**.
-
-            [What are attention masks?](../glossary#attention-mask)
-
-            Indices can be obtained using [`OPTTokenizer`]. See [`PreTrainedTokenizer.encode`] and
-            [`PreTrainedTokenizer.__call__`] for details.
-
-            If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
-            `past_key_values`).
-
-            If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
-            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
-            information on the default strategy.
-        head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
-            Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`:
-
-            - 1 indicates the head is **not masked**,
-            - 0 indicates the head is **masked**.
-
-        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
-            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
-            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
-            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
-
-            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
-            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
-
-            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
-            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
-            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
-        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
-            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
-            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
-            model's internal embedding lookup matrix.
-        use_cache (`bool`, *optional*):
-            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
-            `past_key_values`).
-        output_attentions (`bool`, *optional*):
-            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
-            tensors for more detail.
-        output_hidden_states (`bool`, *optional*):
-            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
-            more detail.
-        return_dict (`bool`, *optional*):
-            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
-"""
-
-
-class VOPTDecoder(VOPTPreTrainedModel):
-    """
-    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`OPTDecoderLayer`]
-
-    Args:
-        config: VOPTConfig
-    """
-
-    def __init__(self, config: VOPTConfig, vision_model=None):
-        super().__init__(config)
-        self.config = config
-        self.dropout = config.dropout
-        self.layerdrop = config.layerdrop
-        self.padding_idx = config.pad_token_id
-        self.max_target_positions = config.max_position_embeddings
-        self.vocab_size = config.vocab_size
-
-        self.embed_tokens = DecoupledEmbedding(
-            num_embeddings=config.vocab_size,
-            num_additional_embeddings=config.additional_vocab_size,
-            embedding_dim=config.word_embed_proj_dim,
-            partially_freeze=config.freeze_text_layers,
-            padding_idx=self.padding_idx,
-        )
-        self.embed_positions = OPTLearnedPositionalEmbedding(config.max_position_embeddings, config.hidden_size)
-
-        # Load an uninitialized model and later in from_pretrained will load the pre-trained model -
-        # this solves the losing of weights in `from_pretrained` on the main model
-        self.vision_model = vision_model
-
-        # Perceiver Resampler
-        if config.use_resampler:
-            self.perceiver_resampler = PerceiverResampler(
-                self.config,
-                self.config.vision_embed_dim,
-                config.resampler_depth,
-                config.resampler_n_heads,
-                config.resampler_head_dim,
-                config.resampler_n_latents,
-            )
-
-        if config.word_embed_proj_dim != config.hidden_size:
-            self.project_in = nn.Linear(config.word_embed_proj_dim, config.hidden_size, bias=False)
-        else:
-            self.project_in = None
-
-        self.layers = nn.ModuleList([OPTDecoderLayer(config) for _ in range(config.num_hidden_layers)])
-
-        self.cross_layer_interval = config.cross_layer_interval
-        num_cross_layers = config.num_hidden_layers // self.cross_layer_interval
-        self.gated_cross_attn_layers = nn.ModuleList(
-            [VOPTGatedAttentionLayer(config) for i in range(num_cross_layers)]
-        )
-        self.gradient_checkpointing = False
-
-        # Note that the only purpose of `config._remove_final_layer_norm` is to keep backward compatibility
-        # with checkpoints that have been fine-tuned before transformers v4.20.1
-        # see https://github.com/facebookresearch/metaseq/pull/164
-        if config.do_layer_norm_before and not config._remove_final_layer_norm:
-            self.final_layer_norm = nn.LayerNorm(config.hidden_size)
-        else:
-            self.final_layer_norm = None
-
-        if config.word_embed_proj_dim != config.hidden_size:
-            self.project_out = nn.Linear(config.hidden_size, config.word_embed_proj_dim, bias=False)
-        else:
-            self.project_out = None
-
-        # Initialize weights and apply final processing
-        self.post_init()
-
-    def get_input_embeddings(self):
-        return self.embed_tokens
-
-    def set_input_embeddings(self, value):
-        self.embed_tokens = value
-
-    # Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask
-    def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
-        # create causal mask
-        # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
-        combined_attention_mask = None
-        if input_shape[-1] > 1:
-            combined_attention_mask = _make_causal_mask(
-                input_shape, inputs_embeds.dtype, past_key_values_length=past_key_values_length
-            ).to(inputs_embeds.device)
-
-        if attention_mask is not None:
-            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
-            expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to(
-                inputs_embeds.device
-            )
-            combined_attention_mask = (
-                expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
-            )
-
-        return combined_attention_mask
-
-    def forward(
-        self,
-        input_ids: torch.LongTensor = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        head_mask: Optional[torch.Tensor] = None,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        pixel_values: Optional[torch.FloatTensor] = None,
-        image_embeddings: Optional[torch.FloatTensor] = None,
-        image_attention_mask: Optional[torch.Tensor] = None,
-        crossblock_head_mask: Optional[torch.Tensor] = None,  # TOFO (ls): check if this is needed
-        use_cache: Optional[bool] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-    ) -> Union[Tuple, BaseModelOutputWithPast]:
-        r"""
-        Args:
-            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
-                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
-                provide it.
-
-                Indices can be obtained using [`OPTTokenizer`]. See [`PreTrainedTokenizer.encode`] and
-                [`PreTrainedTokenizer.__call__`] for details.
-
-                [What are input IDs?](../glossary#input-ids)
-            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
-                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
-
-                - 1 for tokens that are **not masked**,
-                - 0 for tokens that are **masked**.
-
-                [What are attention masks?](../glossary#attention-mask)
-            head_mask (`torch.Tensor` of shape `(num_hidden_layers, num_attention_heads)`, *optional*):
-                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
-
-                - 1 indicates the head is **not masked**,
-                - 0 indicates the head is **masked**.
-
-            past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
-                Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
-                shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
-
-                Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
-                cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
-
-                If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
-                that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
-                all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
-
-            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
-                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
-                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
-                than the model's internal embedding lookup matrix.
-            output_attentions (`bool`, *optional*):
-                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
-                returned tensors for more detail.
-            output_hidden_states (`bool`, *optional*):
-                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
-                for more detail.
-            return_dict (`bool`, *optional*):
-                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
-        """
-        device = input_ids.device if input_ids is not None else inputs_embeds.device
-
-        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
-        output_hidden_states = (
-            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
-        )
-        use_cache = use_cache if use_cache is not None else self.config.use_cache
-
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
-        # retrieve input_ids and inputs_embeds
-        if input_ids is not None and inputs_embeds is not None:
-            raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
-        elif input_ids is not None:
-            input_shape = input_ids.size()
-            input_ids = input_ids.view(-1, input_shape[-1])
-        elif inputs_embeds is not None:
-            input_shape = inputs_embeds.size()[:-1]
-        else:
-            raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
-
-        past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0
-
-        if pixel_values is not None and image_embeddings is not None:
-            raise ValueError("You cannot specify both pixel_values and image_embeddings at the same time")
-        elif pixel_values is not None:
-            pixel_values = pixel_values.to(dtype=self.dtype, device=input_ids.device)  # fp16 compatibility
-            batch_size, num_images = pixel_values.size(0), pixel_values.size(1)
-            pixel_values = pixel_values.contiguous().view(batch_size * num_images, *pixel_values.shape[2:])
-            # Get sequence from the vision encoder
-            image_hidden_states = self.vision_model(pixel_values=pixel_values).last_hidden_state
-        elif image_embeddings is not None:
-            batch_size, num_images, image_seq_len, image_hidden_size = image_embeddings.size()
-            image_hidden_states = image_embeddings.to(dtype=self.dtype, device=input_ids.device)
-            image_hidden_states = image_hidden_states.view(batch_size * num_images, image_seq_len, image_hidden_size)
-
-        if self.config.use_resampler:
-            image_hidden_states = self.perceiver_resampler(image_hidden_states)
-        image_seq_len, image_hidden_size = image_hidden_states.size(1), image_hidden_states.size(2)
-        image_hidden_states = image_hidden_states.view(batch_size, num_images * image_seq_len, image_hidden_size)
-        # Make image_attention_mask compatible with hidden states
-        text_seq_len = image_attention_mask.size(1)
-        image_attention_mask = image_attention_mask.unsqueeze(-1)
-        image_attention_mask = image_attention_mask.repeat(1, 1, 1, image_seq_len)
-        image_attention_mask = image_attention_mask.view(batch_size, text_seq_len, num_images * image_seq_len)
-
-        if image_hidden_states is not None:
-            image_batch_size, image_sequence_length, _ = image_hidden_states.size()
-            image_hidden_shape = (image_batch_size, image_sequence_length)
-            if image_attention_mask is None:
-                image_attention_mask = torch.ones(image_hidden_shape, device=device)
-            image_attention_mask = self.invert_attention_mask(image_attention_mask)
-        else:
-            image_attention_mask = None
-
-        if inputs_embeds is None:
-            inputs_embeds = self.embed_tokens(input_ids)
-
-        # embed positions
-        if attention_mask is None:
-            attention_mask = torch.ones(inputs_embeds.shape[:2], dtype=torch.bool, device=inputs_embeds.device)
-        pos_embeds = self.embed_positions(attention_mask, past_key_values_length)
-
-        attention_mask = self._prepare_decoder_attention_mask(
-            attention_mask, input_shape, inputs_embeds, past_key_values_length
-        )
-
-        if self.project_in is not None:
-            inputs_embeds = self.project_in(inputs_embeds)
-
-        hidden_states = inputs_embeds + pos_embeds
-
-        # decoder layers
-        all_hidden_states = () if output_hidden_states else None
-        all_self_attns = () if output_attentions else None
-        next_decoder_cache = () if use_cache else None
-
-        # check if head_mask has a correct number of layers specified if desired
-        for attn_mask, mask_name in zip([head_mask], ["head_mask"]):
-            if attn_mask is not None:
-                if attn_mask.size()[0] != (len(self.layers)):
-                    raise ValueError(
-                        f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for"
-                        f" {head_mask.size()[0]}."
-                    )
-
-        for idx, decoder_layer in enumerate(self.layers):
-            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
-            if output_hidden_states:
-                all_hidden_states += (hidden_states,)
-
-            dropout_probability = random.uniform(0, 1)
-            if self.training and (dropout_probability < self.layerdrop):
-                continue
-
-            past_key_value = past_key_values[idx] if past_key_values is not None else None
-            layer_head_mask = head_mask[idx] if head_mask is not None else None
-
-            def vblock(
-                main_block,
-                hidden_states,
-                attention_mask,
-                layer_head_mask,
-                past_key_value,
-                image_hidden_states,
-                image_attention_mask,
-                output_attentions,
-                use_cache,
-                layer_idx,
-                cross_layer_interval,
-                gated_cross_attn_layers,
-            ):
-                # TODO(ls): Add cross attention values to respective lists
-                if layer_idx % cross_layer_interval == 0:
-                    xblock = gated_cross_attn_layers[layer_idx // cross_layer_interval]
-                    outputs = xblock(
-                        hidden_states,
-                        attention_mask=attention_mask,
-                        layer_head_mask=layer_head_mask,
-                        image_hidden_states=image_hidden_states,
-                        image_attention_mask=image_attention_mask,
-                        output_attentions=output_attentions,
-                        use_cache=use_cache,
-                        past_key_value=None,  # not implemented
-                    )
-                    hidden_states = outputs[0]
-
-                layer_outputs = main_block(
-                    hidden_states,
-                    attention_mask=attention_mask,
-                    layer_head_mask=layer_head_mask,
-                    past_key_value=past_key_value,
-                    output_attentions=output_attentions,
-                    use_cache=use_cache,
-                )
-
-                return layer_outputs
-
-            if self.gradient_checkpointing and self.training:
-                past_key_value = None
-                if use_cache:
-                    logger.warning_once(
-                        "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
-                    )
-                    use_cache = False
-
-                layer_outputs = torch.utils.checkpoint.checkpoint(
-                    vblock,
-                    decoder_layer,
-                    hidden_states,
-                    attention_mask,
-                    layer_head_mask,
-                    past_key_value,
-                    image_hidden_states,
-                    image_attention_mask,
-                    output_attentions,
-                    use_cache,
-                    idx,
-                    self.cross_layer_interval,
-                    self.gated_cross_attn_layers,
-                )
-            else:
-                layer_outputs = vblock(
-                    decoder_layer,
-                    hidden_states,
-                    attention_mask=attention_mask,
-                    layer_head_mask=layer_head_mask,
-                    past_key_value=past_key_value,
-                    image_hidden_states=image_hidden_states,
-                    image_attention_mask=image_attention_mask,
-                    output_attentions=output_attentions,
-                    use_cache=use_cache,
-                    layer_idx=idx,
-                    cross_layer_interval=self.cross_layer_interval,
-                    gated_cross_attn_layers=self.gated_cross_attn_layers,
-                )
-
-            hidden_states = layer_outputs[0]
-
-            if use_cache:
-                next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)
-
-            if output_attentions:
-                all_self_attns += (layer_outputs[1],)
-
-        if self.final_layer_norm is not None:
-            hidden_states = self.final_layer_norm(hidden_states)
-
-        if self.project_out is not None:
-            hidden_states = self.project_out(hidden_states)
-
-        # add hidden states from the last decoder layer
-        if output_hidden_states:
-            all_hidden_states += (hidden_states,)
-
-        next_cache = next_decoder_cache if use_cache else None
-        if not return_dict:
-            return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
-        return BaseModelOutputWithPast(
-            last_hidden_state=hidden_states,
-            past_key_values=next_cache,
-            hidden_states=all_hidden_states,
-            attentions=all_self_attns,
-        )
-
-
-@add_start_docstrings(
-    "The bare OPT Model outputting raw hidden-states without any specific head on top.",
-    OPT_START_DOCSTRING,
-)
-class VOPTModel(VOPTPreTrainedModel):
-    def __init__(self, config: VOPTConfig, vision_model=None):
-        super().__init__(config)
-        self.decoder = VOPTDecoder(config, vision_model=vision_model)
-
-        # Initialize weights and apply final processing
-        self.post_init()
-
-        self.freeze_relevant_params(config)
-
-    def freeze_relevant_params(self, config=None):
-        if config is None:
-            config = self.config
-
-        if config.freeze_text_layers:
-            self.freeze_text_layers(config.freeze_text_module_exceptions)
-
-        if config.freeze_vision_layers:
-            freeze_model(self.decoder.vision_model, module_exceptions=config.freeze_vision_module_exceptions)
-
-    def freeze_text_layers(self, module_exceptions):
-        for module in [self.decoder.embed_positions, self.decoder.layers]:
-            freeze_model(module, module_exceptions=module_exceptions)
-
-        if self.decoder.project_out is not None:
-            freeze_model(self.decoder.project_out, module_exceptions=module_exceptions)
-
-        if self.decoder.final_layer_norm is not None:
-            freeze_model(self.decoder.final_layer_norm, module_exceptions=module_exceptions)
-
-    def get_input_embeddings(self):
-        return self.decoder.embed_tokens
-
-    def set_input_embeddings(self, value):
-        self.decoder.embed_tokens = value
-
-    def get_decoder(self):
-        return self.decoder
-
-    @add_start_docstrings_to_model_forward(OPT_INPUTS_DOCSTRING)
-    @add_code_sample_docstrings(
-        processor_class=_TOKENIZER_FOR_DOC,
-        checkpoint=_CHECKPOINT_FOR_DOC,
-        output_type=BaseModelOutputWithPast,
-        config_class=_CONFIG_FOR_DOC,
-        expected_output=_EXPECTED_OUTPUT_SHAPE,
-    )
-    def forward(
-        self,
-        input_ids: torch.LongTensor = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        head_mask: Optional[torch.Tensor] = None,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        pixel_values: Optional[torch.FloatTensor] = None,
-        image_embeddings: Optional[torch.FloatTensor] = None,
-        image_attention_mask: Optional[torch.Tensor] = None,
-        crossblock_head_mask: Optional[torch.Tensor] = None,  # TOFO (ls): check if this is needed
-        use_cache: Optional[bool] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-    ) -> Union[Tuple, BaseModelOutputWithPast]:
-        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
-        output_hidden_states = (
-            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
-        )
-        use_cache = use_cache if use_cache is not None else self.config.use_cache
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
-        # decoder outputs consists of (dec_features, past_key_value, dec_hidden, dec_attn)
-        decoder_outputs = self.decoder(
-            input_ids=input_ids,
-            attention_mask=attention_mask,
-            head_mask=head_mask,
-            past_key_values=past_key_values,
-            inputs_embeds=inputs_embeds,
-            pixel_values=pixel_values,
-            image_embeddings=image_embeddings,
-            image_attention_mask=image_attention_mask,
-            crossblock_head_mask=crossblock_head_mask,
-            use_cache=use_cache,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
-        )
-
-        if not return_dict:
-            return decoder_outputs
-
-        return BaseModelOutputWithPast(
-            last_hidden_state=decoder_outputs.last_hidden_state,
-            past_key_values=decoder_outputs.past_key_values,
-            hidden_states=decoder_outputs.hidden_states,
-            attentions=decoder_outputs.attentions,
-        )
-
-
-class VOPTForCausalLM(VOPTPreTrainedModel):
-    _keys_to_ignore_on_load_missing = [r"lm_head.weight"]
-
-    def __init__(self, config, vision_model=None):
-        super().__init__(config)
-
-        # Initialize LM head first so that it is not directly offloaded to the CPU/disk
-        # the lm_head weight is automatically tied to the embed tokens weight
-        self.lm_head = DecoupledLinear(
-            in_features=config.word_embed_proj_dim,
-            out_features=config.vocab_size,
-            out_additional_features=config.additional_vocab_size,
-            bias=False,
-            partially_freeze=config.freeze_lm_head,
-        )
-
-        self.model = VOPTModel(config, vision_model=vision_model)
-        # Initialize weights and apply final processing
-        self.post_init()
-
-    def tie_weights(self):
-        """
-        Overwrite `transformers.modeling_utils.PreTrainedModel.tie_weights` to handle the case of DecoupledLinear and DecoupledEmbedding.
-        """
-        output_embeddings = self.get_output_embeddings()
-        input_embeddings = self.get_input_embeddings()
-
-        if getattr(self.config, "tie_word_embeddings", True):
-            output_embeddings.weight = input_embeddings.weight
-            if input_embeddings.num_additional_embeddings > 0:
-                assert output_embeddings.out_additional_features == input_embeddings.num_additional_embeddings
-                output_embeddings.additional_fc.weight = input_embeddings.additional_embedding.weight
-
-        if hasattr(output_embeddings, "out_features") and hasattr(input_embeddings, "num_embeddings"):
-            output_embeddings.out_features = input_embeddings.num_embeddings
-            if hasattr(output_embeddings, "out_additional_features") and hasattr(
-                input_embeddings, "num_additional_embeddings"
-            ):
-                output_embeddings.out_additional_features = input_embeddings.num_additional_embeddings
-
-    def get_input_embeddings(self):
-        return self.model.decoder.embed_tokens
-
-    def set_input_embeddings(self, value):
-        self.model.decoder.embed_tokens = value
-
-    def get_output_embeddings(self):
-        return self.lm_head
-
-    def set_output_embeddings(self, new_embeddings):
-        self.lm_head = new_embeddings
-
-    def set_decoder(self, decoder):
-        self.model.decoder = decoder
-
-    def get_decoder(self):
-        return self.model.decoder
-
-    @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
-    def forward(
-        self,
-        input_ids: torch.LongTensor = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        head_mask: Optional[torch.Tensor] = None,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        pixel_values: Optional[torch.FloatTensor] = None,
-        image_embeddings: Optional[torch.FloatTensor] = None,
-        image_attention_mask: Optional[torch.Tensor] = None,
-        crossblock_head_mask: Optional[torch.Tensor] = None,
-        labels: Optional[torch.LongTensor] = None,
-        use_cache: Optional[bool] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-    ) -> Union[Tuple, CausalLMOutputWithPast]:
-        r"""
-        Args:
-            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
-                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
-                provide it.
-
-                Indices can be obtained using [`OPTTokenizer`]. See [`PreTrainedTokenizer.encode`] and
-                [`PreTrainedTokenizer.__call__`] for details.
-
-                [What are input IDs?](../glossary#input-ids)
-            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
-                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
-
-                - 1 for tokens that are **not masked**,
-                - 0 for tokens that are **masked**.
-
-                [What are attention masks?](../glossary#attention-mask)
-            head_mask (`torch.Tensor` of shape `(num_hidden_layers, num_attention_heads)`, *optional*):
-                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
-
-                - 1 indicates the head is **not masked**,
-                - 0 indicates the head is **masked**.
-
-            past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
-                Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
-                shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
-                shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. The two additional
-                tensors are only required when the model is used as a decoder in a Sequence to Sequence model.
-
-                Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
-                cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
-
-                If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
-                that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
-                all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
-            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
-                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
-                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
-                than the model's internal embedding lookup matrix.
-            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
-                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
-                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
-                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
-            use_cache (`bool`, *optional*):
-                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
-                (see `past_key_values`).
-            output_attentions (`bool`, *optional*):
-                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
-                returned tensors for more detail.
-            output_hidden_states (`bool`, *optional*):
-                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
-                for more detail.
-            return_dict (`bool`, *optional*):
-                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
-
-        Returns:
-
-        Example:
-
-        ```python
-        >>> from transformers import GPT2Tokenizer, OPTForCausalLM
-
-        >>> model = OPTForCausalLM.from_pretrained("facebook/opt-350m")
-        >>> tokenizer = GPT2Tokenizer.from_pretrained("facebook/opt-350m")
-
-        >>> prompt = "Hey, are you consciours? Can you talk to me?"
-        >>> inputs = tokenizer(prompt, return_tensors="pt")
-
-        >>> # Generate
-        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
-        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
-        "Hey, are you consciours? Can you talk to me?\nI'm not consciours, but I can talk to you."
-        ```"""
-
-        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
-        output_hidden_states = (
-            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
-        )
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
-        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
-        outputs = self.model.decoder(
-            input_ids=input_ids,
-            attention_mask=attention_mask,
-            head_mask=head_mask,
-            past_key_values=past_key_values,
-            inputs_embeds=inputs_embeds,
-            pixel_values=pixel_values,
-            image_embeddings=image_embeddings,
-            image_attention_mask=image_attention_mask,
-            crossblock_head_mask=crossblock_head_mask,
-            use_cache=use_cache,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
-        )
-
-        logits = self.lm_head(outputs[0]).contiguous()
-
-        loss = None
-        if labels is not None:
-            # Shift so that tokens < n predict n
-            if attention_mask is not None:
-                shift_attention_mask = attention_mask[..., 1:]
-                shift_logits = logits[..., :-1, :][shift_attention_mask != 0].contiguous()
-                shift_labels = labels[..., 1:][shift_attention_mask != 0].contiguous()
-            else:
-                shift_logits = logits[..., :-1, :].contiguous()
-                shift_labels = labels[..., 1:].contiguous()
-            # Flatten the tokens
-            loss_fct = CrossEntropyLoss()
-            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
-
-        if not return_dict:
-            output = (logits,) + outputs[1:]
-            return (loss,) + output if loss is not None else output
-
-        return CausalLMOutputWithPast(
-            loss=loss,
-            logits=logits,
-            past_key_values=outputs.past_key_values,
-            hidden_states=outputs.hidden_states,
-            attentions=outputs.attentions,
-        )
-
-    def prepare_inputs_for_generation(self, input_ids, past=None, **kwargs):
-        inputs = prepare_inputs_for_generation(input_ids, past=past, **kwargs)
-        unwanted_kwargs = ["position_ids", "token_type_ids"]
-        for kwarg in unwanted_kwargs:
-            inputs.pop(kwarg, None)
-        return inputs
-
-    @staticmethod
-    def _expand_inputs_for_generation(
-        *args,
-        **model_kwargs,
-    ):
-        return expand_inputs_for_generation(*args, **model_kwargs)
-
-    @staticmethod
-    def _update_model_kwargs_for_generation(outputs, model_kwargs, is_encoder_decoder=False):
-        return update_model_kwargs_for_generation(outputs, model_kwargs, is_encoder_decoder=is_encoder_decoder)
-
-    @staticmethod
-    def _reorder_cache(past, beam_idx):
-        reordered_past = ()
-        for layer_past in past:
-            reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),)
-        return reordered_past
-
-    def get_model_tflops_per_batch_per_gpu(self, hparams, data_param, tokenizer, max_num_images):
-        config_vl_model = self.config
-
-        language_embed_size = config_vl_model.hidden_size
-        num_language_layers = config_vl_model.num_hidden_layers
-        ffn_inner_size = config_vl_model.ffn_dim
-
-        vision_config = self.model.decoder.vision_model.config
-        if hasattr(vision_config, "vision_config"):
-            vision_config = vision_config.vision_config
-
-        # Get vision model blocks infos
-        vision_patch_size = vision_config.patch_size
-        vision_hidden_size = vision_config.hidden_size
-        num_vision_layers = vision_config.num_hidden_layers
-        # The +1 is for the CLS token
-        single_image_seq_len = (vision_config.image_size // vision_patch_size) ** 2 + 1
-        vision_exp_factor = vision_config.intermediate_size // vision_hidden_size
-
-        # Get language and cross-att blocks infos
-        num_cross_attn_layers = num_language_layers // config_vl_model.cross_layer_interval
-        language_seq_len = data_param.max_seq_len
-        language_exp_factor = (ffn_inner_size // language_embed_size) if ffn_inner_size is not None else 4
-        cross_att_exp_factor = (ffn_inner_size // language_embed_size) if ffn_inner_size is not None else 4
-        k_v_cross_attn_seq_len = (
-            (self.config.resampler_n_latents * max_num_images)
-            if self.config.use_resampler
-            else (single_image_seq_len * max_num_images)
-        )
-
-        language_tflops_per_batch_per_gpu = compute_tflops_per_batch_per_gpu(
-            num_layers=num_language_layers,
-            batch_size=hparams.batch_size_per_gpu,
-            q_seq_len=language_seq_len,
-            k_seq_len=language_seq_len,
-            hidden_size=language_embed_size,
-            kv_in_dim=language_embed_size,
-            ff_exp_factor=language_exp_factor,
-            grad_acc_size=hparams.grad_acc_size,
-            swiglu=False,
-            vocab_size=tokenizer.vocab_size,
-            count_backward=True,  # Always True regardless of freezing, because gradients are computed for cross-attentions
-            use_grad_checkpointing=hparams.gradient_checkpointing,
-        )
-        cross_attention_tflops_per_batch_per_gpu = compute_tflops_per_batch_per_gpu(
-            num_layers=num_cross_attn_layers,
-            batch_size=hparams.batch_size_per_gpu,
-            q_seq_len=language_seq_len,
-            k_seq_len=k_v_cross_attn_seq_len,
-            hidden_size=language_embed_size,
-            kv_in_dim=vision_hidden_size,
-            ff_exp_factor=cross_att_exp_factor,
-            grad_acc_size=hparams.grad_acc_size,
-            swiglu=self.config.cross_layer_activation_function == "swiglu",
-            vocab_size=None,
-            count_backward=True,
-            use_grad_checkpointing=hparams.gradient_checkpointing,
-        )
-        vision_tflops_per_batch_per_gpu = compute_tflops_per_batch_per_gpu(
-            num_layers=num_vision_layers,
-            batch_size=hparams.batch_size_per_gpu * max_num_images,
-            q_seq_len=single_image_seq_len,
-            k_seq_len=single_image_seq_len,
-            hidden_size=vision_hidden_size,
-            kv_in_dim=vision_hidden_size,
-            ff_exp_factor=vision_exp_factor,
-            grad_acc_size=hparams.grad_acc_size,
-            swiglu=False,
-            vocab_size=None,
-            count_backward=not hparams.model_params["freeze_vision_layers"],
-            use_grad_checkpointing=hparams.gradient_checkpointing,
-        )
-        if self.config.use_resampler:
-            perceiver_tflops_per_batch_per_gpu = compute_perceiver_tflops_per_batch_per_gpu(
-                num_layers=self.config.resampler_depth,
-                batch_size=hparams.batch_size_per_gpu * max_num_images,
-                q_seq_len=self.config.resampler_n_latents,
-                vision_embed_seq_len=single_image_seq_len,
-                q_k_v_input_dim=vision_hidden_size,
-                attention_hidden_size=self.config.resampler_n_heads * self.config.resampler_head_dim,
-                ff_exp_factor=cross_att_exp_factor,
-                count_backward=True,
-                use_grad_checkpointing=hparams.gradient_checkpointing,
-            )
-            flop_count = (
-                language_tflops_per_batch_per_gpu
-                + cross_attention_tflops_per_batch_per_gpu
-                + vision_tflops_per_batch_per_gpu
-                + perceiver_tflops_per_batch_per_gpu
-            )
-        else:
-            flop_count = (
-                language_tflops_per_batch_per_gpu
-                + cross_attention_tflops_per_batch_per_gpu
-                + vision_tflops_per_batch_per_gpu
-            )
-        return flop_count
diff --git a/m4/models/vt5/__init__.py b/m4/models/vt5/__init__.py
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/m4/models/vt5/configuration_vt5.py b/m4/models/vt5/configuration_vt5.py
deleted file mode 100644
index 8b1098bb47102caf5e9609556acb43d3d14ba374..0000000000000000000000000000000000000000
--- a/m4/models/vt5/configuration_vt5.py
+++ /dev/null
@@ -1,218 +0,0 @@
-# coding=utf-8
-# Copyright 2020, The T5 Authors and HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" VT5 model configuration"""
-import os
-from typing import Tuple, Union
-
-from transformers import AutoConfig
-from transformers.configuration_utils import PretrainedConfig
-from transformers.utils import logging
-
-
-logger = logging.get_logger(__name__)
-
-T5_PRETRAINED_CONFIG_ARCHIVE_MAP = {
-    "t5-small": "https://huggingface.co/t5-small/resolve/main/config.json",
-    "t5-base": "https://huggingface.co/t5-base/resolve/main/config.json",
-    "t5-large": "https://huggingface.co/t5-large/resolve/main/config.json",
-    "t5-3b": "https://huggingface.co/t5-3b/resolve/main/config.json",
-    "t5-11b": "https://huggingface.co/t5-11b/resolve/main/config.json",
-}
-
-
-class VT5Config(PretrainedConfig):
-    r"""
-    This is the configuration class to store the configuration of a [`T5Model`] or a [`TFT5Model`]. It is used to
-    instantiate a T5 model according to the specified arguments, defining the model architecture. Instantiating a
-    configuration with the defaults will yield a similar configuration to that of the T5
-    [t5-small](https://huggingface.co/t5-small) architecture.
-
-    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
-    documentation from [`PretrainedConfig`] for more information.
-
-    TODO: this doc is completely out of sync with the actual args
-
-    Arguments:
-        vocab_size (`int`, *optional*, defaults to 32128):
-            Vocabulary size of the T5 model. Defines the number of different tokens that can be represented by the
-            `inputs_ids` passed when calling [`T5Model`] or [`TFT5Model`].
-        d_model (`int`, *optional*, defaults to 512):
-            Size of the encoder layers and the pooler layer.
-        d_kv (`int`, *optional*, defaults to 64):
-            Size of the key, query, value projections per attention head. `d_kv` has to be equal to `d_model //
-            num_heads`.
-        d_ff (`int`, *optional*, defaults to 2048):
-            Size of the intermediate feed forward layer in each `T5Block`.
-        num_layers (`int`, *optional*, defaults to 6):
-            Number of hidden layers in the Transformer encoder.
-        num_decoder_layers (`int`, *optional*):
-            Number of hidden layers in the Transformer decoder. Will use the same value as `num_layers` if not set.
-        num_heads (`int`, *optional*, defaults to 8):
-            Number of attention heads for each attention layer in the Transformer encoder.
-        relative_attention_num_buckets (`int`, *optional*, defaults to 32):
-            The number of buckets to use for each attention layer.
-        relative_attention_max_distance (`int`, *optional*, defaults to 128):
-            The maximum distance of the longer sequences for the bucket separation.
-        dropout_rate (`float`, *optional*, defaults to 0.1):
-            The ratio for all dropout layers.
-        layer_norm_eps (`float`, *optional*, defaults to 1e-6):
-            The epsilon used by the layer normalization layers.
-        initializer_factor (`float`, *optional*, defaults to 1):
-            A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
-            testing).
-        feed_forward_proj (`string`, *optional*, defaults to `"relu"`):
-            Type of feed forward layer to be used. Should be one of `"relu"` or `"gated-gelu"`. T5v1.1 uses the
-            `"gated-gelu"` feed forward projection. Original T5 uses `"relu"`.
-        use_cache (`bool`, *optional*, defaults to `True`):
-            Whether or not the model should return the last key/values attentions (not used by all models).
-        additional_vocab_size (`int`, *optional`, defaults to 0):
-            Additional vocabulary size of the model, typically for the special "<img>" token. Additional vocab tokens
-            are always trainable whereas regular vocab tokens can be frozen or not.
-        alpha_initializer (`str`, *optional*, defaults to `"ones"`):
-            Initialization type for the alphas.
-        alphas_initializer_range (`float`, *optional*, defaults to 0.0):
-            The standard deviation of the truncated_normal_initializer for initializing the alphas in the Gated Cross Attention.
-        alpha_type (`str`, *optional*, defaults to `"vector"`):
-            Whether the gating alphas should be vectors or single floats.
-    """
-    model_type = "vt5"
-    keys_to_ignore_at_inference = ["past_key_values"]
-    attribute_map = {"hidden_size": "d_model", "num_attention_heads": "num_heads", "num_hidden_layers": "num_layers"}
-
-    def __init__(
-        self,
-        vocab_size=32128,
-        d_model=512,
-        d_kv=64,
-        d_ff=2048,
-        num_layers=6,
-        num_decoder_layers=None,
-        num_heads=8,
-        relative_attention_num_buckets=32,
-        relative_attention_max_distance=128,
-        dropout_rate=0.1,
-        layer_norm_epsilon=1e-6,
-        initializer_factor=1.0,
-        feed_forward_proj="relu",
-        is_encoder_decoder=True,
-        use_cache=True,
-        pad_token_id=0,
-        eos_token_id=1,
-        additional_vocab_size=0,
-        alpha_initializer="ones",
-        alphas_initializer_range=0.0,
-        alpha_type="vector",
-        cross_layer_interval=1,
-        tie_word_embeddings=False,
-        freeze_text_layers=True,
-        freeze_lm_head=False,
-        freeze_vision_layers=True,
-        vision_model_name="google/vit-base-patch16-224",
-        vision_model_params="{}",
-        vision_embed_dim=768,
-        image_token_index=32128,
-        **kwargs,
-    ):
-        self.vocab_size = vocab_size
-        self.additional_vocab_size = additional_vocab_size
-        self.d_model = d_model
-        self.d_kv = d_kv
-        self.d_ff = d_ff
-        self.num_layers = num_layers
-        self.num_decoder_layers = (
-            num_decoder_layers if num_decoder_layers is not None else self.num_layers
-        )  # default = symmetry
-        self.num_heads = num_heads
-        self.relative_attention_num_buckets = relative_attention_num_buckets
-        self.relative_attention_max_distance = relative_attention_max_distance
-        self.dropout_rate = dropout_rate
-        self.layer_norm_epsilon = layer_norm_epsilon
-        self.initializer_factor = initializer_factor
-        self.feed_forward_proj = feed_forward_proj
-        self.use_cache = use_cache
-
-        act_info = self.feed_forward_proj.split("-")
-        self.dense_act_fn = act_info[-1]
-        self.is_gated_act = act_info[0] == "gated"
-
-        if len(act_info) > 1 and act_info[0] != "gated" or len(act_info) > 2:
-            raise ValueError(
-                f"`feed_forward_proj`: {feed_forward_proj} is not a valid activation function of the dense layer."
-                "Please make sure `feed_forward_proj` is of the format `gated-{ACT_FN}` or `{ACT_FN}`, e.g. "
-                "'gated-gelu' or 'relu'"
-            )
-
-        # for backwards compatibility
-        if feed_forward_proj == "gated-gelu":
-            self.dense_act_fn = "gelu_new"
-
-        self.alpha_initializer = alpha_initializer
-        self.alphas_initializer_range = alphas_initializer_range
-        self.alpha_type = alpha_type
-
-        self.cross_layer_interval = cross_layer_interval
-        self.freeze_vision_layers = freeze_vision_layers
-        self.vision_model_name = vision_model_name
-        self.vision_model_params = vision_model_params
-
-        self.tie_word_embeddings = tie_word_embeddings
-        self.freeze_text_layers = freeze_text_layers
-        self.freeze_lm_head = freeze_lm_head
-        self.image_token_index = image_token_index
-
-        self.vision_embed_dim = vision_embed_dim
-
-        # IMPORTANT: Do not do any __init__ args-based checks in the constructor, since
-        # PretrainedConfig.from_dict first instantiates the class with the config dict and only then
-        # updates the config object with `kwargs` from from_pretrained, so during the instantiation
-        # of this object many attributes have default values and haven't yet been overridden.
-        # Do any required checks inside `from_pretrained` once the superclass' `from_pretrained` was run.
-
-        super().__init__(
-            pad_token_id=pad_token_id,
-            eos_token_id=eos_token_id,
-            is_encoder_decoder=is_encoder_decoder,
-            tie_word_embeddings=tie_word_embeddings,
-            **kwargs,
-        )
-
-    def check_compatibilities(self):
-        if self.tie_word_embeddings and (self.freeze_text_layers != self.freeze_lm_head):
-            raise ValueError(
-                "if `tie_word_embeddings` is True, then `freeze_lm_head` and `freeze_text_layers` must be equal."
-            )
-
-        vision_model_params = eval(self.vision_model_params)
-        config = AutoConfig.from_pretrained(self.vision_model_name, **vision_model_params)
-        if hasattr(config, "vision_config"):
-            vison_config = config.vision_config
-        else:
-            vison_config = config
-        vision_embed_dim = vison_config.hidden_size
-        if self.vision_embed_dim != vision_embed_dim:
-            raise ValueError(
-                f"vision_embed_dim ({self.vision_embed_dim}) must match the hidden size of the vision model"
-                f" ({vision_embed_dim})"
-            )
-
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
-        outputs = super(VT5Config, cls).from_pretrained(pretrained_model_name_or_path, **kwargs)
-        if isinstance(outputs, Tuple):
-            # When called with return_unused_kwargs=True, the first item will be the config
-            outputs[0].check_compatibilities()
-        else:
-            outputs.check_compatibilities()
-        return outputs
diff --git a/m4/models/vt5/modeling_vt5.py b/m4/models/vt5/modeling_vt5.py
deleted file mode 100644
index 94687a33a7ff6366a185b9801104340e83d6ff92..0000000000000000000000000000000000000000
--- a/m4/models/vt5/modeling_vt5.py
+++ /dev/null
@@ -1,2188 +0,0 @@
-# coding=utf-8
-# Copyright 2018 Mesh TensorFlow authors, T5 Authors and HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" PyTorch T5 model."""
-
-
-import copy
-import math
-import os
-import warnings
-from typing import Optional, Tuple, Union
-
-import torch
-from torch import nn
-from torch.nn import CrossEntropyLoss
-from torch.utils.checkpoint import checkpoint
-from transformers.activations import ACT2FN
-from transformers.modeling_outputs import (
-    BaseModelOutput,
-    BaseModelOutputWithPastAndCrossAttentions,
-    Seq2SeqLMOutput,
-    Seq2SeqModelOutput,
-)
-from transformers.models.t5.configuration_t5 import T5Config
-from transformers.pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
-from transformers.utils import (
-    DUMMY_INPUTS,
-    DUMMY_MASK,
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-    is_torch_fx_proxy,
-    replace_return_docstrings,
-)
-from transformers.utils.model_parallel_utils import assert_device_map, get_device_map
-
-from m4.models import DecoupledEmbedding, DecoupledLinear
-from m4.models.custom_modules import VLOOMPreTrainedModelBase
-from m4.models.vt5.configuration_vt5 import VT5Config
-from m4.training.packing import random_spans_helper
-from m4.training.utils import compute_tflops_per_batch_per_gpu, freeze_model
-from m4.utils import logging
-
-
-ALL_LAYERNORM_LAYERS = [nn.LayerNorm]
-
-logger = logging.get_logger(__name__)
-
-_CONFIG_FOR_DOC = "T5Config"
-_TOKENIZER_FOR_DOC = "T5Tokenizer"
-_CHECKPOINT_FOR_DOC = "t5-small"
-
-####################################################
-# This dict contains ids and associated url
-# for the pretrained weights provided with the models
-####################################################
-T5_PRETRAINED_MODEL_ARCHIVE_LIST = [
-    "t5-small",
-    "t5-base",
-    "t5-large",
-    "t5-3b",
-    "t5-11b",
-    # See all T5 models at https://huggingface.co/models?filter=t5
-]
-
-
-####################################################
-# This is a conversion method from TF 1.0 to PyTorch
-# More details: https://medium.com/huggingface/from-tensorflow-to-pytorch-265f40ef2a28
-####################################################
-def load_tf_weights_in_t5(model, config, tf_checkpoint_path):
-    """Load tf checkpoints in a pytorch model."""
-    try:
-        import re
-
-        import numpy as np
-        import tensorflow as tf
-    except ImportError:
-        logger.error(
-            "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
-            "https://www.tensorflow.org/install/ for installation instructions."
-        )
-        raise
-    tf_path = os.path.abspath(tf_checkpoint_path)
-    logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
-    # Load weights from TF model
-    init_vars = tf.train.list_variables(tf_path)
-    names = []
-    tf_weights = {}
-    for name, shape in init_vars:
-        logger.info(f"Loading TF weight {name} with shape {shape}")
-        array = tf.train.load_variable(tf_path, name)
-        names.append(name)
-        tf_weights[name] = array
-
-    for txt_name in names:
-        name = txt_name.split("/")
-        # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
-        # which are not required for using pretrained model
-        if any(
-            n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"]
-            for n in name
-        ):
-            logger.info(f"Skipping {'/'.join(name)}")
-            tf_weights.pop(txt_name, None)
-            continue
-        if "_slot_" in name[-1]:
-            logger.info(f"Skipping {'/'.join(name)}")
-            tf_weights.pop(txt_name, None)
-            continue
-        pointer = model
-        array = tf_weights[txt_name]
-
-        for m_name in name:
-            if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
-                scope_names = re.split(r"_(\d+)", m_name)
-            else:
-                scope_names = [m_name]
-            if scope_names[0] in ["kernel", "scale", "embedding"]:
-                pointer = getattr(pointer, "weight")
-            elif scope_names[0] == "self_attention":
-                pointer = getattr(pointer, "layer")
-                pointer = pointer[0]
-            elif scope_names[0] == "enc_dec_attention":
-                pointer = getattr(pointer, "layer")
-                pointer = pointer[1]
-            elif scope_names[0] == "dense_relu_dense":
-                pointer = getattr(pointer, "layer")
-                pointer = pointer[2]
-            elif scope_names[0] == "rms_norm":
-                if hasattr(pointer, "layer_norm"):
-                    pointer = getattr(pointer, "layer_norm")
-                elif hasattr(pointer, "final_layer_norm"):
-                    pointer = getattr(pointer, "final_layer_norm")
-            elif scope_names[0] == "scale":
-                pointer = getattr(pointer, "weight")
-            elif scope_names[0] == "output_bias" or scope_names[0] == "beta":
-                pointer = getattr(pointer, "bias")
-            elif scope_names[0] == "squad":
-                pointer = getattr(pointer, "classifier")
-            elif scope_names[0] == "decoder" and name[1] == "logits":
-                continue
-            elif scope_names[0] == "logits":
-                pointer = getattr(pointer, "lm_head")
-            elif scope_names[0] == "wi" and len(scope_names) > 1 and scope_names[1].isdigit():
-                pointer = getattr(pointer, f"wi_{scope_names[1]}")
-                continue
-            else:
-                try:
-                    pointer = getattr(pointer, scope_names[0])
-                except AttributeError:
-                    logger.info(f"Skipping {'/'.join(name)}")
-                    continue
-            if len(scope_names) >= 2:
-                num = int(scope_names[1])
-                pointer = pointer[num]
-        if scope_names[0] not in ["kernel", "scale", "embedding"]:
-            pointer = getattr(pointer, "weight")
-        if scope_names[0] != "embedding":
-            logger.info(f"Transposing numpy weight of shape {array.shape} for {name}")
-            array = np.transpose(array)
-        try:
-            assert (
-                pointer.shape == array.shape
-            ), f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched"
-        except AssertionError as e:
-            e.args += (pointer.shape, array.shape)
-            raise
-        logger.info(f"Initialize PyTorch weight {name}")
-        pointer.data = torch.from_numpy(array.astype(np.float32))
-        tf_weights.pop(txt_name, None)
-
-    logger.info(f"Weights not copied to PyTorch model: {', '.join(tf_weights.keys())}.")
-    return model
-
-
-####################################################
-# PyTorch Models are constructed by sub-classing
-# - torch.nn.Module for the layers and
-# - PreTrainedModel for the models (it-self a sub-class of nn.Module)
-####################################################
-PARALLELIZE_DOCSTRING = r"""
-    This is an experimental feature and is a subject to change at a moment's notice.
-
-    Uses a device map to distribute attention modules of the model across several devices. If no device map is given,
-    it will evenly distribute blocks across all devices.
-
-    Args:
-        device_map (`Dict[int, list]`, optional, defaults to None):
-            A dictionary that maps attention modules to devices. Note that the embedding module and LMHead are always
-            automatically mapped to the first device (for esoteric reasons). That means that the first device should
-            have fewer attention modules mapped to it than other devices. For reference, the t5 models have the
-            following number of attention modules:
-
-                - t5-small: 6
-                - t5-base: 12
-                - t5-large: 24
-                - t5-3b: 24
-                - t5-11b: 24
-
-    Example:
-
-    ```python
-    # Here is an example of a device map on a machine with 4 GPUs using t5-3b, which has a total of 24 attention modules:
-    model = T5ForConditionalGeneration.from_pretrained("t5-3b")
-    device_map = {
-        0: [0, 1, 2],
-        1: [3, 4, 5, 6, 7, 8, 9],
-        2: [10, 11, 12, 13, 14, 15, 16],
-        3: [17, 18, 19, 20, 21, 22, 23],
-    }
-    model.parallelize(device_map)
-    ```
-"""
-DEPARALLELIZE_DOCSTRING = r"""
-    Moves the model to cpu from a model parallel state.
-
-    Example:
-
-    ```python
-    # On a 4 GPU machine with t5-3b:
-    model = T5ForConditionalGeneration.from_pretrained("t5-3b")
-    device_map = {
-        0: [0, 1, 2],
-        1: [3, 4, 5, 6, 7, 8, 9],
-        2: [10, 11, 12, 13, 14, 15, 16],
-        3: [17, 18, 19, 20, 21, 22, 23],
-    }
-    model.parallelize(device_map)  # Splits the model across several devices
-    model.deparallelize()  # Put the model back on cpu and cleans memory by calling torch.cuda.empty_cache()
-    ```
-"""
-
-
-class T5LayerNorm(nn.Module):
-    def __init__(self, hidden_size, eps=1e-6):
-        """
-        Construct a layernorm module in the T5 style. No bias and no subtraction of mean.
-        """
-        super().__init__()
-        self.weight = nn.Parameter(torch.ones(hidden_size))
-        self.variance_epsilon = eps
-
-    def forward(self, hidden_states):
-        # T5 uses a layer_norm which only scales and doesn't shift, which is also known as Root Mean
-        # Square Layer Normalization https://arxiv.org/abs/1910.07467 thus varience is calculated
-        # w/o mean and there is no bias. Additionally we want to make sure that the accumulation for
-        # half-precision inputs is done in fp32
-
-        variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
-        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
-
-        # convert into half-precision if necessary
-        if self.weight.dtype in [torch.float16, torch.bfloat16]:
-            hidden_states = hidden_states.to(self.weight.dtype)
-
-        return self.weight * hidden_states
-
-
-try:
-    from apex.normalization import FusedRMSNorm
-
-    T5LayerNorm = FusedRMSNorm  # noqa
-
-    logger.info("Discovered apex.normalization.FusedRMSNorm - will use it instead of T5LayerNorm")
-except ImportError:
-    # using the normal T5LayerNorm
-    pass
-except Exception:
-    logger.warning("discovered apex but it failed to load, falling back to T5LayerNorm")
-    pass
-
-ALL_LAYERNORM_LAYERS = [nn.LayerNorm, T5LayerNorm]
-
-
-class T5DenseActDense(nn.Module):
-    def __init__(self, config: T5Config):
-        super().__init__()
-        self.wi = nn.Linear(config.d_model, config.d_ff, bias=False)
-        self.wo = nn.Linear(config.d_ff, config.d_model, bias=False)
-        self.dropout = nn.Dropout(config.dropout_rate)
-        self.act = ACT2FN[config.dense_act_fn]
-
-    def forward(self, hidden_states):
-        hidden_states = self.wi(hidden_states)
-        hidden_states = self.act(hidden_states)
-        hidden_states = self.dropout(hidden_states)
-        hidden_states = self.wo(hidden_states)
-        return hidden_states
-
-
-class T5DenseGatedActDense(nn.Module):
-    def __init__(self, config: T5Config):
-        super().__init__()
-        self.wi_0 = nn.Linear(config.d_model, config.d_ff, bias=False)
-        self.wi_1 = nn.Linear(config.d_model, config.d_ff, bias=False)
-        self.wo = nn.Linear(config.d_ff, config.d_model, bias=False)
-        self.dropout = nn.Dropout(config.dropout_rate)
-        self.act = ACT2FN[config.dense_act_fn]
-
-    def forward(self, hidden_states):
-        hidden_gelu = self.act(self.wi_0(hidden_states))
-        hidden_linear = self.wi_1(hidden_states)
-        hidden_states = hidden_gelu * hidden_linear
-        hidden_states = self.dropout(hidden_states)
-        hidden_states = self.wo(hidden_states)
-        return hidden_states
-
-
-class T5LayerFF(nn.Module):
-    def __init__(self, config: T5Config, is_vision_cross_attention=False):
-        super().__init__()
-        if config.is_gated_act:
-            self.DenseReluDense = T5DenseGatedActDense(config)
-        else:
-            self.DenseReluDense = T5DenseActDense(config)
-
-        self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
-        self.dropout = nn.Dropout(config.dropout_rate)
-
-        self.is_vision_cross_attention = is_vision_cross_attention
-        if is_vision_cross_attention:
-            self.act = nn.Tanh()
-            if config.alpha_initializer == "zeros":
-                if config.alpha_type == "vector":
-                    self.alpha_dense = nn.Parameter(torch.zeros(1, 1, config.d_model))
-                elif config.alpha_type == "float":
-                    self.alpha_dense = nn.Parameter(torch.zeros(1))
-                else:
-                    raise ValueError(f"Unknown value for `alpha_type` ({config.alpha_type})")
-
-            elif config.alpha_initializer == "ones":
-                if config.alpha_type == "vector":
-                    self.alpha_dense = nn.Parameter(torch.ones(1, 1, config.d_model))
-                elif config.alpha_type == "float":
-                    self.alpha_dense = nn.Parameter(torch.ones(1))
-                else:
-                    raise ValueError(f"Unknown value for `alpha_type` ({config.alpha_type})")
-
-            elif config.alpha_initializer in {"normal", "gaussian", "random"}:
-                if config.alpha_type == "vector":
-                    self.alpha_dense = nn.Parameter(
-                        torch.normal(mean=0.0, std=config.alphas_initializer_range, size=(1, 1, config.d_model))
-                    )
-                elif config.alpha_type == "float":
-                    self.alpha_dense = nn.Parameter(
-                        torch.normal(mean=0.0, std=config.alphas_initializer_range, size=(1))
-                    )
-                else:
-                    raise ValueError(f"Unknown value for `alpha_type` ({config.alpha_type})")
-
-            else:
-                raise NotImplementedError(
-                    f"Alpha initialization scheme {config.alpha_initializer} not yet implemented!"
-                )
-
-    def forward(self, hidden_states):
-        forwarded_states = self.layer_norm(hidden_states)
-        forwarded_states = self.DenseReluDense(forwarded_states)
-        if not self.is_vision_cross_attention:
-            hidden_states = hidden_states + self.dropout(forwarded_states)
-        else:
-            hidden_states = hidden_states + self.dropout(self.act(self.alpha_dense) * forwarded_states)
-        return hidden_states
-
-
-class T5Attention(nn.Module):
-    def __init__(self, config: T5Config, has_relative_attention_bias=False, is_vision_cross_attention=False):
-        super().__init__()
-        self.is_decoder = config.is_decoder
-        self.has_relative_attention_bias = has_relative_attention_bias
-        self.relative_attention_num_buckets = config.relative_attention_num_buckets
-        self.relative_attention_max_distance = config.relative_attention_max_distance
-        self.d_model = config.d_model
-        self.key_value_proj_dim = config.d_kv
-        self.n_heads = config.num_heads
-        self.dropout = config.dropout_rate
-        self.inner_dim = self.n_heads * self.key_value_proj_dim
-
-        # Mesh TensorFlow initialization to avoid scaling before softmax
-        self.q = nn.Linear(self.d_model, self.inner_dim, bias=False)
-        if not is_vision_cross_attention:
-            self.k = nn.Linear(self.d_model, self.inner_dim, bias=False)
-            self.v = nn.Linear(self.d_model, self.inner_dim, bias=False)
-        else:
-            vision_embed_dim = self.embed_dim if not hasattr(config, "vision_embed_dim") else config.vision_embed_dim
-            self.k = nn.Linear(vision_embed_dim, self.inner_dim, bias=False)
-            self.v = nn.Linear(vision_embed_dim, self.inner_dim, bias=False)
-        self.o = nn.Linear(self.inner_dim, self.d_model, bias=False)
-
-        if self.has_relative_attention_bias:
-            self.relative_attention_bias = nn.Embedding(self.relative_attention_num_buckets, self.n_heads)
-        self.pruned_heads = set()
-        self.gradient_checkpointing = False
-
-    def prune_heads(self, heads):
-        if len(heads) == 0:
-            return
-        heads, index = find_pruneable_heads_and_indices(
-            heads, self.n_heads, self.key_value_proj_dim, self.pruned_heads
-        )
-        # Prune linear layers
-        self.q = prune_linear_layer(self.q, index)
-        self.k = prune_linear_layer(self.k, index)
-        self.v = prune_linear_layer(self.v, index)
-        self.o = prune_linear_layer(self.o, index, dim=1)
-        # Update hyper params
-        self.n_heads = self.n_heads - len(heads)
-        self.inner_dim = self.key_value_proj_dim * self.n_heads
-        self.pruned_heads = self.pruned_heads.union(heads)
-
-    @staticmethod
-    def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128):
-        """
-        Adapted from Mesh Tensorflow:
-        https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593
-
-        Translate relative position to a bucket number for relative attention. The relative position is defined as
-        memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
-        position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
-        small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
-        positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.
-        This should allow for more graceful generalization to longer sequences than the model has been trained on
-
-        Args:
-            relative_position: an int32 Tensor
-            bidirectional: a boolean - whether the attention is bidirectional
-            num_buckets: an integer
-            max_distance: an integer
-
-        Returns:
-            a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
-        """
-        relative_buckets = 0
-        if bidirectional:
-            num_buckets //= 2
-            relative_buckets += (relative_position > 0).to(torch.long) * num_buckets
-            relative_position = torch.abs(relative_position)
-        else:
-            relative_position = -torch.min(relative_position, torch.zeros_like(relative_position))
-        # now relative_position is in the range [0, inf)
-
-        # half of the buckets are for exact increments in positions
-        max_exact = num_buckets // 2
-        is_small = relative_position < max_exact
-
-        # The other half of the buckets are for logarithmically bigger bins in positions up to max_distance
-        relative_position_if_large = max_exact + (
-            torch.log(relative_position.float() / max_exact)
-            / math.log(max_distance / max_exact)
-            * (num_buckets - max_exact)
-        ).to(torch.long)
-        relative_position_if_large = torch.min(
-            relative_position_if_large, torch.full_like(relative_position_if_large, num_buckets - 1)
-        )
-
-        relative_buckets += torch.where(is_small, relative_position, relative_position_if_large)
-        return relative_buckets
-
-    def compute_bias(self, query_length, key_length, device=None):
-        """Compute binned relative position bias"""
-        if device is None:
-            device = self.relative_attention_bias.weight.device
-        context_position = torch.arange(query_length, dtype=torch.long, device=device)[:, None]
-        memory_position = torch.arange(key_length, dtype=torch.long, device=device)[None, :]
-        relative_position = memory_position - context_position  # shape (query_length, key_length)
-        relative_position_bucket = self._relative_position_bucket(
-            relative_position,  # shape (query_length, key_length)
-            bidirectional=(not self.is_decoder),
-            num_buckets=self.relative_attention_num_buckets,
-            max_distance=self.relative_attention_max_distance,
-        )
-        values = self.relative_attention_bias(relative_position_bucket)  # shape (query_length, key_length, num_heads)
-        values = values.permute([2, 0, 1]).unsqueeze(0)  # shape (1, num_heads, query_length, key_length)
-        return values
-
-    def forward(
-        self,
-        hidden_states,
-        mask=None,
-        key_value_states=None,
-        position_bias=None,
-        past_key_value=None,
-        layer_head_mask=None,
-        query_length=None,
-        use_cache=False,
-        output_attentions=False,
-    ):
-        """
-        Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states).
-        """
-        # Input is (batch_size, seq_length, dim)
-        # Mask is (batch_size, key_length) (non-causal) or (batch_size, key_length, key_length)
-        # past_key_value[0] is (batch_size, n_heads, q_len - 1, dim_per_head)
-        batch_size, seq_length = hidden_states.shape[:2]
-
-        real_seq_length = seq_length
-
-        if past_key_value is not None:
-            assert (
-                len(past_key_value) == 2
-            ), f"past_key_value should have 2 past states: keys and values. Got { len(past_key_value)} past states"
-            real_seq_length += past_key_value[0].shape[2] if query_length is None else query_length
-
-        key_length = real_seq_length if key_value_states is None else key_value_states.shape[1]
-
-        def shape(states):
-            """projection"""
-            return states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2)
-
-        def unshape(states):
-            """reshape"""
-            return states.transpose(1, 2).contiguous().view(batch_size, -1, self.inner_dim)
-
-        def project(hidden_states, proj_layer, key_value_states, past_key_value):
-            """projects hidden states correctly to key/query states"""
-            if key_value_states is None:
-                # self-attn
-                # (batch_size, n_heads, seq_length, dim_per_head)
-                hidden_states = shape(proj_layer(hidden_states))
-            elif past_key_value is None:
-                # cross-attn
-                # (batch_size, n_heads, seq_length, dim_per_head)
-                hidden_states = shape(proj_layer(key_value_states))
-
-            if past_key_value is not None:
-                if key_value_states is None:
-                    # self-attn
-                    # (batch_size, n_heads, key_length, dim_per_head)
-                    hidden_states = torch.cat([past_key_value, hidden_states], dim=2)
-                else:
-                    # cross-attn
-                    hidden_states = past_key_value
-            return hidden_states
-
-        # get query states
-        query_states = shape(self.q(hidden_states))  # (batch_size, n_heads, seq_length, dim_per_head)
-
-        # get key/value states
-        key_states = project(
-            hidden_states, self.k, key_value_states, past_key_value[0] if past_key_value is not None else None
-        )
-        value_states = project(
-            hidden_states, self.v, key_value_states, past_key_value[1] if past_key_value is not None else None
-        )
-
-        # compute scores
-        scores = torch.matmul(
-            query_states, key_states.transpose(3, 2)
-        )  # equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9
-
-        if position_bias is None:
-            if not self.has_relative_attention_bias:
-                position_bias = torch.zeros(
-                    (1, self.n_heads, real_seq_length, key_length), device=scores.device, dtype=scores.dtype
-                )
-                if self.gradient_checkpointing and self.training:
-                    position_bias.requires_grad = True
-            else:
-                position_bias = self.compute_bias(real_seq_length, key_length, device=scores.device)
-
-            # if key and values are already calculated
-            # we want only the last query position bias
-            if past_key_value is not None:
-                position_bias = position_bias[:, :, -hidden_states.size(1) :, :]
-
-            if mask is not None:
-                position_bias = position_bias + mask  # (batch_size, n_heads, seq_length, key_length)
-
-        if self.pruned_heads:
-            mask = torch.ones(position_bias.shape[1])
-            mask[list(self.pruned_heads)] = 0
-            position_bias_masked = position_bias[:, mask.bool()]
-        else:
-            position_bias_masked = position_bias
-
-        scores += position_bias_masked
-        attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as(
-            scores
-        )  # (batch_size, n_heads, seq_length, key_length)
-        attn_weights = nn.functional.dropout(
-            attn_weights, p=self.dropout, training=self.training
-        )  # (batch_size, n_heads, seq_length, key_length)
-
-        # Mask heads if we want to
-        if layer_head_mask is not None:
-            attn_weights = attn_weights * layer_head_mask
-
-        attn_output = unshape(torch.matmul(attn_weights, value_states))  # (batch_size, seq_length, dim)
-        attn_output = self.o(attn_output)
-
-        present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None
-        outputs = (attn_output,) + (present_key_value_state,) + (position_bias,)
-
-        if output_attentions:
-            outputs = outputs + (attn_weights,)
-        return outputs
-
-
-class T5LayerSelfAttention(nn.Module):
-    def __init__(self, config, has_relative_attention_bias=False):
-        super().__init__()
-        self.SelfAttention = T5Attention(config, has_relative_attention_bias=has_relative_attention_bias)
-        self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
-        self.dropout = nn.Dropout(config.dropout_rate)
-
-    def forward(
-        self,
-        hidden_states,
-        attention_mask=None,
-        position_bias=None,
-        layer_head_mask=None,
-        past_key_value=None,
-        use_cache=False,
-        output_attentions=False,
-    ):
-        normed_hidden_states = self.layer_norm(hidden_states)
-        attention_output = self.SelfAttention(
-            normed_hidden_states,
-            mask=attention_mask,
-            position_bias=position_bias,
-            layer_head_mask=layer_head_mask,
-            past_key_value=past_key_value,
-            use_cache=use_cache,
-            output_attentions=output_attentions,
-        )
-        hidden_states = hidden_states + self.dropout(attention_output[0])
-        outputs = (hidden_states,) + attention_output[1:]  # add attentions if we output them
-        return outputs
-
-
-class T5LayerCrossAttention(nn.Module):
-    def __init__(self, config, is_vision_cross_attention=False):
-        super().__init__()
-        self.EncDecAttention = T5Attention(
-            config, has_relative_attention_bias=False, is_vision_cross_attention=is_vision_cross_attention
-        )
-        self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
-        self.dropout = nn.Dropout(config.dropout_rate)
-
-        self.is_vision_cross_attention = is_vision_cross_attention
-        if is_vision_cross_attention:
-            self.act = nn.Tanh()
-            if config.alpha_initializer == "zeros":
-                if config.alpha_type == "vector":
-                    self.alpha_cross_attn = nn.Parameter(torch.zeros(1, 1, config.d_model))
-                elif config.alpha_type == "float":
-                    self.alpha_cross_attn = nn.Parameter(torch.zeros(1))
-                else:
-                    raise ValueError(f"Unknown value for `alpha_type` ({config.alpha_type})")
-
-            elif config.alpha_initializer == "ones":
-                if config.alpha_type == "vector":
-                    self.alpha_cross_attn = nn.Parameter(torch.ones(1, 1, config.d_model))
-                elif config.alpha_type == "float":
-                    self.alpha_cross_attn = nn.Parameter(torch.ones(1))
-                else:
-                    raise ValueError(f"Unknown value for `alpha_type` ({config.alpha_type})")
-
-            elif config.alpha_initializer in {"normal", "gaussian", "random"}:
-                if config.alpha_type == "vector":
-                    self.alpha_cross_attn = nn.Parameter(
-                        torch.normal(mean=0.0, std=config.alphas_initializer_range, size=(1, 1, config.d_model))
-                    )
-                elif config.alpha_type == "float":
-                    self.alpha_cross_attn = nn.Parameter(
-                        torch.normal(mean=0.0, std=config.alphas_initializer_range, size=(1))
-                    )
-                else:
-                    raise ValueError(f"Unknown value for `alpha_type` ({config.alpha_type})")
-
-            else:
-                raise NotImplementedError(
-                    f"Alpha initialization scheme {config.alpha_initializer} not yet implemented!"
-                )
-
-    def forward(
-        self,
-        hidden_states,
-        key_value_states,
-        attention_mask=None,
-        position_bias=None,
-        layer_head_mask=None,
-        past_key_value=None,
-        use_cache=False,
-        query_length=None,
-        output_attentions=False,
-    ):
-        normed_hidden_states = self.layer_norm(hidden_states)
-        attention_output = self.EncDecAttention(
-            normed_hidden_states,
-            mask=attention_mask,
-            key_value_states=key_value_states,
-            position_bias=position_bias,
-            layer_head_mask=layer_head_mask,
-            past_key_value=past_key_value,
-            use_cache=use_cache,
-            query_length=query_length,
-            output_attentions=output_attentions,
-        )
-        if not self.is_vision_cross_attention:
-            layer_output = hidden_states + self.dropout(attention_output[0])
-        else:
-            layer_output = hidden_states + self.dropout(self.act(self.alpha_cross_attn) * attention_output[0])
-        outputs = (layer_output,) + attention_output[1:]  # add attentions if we output them
-        return outputs
-
-
-class T5Block(nn.Module):
-    def __init__(self, config, has_relative_attention_bias=False):
-        super().__init__()
-        self.is_decoder = config.is_decoder
-        self.layer = nn.ModuleList()
-        self.layer.append(T5LayerSelfAttention(config, has_relative_attention_bias=has_relative_attention_bias))
-        if self.is_decoder:
-            self.layer.append(T5LayerCrossAttention(config))
-
-        self.layer.append(T5LayerFF(config))
-
-    def forward(
-        self,
-        hidden_states,
-        attention_mask=None,
-        position_bias=None,
-        encoder_hidden_states=None,
-        encoder_attention_mask=None,
-        encoder_decoder_position_bias=None,
-        layer_head_mask=None,
-        cross_attn_layer_head_mask=None,
-        past_key_value=None,
-        use_cache=False,
-        output_attentions=False,
-        return_dict=True,
-    ):
-        if past_key_value is not None:
-            if not self.is_decoder:
-                logger.warning("`past_key_values` is passed to the encoder. Please make sure this is intended.")
-            expected_num_past_key_values = 2 if encoder_hidden_states is None else 4
-
-            if len(past_key_value) != expected_num_past_key_values:
-                raise ValueError(
-                    f"There should be {expected_num_past_key_values} past states. "
-                    f"{'2 (past / key) for cross attention. ' if expected_num_past_key_values == 4 else ''}"
-                    f"Got {len(past_key_value)} past key / value states"
-                )
-
-            self_attn_past_key_value = past_key_value[:2]
-            cross_attn_past_key_value = past_key_value[2:]
-        else:
-            self_attn_past_key_value, cross_attn_past_key_value = None, None
-
-        self_attention_outputs = self.layer[0](
-            hidden_states,
-            attention_mask=attention_mask,
-            position_bias=position_bias,
-            layer_head_mask=layer_head_mask,
-            past_key_value=self_attn_past_key_value,
-            use_cache=use_cache,
-            output_attentions=output_attentions,
-        )
-        hidden_states, present_key_value_state = self_attention_outputs[:2]
-        attention_outputs = self_attention_outputs[2:]  # Keep self-attention outputs and relative position weights
-
-        # clamp inf values to enable fp16 training
-        if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any():
-            clamp_value = torch.finfo(hidden_states.dtype).max - 1000
-            hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
-
-        do_cross_attention = self.is_decoder and encoder_hidden_states is not None
-        if do_cross_attention:
-            # the actual query length is unknown for cross attention
-            # if using past key value states. Need to inject it here
-            if present_key_value_state is not None:
-                query_length = present_key_value_state[0].shape[2]
-            else:
-                query_length = None
-
-            cross_attention_outputs = self.layer[1](
-                hidden_states,
-                key_value_states=encoder_hidden_states,
-                attention_mask=encoder_attention_mask,
-                position_bias=encoder_decoder_position_bias,
-                layer_head_mask=cross_attn_layer_head_mask,
-                past_key_value=cross_attn_past_key_value,
-                query_length=query_length,
-                use_cache=use_cache,
-                output_attentions=output_attentions,
-            )
-            hidden_states = cross_attention_outputs[0]
-
-            # clamp inf values to enable fp16 training
-            if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any():
-                clamp_value = torch.finfo(hidden_states.dtype).max - 1000
-                hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
-
-            # Combine self attn and cross attn key value states
-            if present_key_value_state is not None:
-                present_key_value_state = present_key_value_state + cross_attention_outputs[1]
-
-            # Keep cross-attention outputs and relative position weights
-            attention_outputs = attention_outputs + cross_attention_outputs[2:]
-
-        # Apply Feed Forward layer
-        hidden_states = self.layer[-1](hidden_states)
-
-        # clamp inf values to enable fp16 training
-        if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any():
-            clamp_value = torch.finfo(hidden_states.dtype).max - 1000
-            hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
-
-        outputs = (hidden_states,)
-
-        if use_cache:
-            outputs = outputs + (present_key_value_state,) + attention_outputs
-        else:
-            outputs = outputs + attention_outputs
-
-        return outputs  # hidden-states, present_key_value_states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights)
-
-
-class VT5GatedCrossAttentionBlock(nn.Module):
-    """Implementing the gated cross attention from the text LM to the vision encoder output."""
-
-    def __init__(self, config, has_relative_attention_bias=False):
-        super().__init__()
-        self.is_decoder = config.is_decoder
-        self.layer = nn.ModuleList()
-        self.layer.append(T5LayerCrossAttention(config, is_vision_cross_attention=True))
-        self.layer.append(T5LayerFF(config, is_vision_cross_attention=True))
-
-    def forward(
-        self,
-        hidden_states,
-        encoder_hidden_states=None,
-        encoder_attention_mask=None,
-        encoder_decoder_position_bias=None,
-        cross_attn_layer_head_mask=None,
-        use_cache=False,
-        output_attentions=False,
-        return_dict=True,
-    ):
-        cross_attention_outputs = self.layer[0](
-            hidden_states,
-            key_value_states=encoder_hidden_states,
-            attention_mask=encoder_attention_mask,
-            position_bias=encoder_decoder_position_bias,
-            layer_head_mask=cross_attn_layer_head_mask,
-            use_cache=use_cache,
-            output_attentions=output_attentions,
-        )
-        hidden_states, _ = cross_attention_outputs[
-            :2
-        ]  # In the standard case, `_` would be `present_key_value_state`. But in the case of vision cross attention for the lm encoder, `present_key_value_state` is always None. So I am directly simplifying the logic by simply removing it and setting it to None where it should be None.
-
-        # clamp inf values to enable fp16 training
-        if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any():
-            clamp_value = torch.finfo(hidden_states.dtype).max - 1000
-            hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
-
-        # Apply Feed Forward layer
-        hidden_states = self.layer[-1](hidden_states)
-
-        # clamp inf values to enable fp16 training
-        if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any():
-            clamp_value = torch.finfo(hidden_states.dtype).max - 1000
-            hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
-
-        outputs = (hidden_states,)
-
-        if use_cache:
-            outputs = outputs + (None,) + cross_attention_outputs[2:]
-        else:
-            outputs = outputs + cross_attention_outputs[2:]
-
-        return outputs
-
-
-class VT5PreTrainedModel(VLOOMPreTrainedModelBase):
-    """
-    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
-    models.
-    """
-
-    config_class = VT5Config
-    load_tf_weights = load_tf_weights_in_t5
-    base_model_prefix = "transformer"
-    is_parallelizable = True
-    supports_gradient_checkpointing = True
-    _no_split_modules = ["T5Block"]
-
-    @property
-    def dummy_inputs(self):
-        input_ids = torch.tensor(DUMMY_INPUTS)
-        input_mask = torch.tensor(DUMMY_MASK)
-        dummy_inputs = {
-            "decoder_input_ids": input_ids,
-            "input_ids": input_ids,
-            "decoder_attention_mask": input_mask,
-        }
-        return dummy_inputs
-
-    def _init_weights(self, module):
-        """Initialize the weights"""
-        factor = self.config.initializer_factor  # Used for testing weights initialization
-        if isinstance(module, T5LayerNorm):
-            module.weight.data.fill_(factor * 1.0)
-        elif isinstance(module, (VT5Model, VT5ForConditionalGeneration)):
-            # Mesh TensorFlow embeddings initialization
-            # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L1624
-            module.shared.weight.data.normal_(mean=0.0, std=factor * 1.0)
-            if hasattr(module, "lm_head") and not self.config.tie_word_embeddings:
-                module.lm_head.weight.data.normal_(mean=0.0, std=factor * 1.0)
-        elif isinstance(module, T5DenseActDense):
-            # Mesh TensorFlow FF initialization
-            # See https://github.com/tensorflow/mesh/blob/master/mesh_tensorflow/transformer/transformer_layers.py#L56
-            # and https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L89
-            module.wi.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5))
-            if hasattr(module.wi, "bias") and module.wi.bias is not None:
-                module.wi.bias.data.zero_()
-            module.wo.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_ff) ** -0.5))
-            if hasattr(module.wo, "bias") and module.wo.bias is not None:
-                module.wo.bias.data.zero_()
-        elif isinstance(module, T5DenseGatedActDense):
-            module.wi_0.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5))
-            if hasattr(module.wi_0, "bias") and module.wi_0.bias is not None:
-                module.wi_0.bias.data.zero_()
-            module.wi_1.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5))
-            if hasattr(module.wi_1, "bias") and module.wi_1.bias is not None:
-                module.wi_1.bias.data.zero_()
-            module.wo.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_ff) ** -0.5))
-            if hasattr(module.wo, "bias") and module.wo.bias is not None:
-                module.wo.bias.data.zero_()
-        elif isinstance(module, T5Attention):
-            # Mesh TensorFlow attention initialization to avoid scaling before softmax
-            # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/attention.py#L136
-            d_model = self.config.d_model
-            key_value_proj_dim = self.config.d_kv
-            n_heads = self.config.num_heads
-            module.q.weight.data.normal_(mean=0.0, std=factor * ((d_model * key_value_proj_dim) ** -0.5))
-            module.k.weight.data.normal_(mean=0.0, std=factor * (d_model**-0.5))
-            module.v.weight.data.normal_(mean=0.0, std=factor * (d_model**-0.5))
-            module.o.weight.data.normal_(mean=0.0, std=factor * ((n_heads * key_value_proj_dim) ** -0.5))
-            if module.has_relative_attention_bias:
-                module.relative_attention_bias.weight.data.normal_(mean=0.0, std=factor * ((d_model) ** -0.5))
-
-    def _set_gradient_checkpointing(self, module, value=False):
-        if isinstance(module, (T5Attention, VT5Stack)):
-            module.gradient_checkpointing = value
-
-    @classmethod
-    def override_vision_model_wrapper(cls, model, config, vision_model_name, vision_model_params, torch_dtype):
-        # this can be called via from_pretrained from a class w/ head or w/o head so we extract the beheaded model version
-        beheaded_model = model.encoder
-        cls.override_vision_model(beheaded_model, vision_model_name, vision_model_params, torch_dtype)
-        beheaded_model.freeze_relevant_params(config)
-
-    def _shift_right(self, input_ids):
-        decoder_start_token_id = self.config.decoder_start_token_id
-        pad_token_id = self.config.pad_token_id
-
-        assert decoder_start_token_id is not None, (
-            "self.model.config.decoder_start_token_id has to be defined. In T5 it is usually set to the pad_token_id."
-            " See T5 docs for more information"
-        )
-
-        # shift inputs to the right
-        if is_torch_fx_proxy(input_ids):
-            # Item assignment is not supported natively for proxies.
-            shifted_input_ids = torch.full(input_ids.shape[:-1] + (1,), decoder_start_token_id)
-            shifted_input_ids = torch.cat([shifted_input_ids, input_ids[..., :-1]], dim=-1)
-        else:
-            shifted_input_ids = input_ids.new_zeros(input_ids.shape)
-            shifted_input_ids[..., 1:] = input_ids[..., :-1].clone()
-            shifted_input_ids[..., 0] = decoder_start_token_id
-
-        assert pad_token_id is not None, "self.model.config.pad_token_id has to be defined."
-        # replace possible -100 values in labels by `pad_token_id`
-        shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)
-
-        return shifted_input_ids
-
-
-class VT5Stack(VT5PreTrainedModel):
-    def __init__(self, config, embed_tokens=None, vision_model=None):
-        super().__init__(config)
-
-        self.embed_tokens = embed_tokens
-        self.is_decoder = config.is_decoder
-
-        self.block = nn.ModuleList(
-            [T5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)]
-        )
-        self.final_layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
-        self.dropout = nn.Dropout(config.dropout_rate)
-
-        if not self.is_decoder:
-            self.cross_layer_interval = config.cross_layer_interval
-            num_cross_layers = config.num_layers // self.cross_layer_interval
-            self.gated_cross_attn_layers = nn.ModuleList(
-                [VT5GatedCrossAttentionBlock(config) for i in range(num_cross_layers)]
-            )
-
-        # Model parallel
-        self.model_parallel = False
-        self.device_map = None
-        self.gradient_checkpointing = False
-
-        # Load an uninitialized model and later in from_pretrained will load the pre-trained model -
-        # this solves the losing of weights in `from_pretrained` on the main model
-        if not self.is_decoder:
-            self.vision_model = vision_model
-
-        # Initialize weights and apply final processing
-        self.post_init()
-
-        self.freeze_relevant_params(config)
-
-    def freeze_relevant_params(self, config=None):
-        if config is None:
-            config = self.config
-
-        if config.freeze_text_layers:
-            self.freeze_text_layers()
-
-        if not self.is_decoder and config.freeze_vision_layers:
-            freeze_model(self.vision_model)
-
-    def freeze_text_layers(self):
-        for module in [self.block, self.final_layer_norm]:
-            freeze_model(module)
-
-    @add_start_docstrings(PARALLELIZE_DOCSTRING)
-    def parallelize(self, device_map=None):
-        # Check validity of device_map
-        self.device_map = (
-            get_device_map(len(self.block), range(torch.cuda.device_count())) if device_map is None else device_map
-        )
-        assert_device_map(self.device_map, len(self.block))
-        self.model_parallel = True
-        self.first_device = "cpu" if "cpu" in self.device_map.keys() else "cuda:" + str(min(self.device_map.keys()))
-        self.last_device = "cuda:" + str(max(self.device_map.keys()))
-        # Load onto devices
-        for k, v in self.device_map.items():
-            for layer in v:
-                cuda_device = "cuda:" + str(k)
-                self.block[layer] = self.block[layer].to(cuda_device)
-
-        # Set embed_tokens to first layer
-        self.embed_tokens = self.embed_tokens.to(self.first_device)
-        # Set final layer norm to last device
-        self.final_layer_norm = self.final_layer_norm.to(self.last_device)
-
-    @add_start_docstrings(PARALLELIZE_DOCSTRING)
-    def deparallelize(self):
-        self.model_parallel = False
-        self.device_map = None
-        self.first_device = "cpu"
-        self.last_device = "cpu"
-        for i in range(len(self.block)):
-            self.block[i] = self.block[i].to("cpu")
-        self.embed_tokens = self.embed_tokens.to("cpu")
-        self.final_layer_norm = self.final_layer_norm.to("cpu")
-        torch.cuda.empty_cache()
-
-    def get_input_embeddings(self):
-        return self.embed_tokens
-
-    def set_input_embeddings(self, new_embeddings):
-        self.embed_tokens = new_embeddings
-
-    def forward(
-        self,
-        input_ids=None,
-        attention_mask=None,
-        encoder_hidden_states=None,
-        encoder_attention_mask=None,
-        inputs_embeds=None,
-        pixel_values: Optional[torch.FloatTensor] = None,
-        image_attention_mask: Optional[torch.Tensor] = None,
-        head_mask=None,
-        cross_attn_head_mask=None,
-        past_key_values=None,
-        use_cache=None,
-        output_attentions=None,
-        output_hidden_states=None,
-        return_dict=None,
-    ):
-        # Model parallel
-        if self.model_parallel:
-            torch.cuda.set_device(self.first_device)
-            self.embed_tokens = self.embed_tokens.to(self.first_device)
-        use_cache = use_cache if use_cache is not None else self.config.use_cache
-        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
-        output_hidden_states = (
-            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
-        )
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
-        if input_ids is not None and inputs_embeds is not None:
-            err_msg_prefix = "decoder_" if self.is_decoder else ""
-            raise ValueError(
-                f"You cannot specify both {err_msg_prefix}input_ids and {err_msg_prefix}inputs_embeds at the same time"
-            )
-        elif input_ids is not None:
-            input_shape = input_ids.size()
-            input_ids = input_ids.view(-1, input_shape[-1])
-        elif inputs_embeds is not None:
-            input_shape = inputs_embeds.size()[:-1]
-        else:
-            err_msg_prefix = "decoder_" if self.is_decoder else ""
-            raise ValueError(f"You have to specify either {err_msg_prefix}input_ids or {err_msg_prefix}inputs_embeds")
-
-        if inputs_embeds is None:
-            assert self.embed_tokens is not None, "You have to initialize the model with valid token embeddings"
-            inputs_embeds = self.embed_tokens(input_ids)
-
-        batch_size, seq_length = input_shape
-
-        # required mask seq length can be calculated via length of past
-        mask_seq_length = past_key_values[0][0].shape[2] + seq_length if past_key_values is not None else seq_length
-
-        if use_cache is True:
-            assert self.is_decoder, f"`use_cache` can only be set to `True` if {self} is used as a decoder"
-
-        if attention_mask is None:
-            attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device)
-        if self.is_decoder and encoder_attention_mask is None and encoder_hidden_states is not None:
-            encoder_seq_length = encoder_hidden_states.shape[1]
-            encoder_attention_mask = torch.ones(
-                batch_size, encoder_seq_length, device=inputs_embeds.device, dtype=torch.long
-            )
-
-        # initialize past_key_values with `None` if past does not exist
-        if past_key_values is None:
-            past_key_values = [None] * len(self.block)
-
-        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
-        # ourselves in which case we just need to make it broadcastable to all heads.
-        extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape)
-
-        # If a 2D or 3D attention mask is provided for the cross-attention
-        # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
-        if self.is_decoder and encoder_hidden_states is not None:
-            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
-            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
-            if encoder_attention_mask is None:
-                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=inputs_embeds.device)
-            encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
-        else:
-            encoder_extended_attention_mask = None
-
-        # Prepare head mask if needed
-        head_mask = self.get_head_mask(head_mask, self.config.num_layers)
-        cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask, self.config.num_layers)
-        present_key_value_states = () if use_cache else None
-        all_hidden_states = () if output_hidden_states else None
-        all_attentions = () if output_attentions else None
-        all_cross_attentions = () if (output_attentions and self.is_decoder) else None
-        position_bias = None
-        encoder_decoder_position_bias = None
-
-        hidden_states = self.dropout(inputs_embeds)
-
-        if not self.is_decoder:
-            pixel_values = pixel_values.to(dtype=self.dtype)  # fp16 compatibility
-            batch_size, num_images = pixel_values.size(0), pixel_values.size(1)
-            pixel_values = pixel_values.contiguous().view(batch_size * num_images, *pixel_values.shape[2:])
-            # Get sequence from the vision encoder
-            image_hidden_states = self.vision_model(pixel_values=pixel_values).last_hidden_state
-            image_seq_len, image_hidden_size = image_hidden_states.size(1), image_hidden_states.size(2)
-            image_hidden_states = image_hidden_states.view(batch_size, num_images * image_seq_len, image_hidden_size)
-            # Make image_attention_mask compatible with hidden states
-            # TODO: these steps are overly complex -> refactor
-            text_seq_len = image_attention_mask.size(1)
-            image_attention_mask = image_attention_mask.unsqueeze(-1)
-            image_attention_mask = image_attention_mask.repeat(1, 1, 1, image_seq_len)
-            image_attention_mask = image_attention_mask.view(batch_size, text_seq_len, num_images * image_seq_len)
-
-            if image_hidden_states is not None:
-                image_batch_size, image_sequence_length, _ = image_hidden_states.size()
-                image_hidden_shape = (image_batch_size, image_sequence_length)
-                if image_attention_mask is None:
-                    image_attention_mask = torch.ones(image_hidden_shape, device=hidden_states.device)
-                extended_image_attention_mask = self.invert_attention_mask(image_attention_mask)
-            else:
-                extended_image_attention_mask = None
-
-        for i, (layer_module, past_key_value) in enumerate(zip(self.block, past_key_values)):
-            layer_head_mask = head_mask[i]
-            cross_attn_layer_head_mask = cross_attn_head_mask[i]
-            # Model parallel
-            if self.model_parallel:
-                torch.cuda.set_device(hidden_states.device)
-                # Ensure that attention_mask is always on the same device as hidden_states
-                if attention_mask is not None:
-                    attention_mask = attention_mask.to(hidden_states.device)
-                if position_bias is not None:
-                    position_bias = position_bias.to(hidden_states.device)
-                if encoder_hidden_states is not None:
-                    encoder_hidden_states = encoder_hidden_states.to(hidden_states.device)
-                if encoder_extended_attention_mask is not None:
-                    encoder_extended_attention_mask = encoder_extended_attention_mask.to(hidden_states.device)
-                if encoder_decoder_position_bias is not None:
-                    encoder_decoder_position_bias = encoder_decoder_position_bias.to(hidden_states.device)
-                if layer_head_mask is not None:
-                    layer_head_mask = layer_head_mask.to(hidden_states.device)
-                if cross_attn_layer_head_mask is not None:
-                    cross_attn_layer_head_mask = cross_attn_layer_head_mask.to(hidden_states.device)
-            if output_hidden_states:
-                all_hidden_states = all_hidden_states + (hidden_states,)
-
-            def vblock(
-                main_block,
-                hidden_states,
-                attention_mask,
-                position_bias,
-                encoder_hidden_states,
-                encoder_extended_attention_mask,
-                encoder_decoder_position_bias,
-                layer_head_mask,
-                cross_attn_layer_head_mask,
-                past_key_value,
-                use_cache,
-                output_attentions,
-                image_hidden_states,
-                image_attention_mask,
-                layer_idx,
-                cross_layer_interval,
-                gated_cross_attn_layers,
-                is_decoder,
-            ):
-                if not is_decoder and (layer_idx % cross_layer_interval == 0):
-                    xblock = gated_cross_attn_layers[layer_idx // cross_layer_interval]
-                    outputs = xblock(
-                        hidden_states,
-                        encoder_hidden_states=image_hidden_states,
-                        encoder_attention_mask=image_attention_mask,
-                        encoder_decoder_position_bias=None,
-                        use_cache=use_cache,
-                        output_attentions=output_attentions,
-                    )
-                    hidden_states = outputs[0]
-
-                layer_outputs = main_block(
-                    hidden_states,
-                    attention_mask=attention_mask,
-                    position_bias=position_bias,
-                    encoder_hidden_states=encoder_hidden_states,
-                    encoder_attention_mask=encoder_extended_attention_mask,
-                    encoder_decoder_position_bias=encoder_decoder_position_bias,
-                    layer_head_mask=layer_head_mask,
-                    cross_attn_layer_head_mask=cross_attn_layer_head_mask,
-                    past_key_value=past_key_value,
-                    use_cache=use_cache,
-                    output_attentions=output_attentions,
-                )
-
-                return layer_outputs
-
-            if self.is_decoder:
-                self.cross_layer_interval = None
-                self.gated_cross_attn_layers = None
-                image_hidden_states = None
-                extended_image_attention_mask = None
-
-            if self.gradient_checkpointing and self.training:
-                # past_key_value is always None with gradient checkpointing
-                past_key_value = None
-                if use_cache:
-                    logger.warning_once(
-                        "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
-                    )
-                    use_cache = False
-
-                layer_outputs = checkpoint(
-                    vblock,
-                    layer_module,
-                    hidden_states,
-                    extended_attention_mask,
-                    position_bias,
-                    encoder_hidden_states,
-                    encoder_extended_attention_mask,
-                    encoder_decoder_position_bias,
-                    head_mask[i],
-                    cross_attn_layer_head_mask,
-                    past_key_value,
-                    use_cache,
-                    output_attentions,
-                    image_hidden_states,
-                    extended_image_attention_mask,
-                    i,
-                    self.cross_layer_interval,
-                    self.gated_cross_attn_layers,
-                    self.is_decoder,
-                )
-            else:
-                layer_outputs = vblock(
-                    layer_module,
-                    hidden_states,
-                    attention_mask=extended_attention_mask,
-                    position_bias=position_bias,
-                    encoder_hidden_states=encoder_hidden_states,
-                    encoder_extended_attention_mask=encoder_extended_attention_mask,
-                    encoder_decoder_position_bias=encoder_decoder_position_bias,
-                    layer_head_mask=head_mask[i],
-                    cross_attn_layer_head_mask=cross_attn_layer_head_mask,
-                    past_key_value=past_key_value,
-                    use_cache=use_cache,
-                    output_attentions=output_attentions,
-                    image_hidden_states=image_hidden_states,
-                    image_attention_mask=extended_image_attention_mask,
-                    layer_idx=i,
-                    cross_layer_interval=self.cross_layer_interval,
-                    gated_cross_attn_layers=self.gated_cross_attn_layers,
-                    is_decoder=self.is_decoder,
-                )
-
-            # layer_outputs is a tuple with:
-            # hidden-states, key-value-states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights)
-            if use_cache is False:
-                layer_outputs = layer_outputs[:1] + (None,) + layer_outputs[1:]
-
-            hidden_states, present_key_value_state = layer_outputs[:2]
-
-            # We share the position biases between the layers - the first layer store them
-            # layer_outputs = hidden-states, key-value-states (self-attention position bias), (self-attention weights),
-            # (cross-attention position bias), (cross-attention weights)
-            position_bias = layer_outputs[2]
-            if self.is_decoder and encoder_hidden_states is not None:
-                encoder_decoder_position_bias = layer_outputs[4 if output_attentions else 3]
-            # append next layer key value states
-            if use_cache:
-                present_key_value_states = present_key_value_states + (present_key_value_state,)
-
-            if output_attentions:
-                all_attentions = all_attentions + (layer_outputs[3],)
-                if self.is_decoder:
-                    all_cross_attentions = all_cross_attentions + (layer_outputs[5],)
-
-            # Model Parallel: If it's the last layer for that device, put things on the next device
-            if self.model_parallel:
-                for k, v in self.device_map.items():
-                    if i == v[-1] and "cuda:" + str(k) != self.last_device:
-                        hidden_states = hidden_states.to("cuda:" + str(k + 1))
-
-        hidden_states = self.final_layer_norm(hidden_states)
-        hidden_states = self.dropout(hidden_states)
-
-        # Add last layer
-        if output_hidden_states:
-            all_hidden_states = all_hidden_states + (hidden_states,)
-
-        if not return_dict:
-            return tuple(
-                v
-                for v in [
-                    hidden_states,
-                    present_key_value_states,
-                    all_hidden_states,
-                    all_attentions,
-                    all_cross_attentions,
-                ]
-                if v is not None
-            )
-        return BaseModelOutputWithPastAndCrossAttentions(
-            last_hidden_state=hidden_states,
-            past_key_values=present_key_value_states,
-            hidden_states=all_hidden_states,
-            attentions=all_attentions,
-            cross_attentions=all_cross_attentions,
-        )
-
-
-T5_START_DOCSTRING = r"""
-
-    The T5 model was proposed in [Exploring the Limits of Transfer Learning with a Unified Text-to-Text
-    Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan
-    Narang, Michael Matena, Yanqi Zhou, Wei Li, Peter J. Liu. It's an encoder decoder transformer pre-trained in a
-    text-to-text denoising generative setting.
-
-    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
-    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
-    etc.)
-
-    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
-    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
-    and behavior.
-
-    Parameters:
-        config ([`T5Config`]): Model configuration class with all the parameters of the model.
-            Initializing with a config file does not load the weights associated with the model, only the
-            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
-"""
-
-T5_INPUTS_DOCSTRING = r"""
-    Args:
-        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
-            Indices of input sequence tokens in the vocabulary. T5 is a model with relative position embeddings so you
-            should be able to pad the inputs on both the right and the left.
-
-            Indices can be obtained using [`T5Tokenizer`]. See [`PreTrainedTokenizer.encode`] and
-            [`PreTrainedTokenizer.__call__`] for detail.
-
-            [What are input IDs?](../glossary#input-ids)
-
-            To know more on how to prepare `input_ids` for pretraining take a look a [T5 Training](./t5#training).
-        attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
-
-            - 1 for tokens that are **not masked**,
-            - 0 for tokens that are **masked**.
-
-            [What are attention masks?](../glossary#attention-mask)
-        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
-            Indices of decoder input sequence tokens in the vocabulary.
-
-            Indices can be obtained using [`T5Tokenizer`]. See [`PreTrainedTokenizer.encode`] and
-            [`PreTrainedTokenizer.__call__`] for details.
-
-            [What are decoder input IDs?](../glossary#decoder-input-ids)
-
-            T5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
-            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).
-
-            To know more on how to prepare `decoder_input_ids` for pretraining take a look at [T5
-            Training](./t5#training).
-        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
-            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
-            be used by default.
-        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
-            Mask to nullify selected heads of the self-attention modules in the encoder. Mask values selected in `[0,
-            1]`:
-
-            - 1 indicates the head is **not masked**,
-            - 0 indicates the head is **masked**.
-
-        decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
-            Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0,
-            1]`:
-
-            - 1 indicates the head is **not masked**,
-            - 0 indicates the head is **masked**.
-
-        cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
-                Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
-                `[0, 1]`:
-
-                - 1 indicates the head is **not masked**,
-                - 0 indicates the head is **masked**.
-
-        encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*):
-            Tuple consists of (`last_hidden_state`, `optional`: *hidden_states*, `optional`: *attentions*)
-            `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)` is a sequence of hidden states at
-            the output of the last layer of the encoder. Used in the cross-attention of the decoder.
-        past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
-            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
-
-            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
-            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
-            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
-        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
-            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
-            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
-            model's internal embedding lookup matrix.
-        decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, target_sequence_length, hidden_size)`, *optional*):
-            Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded
-            representation. If `past_key_values` is used, optionally only the last `decoder_inputs_embeds` have to be
-            input (see `past_key_values`). This is useful if you want more control over how to convert
-            `decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix.
-
-            If `decoder_input_ids` and `decoder_inputs_embeds` are both unset, `decoder_inputs_embeds` takes the value
-            of `inputs_embeds`.
-
-        use_cache (`bool`, *optional*):
-            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
-            `past_key_values`).
-
-        output_attentions (`bool`, *optional*):
-            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
-            tensors for more detail.
-        output_hidden_states (`bool`, *optional*):
-            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
-            more detail.
-        return_dict (`bool`, *optional*):
-            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
-"""
-
-T5_ENCODER_INPUTS_DOCSTRING = r"""
-    Args:
-        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
-            Indices of input sequence tokens in the vocabulary. T5 is a model with relative position embeddings so you
-            should be able to pad the inputs on both the right and the left.
-
-            Indices can be obtained using [`T5Tokenizer`]. See [`PreTrainedTokenizer.encode`] and
-            [`PreTrainedTokenizer.__call__`] for detail.
-
-            To know more on how to prepare `input_ids` for pretraining take a look a [T5 Training](./t5#training).
-        attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
-
-            - 1 for tokens that are **not masked**,
-            - 0 for tokens that are **masked**.
-
-            [What are attention masks?](../glossary#attention-mask)
-        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
-            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
-
-            - 1 indicates the head is **not masked**,
-            - 0 indicates the head is **masked**.
-
-        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
-            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
-            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
-            model's internal embedding lookup matrix.
-        output_attentions (`bool`, *optional*):
-            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
-            tensors for more detail.
-        output_hidden_states (`bool`, *optional*):
-            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
-            more detail.
-        return_dict (`bool`, *optional*):
-            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
-"""
-
-# Warning message for FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask
-__HEAD_MASK_WARNING_MSG = """
-The input argument `head_mask` was split into two arguments `head_mask` and `decoder_head_mask`. Currently,
-`decoder_head_mask` is set to copy `head_mask`, but this feature is deprecated and will be removed in future versions.
-If you do not want to use any `decoder_head_mask` now, please set `decoder_head_mask = torch.ones(num_layers,
-num_heads)`.
-"""
-
-
-@add_start_docstrings(
-    "The bare T5 Model transformer outputting raw hidden-states without any specific head on top.",
-    T5_START_DOCSTRING,
-)
-class VT5Model(VT5PreTrainedModel):
-    _keys_to_ignore_on_load_missing = [
-        r"encoder.embed_tokens.weight",
-        r"decoder.embed_tokens.weight",
-    ]
-    _keys_to_ignore_on_load_unexpected = [
-        r"decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight",
-    ]
-
-    def __init__(self, config: T5Config, vision_model=None):
-        super().__init__(config)
-        self.shared = DecoupledEmbedding(
-            num_embeddings=config.vocab_size,
-            num_additional_embeddings=config.additional_vocab_size,
-            embedding_dim=config.d_model,
-            partially_freeze=config.freeze_text_layers,
-        )
-
-        encoder_config = copy.deepcopy(config)
-        encoder_config.is_decoder = False
-        encoder_config.use_cache = False
-        encoder_config.is_encoder_decoder = False
-        self.encoder = VT5Stack(encoder_config, self.shared, vision_model)
-
-        decoder_config = copy.deepcopy(config)
-        decoder_config.is_decoder = True
-        decoder_config.is_encoder_decoder = False
-        decoder_config.num_layers = config.num_decoder_layers
-        self.decoder = VT5Stack(decoder_config, self.shared)
-
-        # Initialize weights and apply final processing
-        self.post_init()
-
-        # Model parallel
-        self.model_parallel = False
-        self.device_map = None
-
-        self.freeze_relevant_params(config)
-
-    def freeze_relevant_params(self, config=None):
-        self.encoder.freeze_relevant_params(config)
-        self.decoder.freeze_relevant_params(config)
-
-    @add_start_docstrings(PARALLELIZE_DOCSTRING)
-    def parallelize(self, device_map=None):
-        self.device_map = (
-            get_device_map(len(self.encoder.block), range(torch.cuda.device_count()))
-            if device_map is None
-            else device_map
-        )
-        assert_device_map(self.device_map, len(self.encoder.block))
-        self.encoder.parallelize(self.device_map)
-        self.decoder.parallelize(self.device_map)
-        self.model_parallel = True
-
-    @add_start_docstrings(DEPARALLELIZE_DOCSTRING)
-    def deparallelize(self):
-        self.encoder.deparallelize()
-        self.decoder.deparallelize()
-        self.encoder = self.encoder.to("cpu")
-        self.decoder = self.decoder.to("cpu")
-        self.model_parallel = False
-        self.device_map = None
-        torch.cuda.empty_cache()
-
-    def get_input_embeddings(self):
-        return self.shared
-
-    def set_input_embeddings(self, new_embeddings):
-        self.shared = new_embeddings
-        self.encoder.set_input_embeddings(new_embeddings)
-        self.decoder.set_input_embeddings(new_embeddings)
-
-    def get_encoder(self):
-        return self.encoder
-
-    def get_decoder(self):
-        return self.decoder
-
-    def _prune_heads(self, heads_to_prune):
-        """
-        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
-        class PreTrainedModel
-        """
-        for layer, heads in heads_to_prune.items():
-            self.encoder.layer[layer].attention.prune_heads(heads)
-
-    @add_start_docstrings_to_model_forward(T5_INPUTS_DOCSTRING)
-    @replace_return_docstrings(output_type=Seq2SeqModelOutput, config_class=_CONFIG_FOR_DOC)
-    def forward(
-        self,
-        input_ids: Optional[torch.LongTensor] = None,
-        attention_mask: Optional[torch.FloatTensor] = None,
-        decoder_input_ids: Optional[torch.LongTensor] = None,
-        decoder_attention_mask: Optional[torch.BoolTensor] = None,
-        head_mask: Optional[torch.FloatTensor] = None,
-        decoder_head_mask: Optional[torch.FloatTensor] = None,
-        cross_attn_head_mask: Optional[torch.Tensor] = None,
-        encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
-        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-        decoder_inputs_embeds: Optional[torch.Tensor] = None,
-        pixel_values: Optional[torch.FloatTensor] = None,
-        image_attention_mask: Optional[torch.Tensor] = None,
-        use_cache: Optional[bool] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-    ) -> Union[Tuple[torch.FloatTensor], Seq2SeqModelOutput]:
-        r"""
-        Returns:
-
-        Example:
-
-        ```python
-        >>> from transformers import T5Tokenizer, T5Model
-
-        >>> tokenizer = T5Tokenizer.from_pretrained("t5-small")
-        >>> model = T5Model.from_pretrained("t5-small")
-
-        >>> input_ids = tokenizer(
-        ...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
-        ... ).input_ids  # Batch size 1
-        >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1
-
-        >>> # preprocess: Prepend decoder_input_ids with start token which is pad token for T5Model.
-        >>> # This is not needed for torch's T5ForConditionalGeneration as it does this internally using labels arg.
-        >>> decoder_input_ids = model._shift_right(decoder_input_ids)
-
-        >>> # forward pass
-        >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
-        >>> last_hidden_states = outputs.last_hidden_state
-        ```"""
-        use_cache = use_cache if use_cache is not None else self.config.use_cache
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
-        # FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask
-        if head_mask is not None and decoder_head_mask is None:
-            if self.config.num_layers == self.config.num_decoder_layers:
-                warnings.warn(__HEAD_MASK_WARNING_MSG, FutureWarning)
-                decoder_head_mask = head_mask
-
-        # Encode if needed (training, first prediction pass)
-        if encoder_outputs is None:
-            encoder_outputs = self.encoder(
-                input_ids=input_ids,
-                attention_mask=attention_mask,
-                inputs_embeds=inputs_embeds,
-                pixel_values=pixel_values,
-                image_attention_mask=image_attention_mask,
-                head_mask=head_mask,
-                output_attentions=output_attentions,
-                output_hidden_states=output_hidden_states,
-                return_dict=return_dict,
-            )
-        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
-            encoder_outputs = BaseModelOutput(
-                last_hidden_state=encoder_outputs[0],
-                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
-                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
-            )
-
-        hidden_states = encoder_outputs[0]
-
-        # Set device for model parallelism
-        if self.model_parallel:
-            torch.cuda.set_device(self.decoder.first_device)
-            hidden_states = hidden_states.to(self.decoder.first_device)
-            if decoder_input_ids is not None:
-                decoder_input_ids = decoder_input_ids.to(self.decoder.first_device)
-            if attention_mask is not None:
-                attention_mask = attention_mask.to(self.decoder.first_device)
-            if decoder_attention_mask is not None:
-                decoder_attention_mask = decoder_attention_mask.to(self.decoder.first_device)
-
-        # Decode
-        decoder_outputs = self.decoder(
-            input_ids=decoder_input_ids,
-            attention_mask=decoder_attention_mask,
-            inputs_embeds=decoder_inputs_embeds,
-            past_key_values=past_key_values,
-            encoder_hidden_states=hidden_states,
-            encoder_attention_mask=attention_mask,
-            head_mask=decoder_head_mask,
-            cross_attn_head_mask=cross_attn_head_mask,
-            use_cache=use_cache,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
-        )
-
-        if not return_dict:
-            return decoder_outputs + encoder_outputs
-
-        return Seq2SeqModelOutput(
-            last_hidden_state=decoder_outputs.last_hidden_state,
-            past_key_values=decoder_outputs.past_key_values,
-            decoder_hidden_states=decoder_outputs.hidden_states,
-            decoder_attentions=decoder_outputs.attentions,
-            cross_attentions=decoder_outputs.cross_attentions,
-            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
-            encoder_hidden_states=encoder_outputs.hidden_states,
-            encoder_attentions=encoder_outputs.attentions,
-        )
-
-
-@add_start_docstrings("""T5 Model with a `language modeling` head on top.""", T5_START_DOCSTRING)
-class VT5ForConditionalGeneration(VT5PreTrainedModel):
-    _keys_to_ignore_on_load_missing = [
-        r"encoder.embed_tokens.weight",
-        r"decoder.embed_tokens.weight",
-        r"lm_head.weight",
-    ]
-    _keys_to_ignore_on_load_unexpected = [
-        r"decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight",
-    ]
-
-    def __init__(self, config: T5Config, vision_model=None):
-        super().__init__(config)
-        self.model_dim = config.d_model
-
-        self.shared = DecoupledEmbedding(
-            num_embeddings=config.vocab_size,
-            num_additional_embeddings=config.additional_vocab_size,
-            embedding_dim=config.d_model,
-            partially_freeze=config.freeze_text_layers,
-        )
-
-        encoder_config = copy.deepcopy(config)
-        encoder_config.is_decoder = False
-        encoder_config.use_cache = False
-        encoder_config.is_encoder_decoder = False
-        self.encoder = VT5Stack(encoder_config, self.shared, vision_model=vision_model)
-
-        decoder_config = copy.deepcopy(config)
-        decoder_config.is_decoder = True
-        decoder_config.is_encoder_decoder = False
-        decoder_config.num_layers = config.num_decoder_layers
-        self.decoder = VT5Stack(decoder_config, self.shared)
-
-        self.lm_head = DecoupledLinear(
-            in_features=config.d_model,
-            out_features=config.vocab_size,
-            out_additional_features=config.additional_vocab_size,
-            bias=False,
-            partially_freeze=config.freeze_lm_head,
-        )
-
-        # Initialize weights and apply final processing
-        self.post_init()
-
-        # Model parallel
-        self.model_parallel = False
-        self.device_map = None
-
-        self.freeze_relevant_params(config)
-
-    def freeze_relevant_params(self, config=None):
-        self.encoder.freeze_relevant_params(config)
-        self.decoder.freeze_relevant_params(config)
-
-    @add_start_docstrings(PARALLELIZE_DOCSTRING)
-    def parallelize(self, device_map=None):
-        self.device_map = (
-            get_device_map(len(self.encoder.block), range(torch.cuda.device_count()))
-            if device_map is None
-            else device_map
-        )
-        assert_device_map(self.device_map, len(self.encoder.block))
-        self.encoder.parallelize(self.device_map)
-        self.decoder.parallelize(self.device_map)
-        self.lm_head = self.lm_head.to(self.decoder.first_device)
-        self.model_parallel = True
-
-    @add_start_docstrings(DEPARALLELIZE_DOCSTRING)
-    def deparallelize(self):
-        self.encoder.deparallelize()
-        self.decoder.deparallelize()
-        self.encoder = self.encoder.to("cpu")
-        self.decoder = self.decoder.to("cpu")
-        self.lm_head = self.lm_head.to("cpu")
-        self.model_parallel = False
-        self.device_map = None
-        torch.cuda.empty_cache()
-
-    def get_input_embeddings(self):
-        return self.shared
-
-    def set_input_embeddings(self, new_embeddings):
-        self.shared = new_embeddings
-        self.encoder.set_input_embeddings(new_embeddings)
-        self.decoder.set_input_embeddings(new_embeddings)
-
-    def set_output_embeddings(self, new_embeddings):
-        self.lm_head = new_embeddings
-
-    def get_output_embeddings(self):
-        return self.lm_head
-
-    def tie_weights(self):
-        """
-        Overwrite `transformers.modeling_utils.PreTrainedModel.tie_weights` to handle the case of DecoupledLinear and DecoupledEmbedding.
-        """
-        output_embeddings = self.get_output_embeddings()
-        input_embeddings = self.get_input_embeddings()
-
-        if getattr(self.config, "tie_word_embeddings", True):
-            output_embeddings.weight = input_embeddings.weight
-            if input_embeddings.num_additional_embeddings > 0:
-                assert output_embeddings.out_additional_features == input_embeddings.num_additional_embeddings
-                output_embeddings.additional_fc.weight = input_embeddings.additional_embedding.weight
-
-        if hasattr(output_embeddings, "out_features") and hasattr(input_embeddings, "num_embeddings"):
-            output_embeddings.out_features = input_embeddings.num_embeddings
-            if hasattr(output_embeddings, "out_additional_features") and hasattr(
-                input_embeddings, "num_additional_embeddings"
-            ):
-                output_embeddings.out_additional_features = input_embeddings.num_additional_embeddings
-
-    def get_encoder(self):
-        return self.encoder
-
-    def get_decoder(self):
-        return self.decoder
-
-    @add_start_docstrings_to_model_forward(T5_INPUTS_DOCSTRING)
-    @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
-    def forward(
-        self,
-        input_ids: Optional[torch.LongTensor] = None,
-        attention_mask: Optional[torch.FloatTensor] = None,
-        decoder_input_ids: Optional[torch.LongTensor] = None,
-        decoder_attention_mask: Optional[torch.BoolTensor] = None,
-        head_mask: Optional[torch.FloatTensor] = None,
-        decoder_head_mask: Optional[torch.FloatTensor] = None,
-        cross_attn_head_mask: Optional[torch.Tensor] = None,
-        encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None,
-        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
-        pixel_values: Optional[torch.FloatTensor] = None,
-        image_attention_mask: Optional[torch.Tensor] = None,
-        labels: Optional[torch.LongTensor] = None,
-        use_cache: Optional[bool] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-    ) -> Union[Tuple[torch.FloatTensor], Seq2SeqLMOutput]:
-        r"""
-        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
-            Labels for computing the sequence classification/regression loss. Indices should be in `[-100, 0, ...,
-            config.vocab_size - 1]`. All labels set to `-100` are ignored (masked), the loss is only computed for
-            labels in `[0, ..., config.vocab_size]`
-
-        Returns:
-
-        Examples:
-
-        ```python
-        >>> from transformers import T5Tokenizer, T5ForConditionalGeneration
-
-        >>> tokenizer = T5Tokenizer.from_pretrained("t5-small")
-        >>> model = T5ForConditionalGeneration.from_pretrained("t5-small")
-
-        >>> # training
-        >>> input_ids = tokenizer("The <extra_id_0> walks in <extra_id_1> park", return_tensors="pt").input_ids
-        >>> labels = tokenizer("<extra_id_0> cute dog <extra_id_1> the <extra_id_2>", return_tensors="pt").input_ids
-        >>> outputs = model(input_ids=input_ids, labels=labels)
-        >>> loss = outputs.loss
-        >>> logits = outputs.logits
-
-        >>> # inference
-        >>> input_ids = tokenizer(
-        ...     "summarize: studies have shown that owning a dog is good for you", return_tensors="pt"
-        ... ).input_ids  # Batch size 1
-        >>> outputs = model.generate(input_ids)
-        >>> print(tokenizer.decode(outputs[0], skip_special_tokens=True))
-        >>> # studies have shown that owning a dog is good for you.
-        ```"""
-        use_cache = use_cache if use_cache is not None else self.config.use_cache
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
-        # FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask
-        if head_mask is not None and decoder_head_mask is None:
-            if self.config.num_layers == self.config.num_decoder_layers:
-                warnings.warn(__HEAD_MASK_WARNING_MSG, FutureWarning)
-                decoder_head_mask = head_mask
-
-        # Encode if needed (training, first prediction pass)
-        if encoder_outputs is None:
-            # Convert encoder inputs in embeddings if needed
-            encoder_outputs = self.encoder(
-                input_ids=input_ids,
-                attention_mask=attention_mask,
-                inputs_embeds=inputs_embeds,
-                pixel_values=pixel_values,
-                image_attention_mask=image_attention_mask,
-                head_mask=head_mask,
-                output_attentions=output_attentions,
-                output_hidden_states=output_hidden_states,
-                return_dict=return_dict,
-            )
-        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
-            encoder_outputs = BaseModelOutput(
-                last_hidden_state=encoder_outputs[0],
-                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
-                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
-            )
-
-        hidden_states = encoder_outputs[0]
-
-        if self.model_parallel:
-            torch.cuda.set_device(self.decoder.first_device)
-
-        if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None:
-            # get decoder inputs from shifting lm labels to the right
-            decoder_input_ids = self._shift_right(labels)
-
-        # Set device for model parallelism
-        if self.model_parallel:
-            torch.cuda.set_device(self.decoder.first_device)
-            hidden_states = hidden_states.to(self.decoder.first_device)
-            if decoder_input_ids is not None:
-                decoder_input_ids = decoder_input_ids.to(self.decoder.first_device)
-            if attention_mask is not None:
-                attention_mask = attention_mask.to(self.decoder.first_device)
-            if decoder_attention_mask is not None:
-                decoder_attention_mask = decoder_attention_mask.to(self.decoder.first_device)
-
-        # Decode
-        decoder_outputs = self.decoder(
-            input_ids=decoder_input_ids,
-            attention_mask=decoder_attention_mask,
-            inputs_embeds=decoder_inputs_embeds,
-            past_key_values=past_key_values,
-            encoder_hidden_states=hidden_states,
-            encoder_attention_mask=attention_mask,
-            head_mask=decoder_head_mask,
-            cross_attn_head_mask=cross_attn_head_mask,
-            use_cache=use_cache,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
-        )
-
-        sequence_output = decoder_outputs[0]
-
-        # Set device for model parallelism
-        if self.model_parallel:
-            torch.cuda.set_device(self.encoder.first_device)
-            self.lm_head = self.lm_head.to(self.encoder.first_device)
-            sequence_output = sequence_output.to(self.lm_head.weight.device)
-
-        if self.config.tie_word_embeddings:
-            # Rescale output before projecting on vocab
-            # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586
-            sequence_output = sequence_output * (self.model_dim**-0.5)
-
-        lm_logits = self.lm_head(sequence_output)
-
-        loss = None
-        if labels is not None:
-            loss_fct = CrossEntropyLoss(ignore_index=-100)
-            loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), labels.view(-1))
-            # TODO(thom): Add z_loss https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L666
-
-        if not return_dict:
-            output = (lm_logits,) + decoder_outputs[1:] + encoder_outputs
-            return ((loss,) + output) if loss is not None else output
-
-        return Seq2SeqLMOutput(
-            loss=loss,
-            logits=lm_logits,
-            past_key_values=decoder_outputs.past_key_values,
-            decoder_hidden_states=decoder_outputs.hidden_states,
-            decoder_attentions=decoder_outputs.attentions,
-            cross_attentions=decoder_outputs.cross_attentions,
-            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
-            encoder_hidden_states=encoder_outputs.hidden_states,
-            encoder_attentions=encoder_outputs.attentions,
-        )
-
-    def prepare_inputs_for_generation(
-        self,
-        input_ids,
-        past=None,
-        attention_mask=None,
-        head_mask=None,
-        decoder_head_mask=None,
-        cross_attn_head_mask=None,
-        use_cache=None,
-        encoder_outputs=None,
-        **kwargs,
-    ):
-        # cut decoder_input_ids if past is used
-        if past is not None:
-            input_ids = input_ids[:, -1:]
-
-        return {
-            "decoder_input_ids": input_ids,
-            "past_key_values": past,
-            "encoder_outputs": encoder_outputs,
-            "attention_mask": attention_mask,
-            "head_mask": head_mask,
-            "decoder_head_mask": decoder_head_mask,
-            "cross_attn_head_mask": cross_attn_head_mask,
-            "use_cache": use_cache,
-        }
-
-    def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
-        return self._shift_right(labels)
-
-    def _reorder_cache(self, past, beam_idx):
-        # if decoder past is not included in output
-        # speedy decoding is disabled and no need to reorder
-        if past is None:
-            logger.warning("You might want to consider setting `use_cache=True` to speed up decoding")
-            return past
-
-        reordered_decoder_past = ()
-        for layer_past_states in past:
-            # get the correct batch idx from layer past batch dim
-            # batch dim of `past` is at 2nd position
-            reordered_layer_past_states = ()
-            for layer_past_state in layer_past_states:
-                # need to set correct `past` for each of the four key / value states
-                reordered_layer_past_states = reordered_layer_past_states + (
-                    layer_past_state.index_select(0, beam_idx.to(layer_past_state.device)),
-                )
-
-            assert reordered_layer_past_states[0].shape == layer_past_states[0].shape
-            assert len(reordered_layer_past_states) == len(layer_past_states)
-
-            reordered_decoder_past = reordered_decoder_past + (reordered_layer_past_states,)
-        return reordered_decoder_past
-
-    def get_model_tflops_per_batch_per_gpu(self, hparams, data_param, tokenizer, max_num_images):
-        config_vl_model = self.config
-
-        vloom_embed_size = config_vl_model.d_model
-        vision_config = self.encoder.vision_model.config
-        num_language_layers = config_vl_model.num_layers
-        ffn_inner_size = config_vl_model.d_ff
-
-        # Get vision model blocks infos
-        vision_patch_size = vision_config.patch_size
-        vision_hidden_size = vision_config.hidden_size
-        num_vision_layers = vision_config.num_hidden_layers
-        # The +1 is for the CLS token
-        single_image_seq_len = (vision_config.image_size // vision_patch_size) ** 2 + 1
-        vision_exp_factor = vision_config.intermediate_size // vision_hidden_size
-
-        # Get language and cross-att blocks infos
-        num_cross_attn_layers = num_language_layers // config_vl_model.cross_layer_interval
-        language_seq_len = data_param.max_seq_len
-        language_exp_factor = (ffn_inner_size // vloom_embed_size) if ffn_inner_size is not None else 4
-        cross_att_exp_factor = (ffn_inner_size // vloom_embed_size) if ffn_inner_size is not None else 4
-
-        encoder_language_tflops_per_batch_per_gpu = compute_tflops_per_batch_per_gpu(
-            num_layers=num_language_layers,
-            batch_size=hparams.batch_size_per_gpu,
-            q_seq_len=language_seq_len,
-            k_seq_len=language_seq_len,
-            hidden_size=vloom_embed_size,
-            kv_in_dim=vloom_embed_size,
-            ff_exp_factor=language_exp_factor,
-            grad_acc_size=hparams.grad_acc_size,
-            vocab_size=None,
-            count_backward=True,  # Always True regardless of freezing, because gradients are computed for cross-attentions
-            use_grad_checkpointing=False,
-        )
-        image_gated_cross_attention_tflops_per_batch_per_gpu = compute_tflops_per_batch_per_gpu(
-            num_layers=num_cross_attn_layers,
-            batch_size=hparams.batch_size_per_gpu,
-            q_seq_len=language_seq_len,
-            k_seq_len=single_image_seq_len * max_num_images,
-            hidden_size=vloom_embed_size,
-            kv_in_dim=vision_hidden_size,
-            ff_exp_factor=cross_att_exp_factor,
-            grad_acc_size=hparams.grad_acc_size,
-            vocab_size=None,
-            count_backward=True,
-            use_grad_checkpointing=False,
-        )
-        vision_tflops_per_batch_per_gpu = compute_tflops_per_batch_per_gpu(
-            num_layers=num_vision_layers,
-            batch_size=hparams.batch_size_per_gpu * max_num_images,
-            q_seq_len=single_image_seq_len,
-            k_seq_len=single_image_seq_len,
-            hidden_size=vision_hidden_size,
-            kv_in_dim=vision_hidden_size,
-            ff_exp_factor=vision_exp_factor,
-            grad_acc_size=hparams.grad_acc_size,
-            vocab_size=None,
-            count_backward=not hparams.model_params["freeze_vision_layers"],
-            use_grad_checkpointing=False,
-        )
-
-        _, target_seq_len = random_spans_helper(
-            inputs_length=data_param.max_seq_len,
-            noise_density=data_param.t5_mlm_noise_density,
-            mean_noise_span_length=data_param.t5_mlm_mean_noise_span_length,
-            extra_tokens_per_span_inputs=1,
-            extra_tokens_per_span_targets=1,
-            verbose=False,
-        )
-
-        decoder_language_tflops_per_batch_per_gpu = compute_tflops_per_batch_per_gpu(
-            num_layers=config_vl_model.num_decoder_layers,
-            batch_size=hparams.batch_size_per_gpu,
-            q_seq_len=target_seq_len,
-            k_seq_len=target_seq_len,
-            hidden_size=vloom_embed_size,
-            kv_in_dim=vloom_embed_size,
-            ff_exp_factor=language_exp_factor,
-            grad_acc_size=hparams.grad_acc_size,
-            vocab_size=tokenizer.vocab_size,
-            count_backward=True,  # Always True regardless of freezing, because gradients are computed for cross-attentions
-            use_grad_checkpointing=False,
-        )
-
-        encoder_decoder_cross_attention_tflops_per_batch_per_gpu = compute_tflops_per_batch_per_gpu(
-            num_layers=config_vl_model.num_decoder_layers,
-            batch_size=hparams.batch_size_per_gpu,
-            q_seq_len=target_seq_len,
-            k_seq_len=language_seq_len,
-            hidden_size=vloom_embed_size,
-            kv_in_dim=vloom_embed_size,
-            ff_exp_factor=0,  # There is only one pair of expansion linear layers per pair of self attention and cross attention blocks
-            grad_acc_size=hparams.grad_acc_size,
-            vocab_size=None,
-            count_backward=True,
-            use_grad_checkpointing=False,
-        )
-        return (
-            encoder_language_tflops_per_batch_per_gpu
-            + image_gated_cross_attention_tflops_per_batch_per_gpu
-            + vision_tflops_per_batch_per_gpu
-            + decoder_language_tflops_per_batch_per_gpu
-            + encoder_decoder_cross_attention_tflops_per_batch_per_gpu
-        )
diff --git a/m4/models/zero_checkpoint_to_hf.py b/m4/models/zero_checkpoint_to_hf.py
deleted file mode 100755
index b48820a4df7b5aee3d37746ef6346816bee8be37..0000000000000000000000000000000000000000
--- a/m4/models/zero_checkpoint_to_hf.py
+++ /dev/null
@@ -1,87 +0,0 @@
-#!/usr/bin/env python
-
-# This script combines the 2 steps of
-# 1. calling zero_to_fp32.py to reconsolidate the shared deepspeed checkpoint
-# 2. then resaving it as HF checkpoint, which also takes care of sharding large checkpoints
-#
-# example usage:
-#
-# this will generate the converted checkpoint under save_dir/opt_step-40/unwrapped_model
-#
-# ./m4/models/zero_checkpoint_to_hf.py save_dir/opt_step-40
-#
-# or you can override the destination by passing an explicit target dir, e.g.:
-#
-# ./m4/models/zero_checkpoint_to_hf.py save_dir/opt_step-40 save_dir/opt_step-40/output_dir
-
-import argparse
-import sys
-from pathlib import Path
-
-import torch
-from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
-
-
-# auto-append the repo path to load m4 modules from instead of needing to set PYTHONPATH
-repodir = str(Path(__file__).resolve().parents[2])
-sys.path.insert(0, repodir)
-
-import m4.models
-from m4.testing_utils import read_json_file
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "checkpoint_dir", type=str, help="path to the desired checkpoint folder, e.g., path/to/opt_step-100"
-    )
-    parser.add_argument(
-        "output_dir",
-        type=str,
-        nargs="?",
-        help="path to pass to save_pretrained, defaults to 'unwrapped_model' relative to the checkpoint_dir argument",
-    )
-    args = parser.parse_args()
-
-    checkpoint_dir = Path(args.checkpoint_dir)
-    config_dir = checkpoint_dir / "unwrapped_model"
-    ds_checkpoint_dir = checkpoint_dir / "accelerator_state"
-    config_file_path = config_dir / "config.json"
-
-    if args.output_dir is None:
-        output_dir = checkpoint_dir / "unwrapped_model"
-    else:
-        output_dir = args.output_dir
-
-    config = read_json_file(config_file_path)
-    config_class = m4.models._SUPPORTED_MODELS.get(config["model_type"], None)
-    if config_class is None:
-        raise ValueError(f"{config['model_type']=} isn't supported by m4")
-    modeling_class = m4.models.model_type_to_modeling_class.get(config["model_type"], None)
-
-    print(f"Detected {config_class}")
-
-    print("Reconsolidating fp32 model from checkpoint shards (can take a long time)")
-    state_dict = get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir)  # already on cpu
-
-    # Keeping debug to use if you ever need to debug state dict
-    # print("Saved State Dict")
-    # for k, v in state_dict.items():
-    #     print(f"{k} {v.shape}")
-
-    kwargs = {}
-    print(f"Loading config from {config_dir}")
-    model_config = config_class.from_pretrained(config_dir)
-
-    print(f"Instantiating a {modeling_class} model in bf16")
-    model = modeling_class.from_pretrained(
-        None, config=model_config, state_dict=state_dict, torch_dtype=torch.bfloat16
-    )
-
-    # Keeping debug to use if you ever need to debug state dict
-    # print("Model State Dict")
-    # for k, v in model.state_dict().items():
-    #     print(f"{k} {v.shape}")
-
-    print(f"Saving model to {output_dir}")
-    model.save_pretrained(output_dir)
diff --git a/m4/scripts/cleanup-checkpoints.py b/m4/scripts/cleanup-checkpoints.py
deleted file mode 100755
index e7333e8b74d4555776bff3b3951deec6c61a1c0f..0000000000000000000000000000000000000000
--- a/m4/scripts/cleanup-checkpoints.py
+++ /dev/null
@@ -1,156 +0,0 @@
-#!/usr/bin/env python
-
-#
-# This tool deletes checkpoints found at given path that are no longer needed
-#
-# we have 2 parts to each checkpoints to cleanup
-#
-# 1. the original deepspeed checkpoint
-# 2. the converted hf checkpoint
-#
-# we will start with a combined requirement for eval to be completed and s3 synced to nuke the checkpoint
-#
-# Example:
-#
-# ./cleanup-checkpoints.py checkpoints-path
-#
-# Use `-h` for more options
-
-import argparse
-import shutil  # noqa
-import subprocess
-import sys
-import time
-from pathlib import Path
-
-
-repo_path = Path(__file__).parents[2]
-
-# we have to deal with potentially overlapping slurm jobs running on different nodes, so we can't
-# rely on PIDs of a running process. Will use a control file instead as the filesystem is shared.
-#
-# If that file is there it means:
-#
-# 1. either the cleanup is still running
-# 2. the cleanup got aborted (e.g. cpu-oom)
-#
-# to detect aborted cleanups we will check if the control file is older than a reasonable time to perform such a cleanup
-control_file_name = "started-cleanup-checkpoint"
-finished_uploading_file_name = "finished-upload-checkpoint"
-# should fine tune - but surely 1h per checkpoint is plenty
-reasonable_cleanup_time_in_secs = 1 * 60 * 60
-
-
-def run_cmd(cmd, check=True):
-    try:
-        response = subprocess.run(
-            cmd,
-            stderr=subprocess.PIPE,
-            stdout=subprocess.PIPE,
-            check=check,
-            encoding="utf-8",
-        ).stdout.strip()
-    except subprocess.CalledProcessError as exc:
-        raise EnvironmentError(exc.stderr)
-
-    return response
-
-
-def get_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("checkpoints_path", type=str, help="base dir with checkpoints")
-    parser.add_argument("--skip-evals-check", action="store_true", help="skip evals done checks")
-    return parser.parse_args()
-
-
-def exit(msg):
-    print(msg)
-    sys.exit()
-
-
-def should_process(path, control_file_path, args):
-    """Heuristics to decide whether to cleanup this opt_step-XXX checkpoint or not"""
-
-    s3_completed_path = path / finished_uploading_file_name
-    eval_completed_paths = [
-        path / "run_evals_0_shots_done",
-        path / "run_evals_4_shots_done",
-        path / "run_evals_perplexity_validation_done",
-        path / "run_evals_0_shots_a_la_flamingo_done",
-    ]
-
-    # check s3 sync is completed
-    if not s3_completed_path.exists():
-        print(f"[N] {path} hasn't been synced to s3 yet. Skipping")
-        return False
-
-    # check evals are completed
-    if not args.skip_evals_check:
-        for eval_path in eval_completed_paths:
-            if not eval_path.exists():
-                print(f"[N] {path} hasn't been evaled yet. Skipping")
-                return False
-
-    # complicated checks - has another job already started processing? or did it crash?
-    if control_file_path.exists():
-        if control_file_path.stat().st_mtime < time.time() - reasonable_cleanup_time_in_secs:
-            print(f"[Y] {path} looks stale - probably aborted cleanup job. Deleting")
-            return True
-        else:
-            print(
-                f"[N] {path} either another job is doing the cleanup or less than"
-                f" {reasonable_cleanup_time_in_secs} secs has passed since it was launched. Skipping"
-            )
-            return False
-    else:
-        print(f"[Y] {path} completed s3 sync + eval. Deleting")
-        return True
-
-
-def main():
-    args = get_args()
-
-    checkpoints_path = Path(args.checkpoints_path)
-    if not (checkpoints_path.exists() and checkpoints_path.is_dir()):
-        raise FileNotFoundError(f"can't find a directory '{checkpoints_path}'")
-
-    checkpoint_dirs = list(checkpoints_path.glob("opt_step-*"))
-    if len(checkpoint_dirs) == 0:
-        exit("No checkpoints found, exiting")
-
-    # Check each checkpoint folder in real time to allow for overlapping jobs starting at different times
-    # Additionally do not delete the last 2 checkpoints
-    #
-    # sort numerically to sort correctly different number of digits: opt_step-10, opt_step-100
-    checkpoint_dirs_sorted = sorted(checkpoint_dirs, key=lambda x: int(str(x).split("-")[-1]))
-    for i, checkpoint_dir in enumerate(checkpoint_dirs_sorted):
-        print(f"\n*** Checking {checkpoint_dir}")
-
-        if i + 1 == len(checkpoint_dirs_sorted):
-            print(f"[N] {checkpoint_dir} is a last checkpoint. Skipping")
-            continue
-
-        if i + 2 == len(checkpoint_dirs_sorted):
-            print(f"[N] {checkpoint_dir} is a second to last checkpoint. Skipping")
-            continue
-
-        control_file_path = checkpoint_dir / "unwrapped_model" / control_file_name
-
-        if not should_process(checkpoint_dir, control_file_path, args):
-            continue
-
-        print(f"Launching cleanup for {checkpoint_dir}")
-        # we could use flock here, to avoid a race condition, but it'd be pointless since each
-        # cronjob is likely to run on a different node and flock only works within a single node
-        control_file_path.touch()
-
-        # cleanup
-        # XXX: enable the actual delete once tested a lot
-        # The delete should be relatively safe since it'll only run if it finds 2 files:
-        # save_dir/opt_step-XXX/s3_sync_is_completed save_dir/opt_step-XXX/eval_is_completed
-        shutil.rmtree(checkpoint_dir, ignore_errors=True)
-        print(f"Checkpoint {checkpoint_dir} deleted")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/m4/scripts/convert-checkpoints.py b/m4/scripts/convert-checkpoints.py
deleted file mode 100755
index a7a10f663e5b766115bd8ea7a9948c30d2f94b06..0000000000000000000000000000000000000000
--- a/m4/scripts/convert-checkpoints.py
+++ /dev/null
@@ -1,124 +0,0 @@
-#!/usr/bin/env python
-
-#
-# This tool converts any deepspeed checkpoints found at given path to hf format
-#
-# Example:
-#
-# ./convert-checkpoints.py checkpoints-path
-#
-
-import argparse
-import subprocess
-import sys
-import time
-from pathlib import Path
-
-
-repo_path = Path(__file__).parents[2]
-zero_checkpoint_to_hf_path = repo_path / "m4/models/zero_checkpoint_to_hf.py"
-
-# we have to deal with potentially overlapping slurm jobs running on different nodes, so we can't
-# rely on PIDs of a running process. Will use a control file instead as the filesystem is shared.
-#
-# If that file is there it means:
-#
-# 1. either the conversion is still running
-# 2. the conversion got aborted (e.g. cpu-oom)
-#
-# to detect aborted conversions we will check if the control file is older than a reasonable time to perform such a conversion
-control_file_name = "started-convert-checkpoint"
-# should fine tune - but surely 2h per checkpoint is plenty
-reasonable_conversion_time_in_secs = 2 * 60 * 60
-
-
-def run_cmd(cmd, check=True):
-    try:
-        response = subprocess.run(
-            cmd,
-            stderr=subprocess.PIPE,
-            stdout=subprocess.PIPE,
-            check=check,
-            encoding="utf-8",
-        ).stdout.strip()
-    except subprocess.CalledProcessError as exc:
-        raise EnvironmentError(exc.stderr)
-
-    return response
-
-
-def get_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("checkpoints_path", type=str, help="base dir with checkpoints")
-    parser.add_argument("-f", "--force", action="store_true", help="force rebuilding of all checkpoints")
-    return parser.parse_args()
-
-
-def exit(msg):
-    print(msg)
-    sys.exit()
-
-
-def should_process(path, force, control_file_path):
-    """Heuristics to decide whether to convert this opt_step-XXX checkpoint or not"""
-
-    target_dir = path / "unwrapped_model"
-
-    # easy checks - the conversion is clearly completed
-    if (target_dir / "pytorch_model.bin").exists() or (target_dir / "pytorch_model.bin.index.json").exists():
-        print(f"[N] {path} appears to be already converted. Skipping")
-        return False
-
-    if force:
-        print("[Y] Forced to re-convert {checkpoint_dir}")
-        return True
-
-    # complicated checks - has another job already started processing? or did it crash?
-    control_file_path = target_dir / control_file_name
-    if control_file_path.exists():
-        if control_file_path.stat().st_mtime < time.time() - reasonable_conversion_time_in_secs:
-            print(f"[Y] {path} looks stale - probably aborted job. Re-converting")
-            return True
-        else:
-            print(
-                f"[N] {path} either another job is converting it or less than"
-                f" {reasonable_conversion_time_in_secs} secs has passed since it was launched. Skipping"
-            )
-            return False
-    else:
-        print(f"[Y] {path} is a new checkpoint. Converting")
-        return True
-
-
-def main():
-    args = get_args()
-
-    checkpoints_path = Path(args.checkpoints_path)
-    if not (checkpoints_path.exists() and checkpoints_path.is_dir()):
-        raise FileNotFoundError(f"can't find a directory '{checkpoints_path}'")
-
-    checkpoint_dirs = list(checkpoints_path.glob("opt_step-*"))
-    if len(checkpoint_dirs) == 0:
-        exit("No checkpoints found, exiting")
-
-    # Check each folder in real time to allow for overlapping jobs starting at different times
-    for checkpoint_dir in checkpoint_dirs:
-        print(f"\n*** Checking {checkpoint_dir}")
-
-        control_file_path = checkpoint_dir / "unwrapped_model" / control_file_name
-
-        if not should_process(checkpoint_dir, args.force, control_file_path):
-            continue
-
-        print(f"Launching conversion for {checkpoint_dir} - it could take a long time")
-        cmd = [zero_checkpoint_to_hf_path, checkpoint_dir]
-        # we could use flock here, to avoid a race condition, but it'd be pointless since each
-        # cronjob is likely to run on a different node and flock only works within a single node
-        control_file_path.touch()
-        response = run_cmd(cmd)
-        control_file_path.unlink()
-        print(response)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/m4/scripts/s3-upload-checkpoints.py b/m4/scripts/s3-upload-checkpoints.py
deleted file mode 100755
index f536132ddecebab484b44fa830f4cc148f526f5d..0000000000000000000000000000000000000000
--- a/m4/scripts/s3-upload-checkpoints.py
+++ /dev/null
@@ -1,194 +0,0 @@
-#!/usr/bin/env python
-
-#
-# This tool uploads any new deepspeed checkpoints found at given path to s3 (and also various non-checkpoint files, like logs)
-#
-# Example:
-#
-# ./s3-upload-checkpoints.py checkpoints-path
-#
-# Use `-h` for more options
-#
-
-
-import argparse
-import subprocess
-import sys
-import time
-from pathlib import Path
-
-
-repo_path = Path(__file__).resolve().parents[2]
-zero_checkpoint_to_hf_path = repo_path / "m4/models/zero_checkpoint_to_hf.py"
-
-RETRIES = 5
-
-# what dir/file glob patterns to include in the upload besides checkpoints
-include_patterns = ["tb_run_*", "logs", "config.yaml"]
-
-
-# we have to deal with potentially overlapping slurm jobs running on different nodes, so we can't
-# rely on PIDs of a running process. Will use a control file instead as the filesystem is shared.
-#
-# If that file is there it means:
-#
-# 1. either the upload is still running
-# 2. the upload got aborted (e.g. cpu-oom)
-#
-# to detect aborted uploads we will check if the control file is older than a reasonable time to perform such a upload
-control_file_name = "started-upload-checkpoint"
-finished_uploading_file_name = "finished-upload-checkpoint"
-# should fine tune - but surely 2h per checkpoint is plenty
-reasonable_upload_time_in_secs = 2 * 60 * 60
-
-
-def run_cmd(cmd, check=True):
-    try:
-        response = subprocess.run(
-            cmd,
-            stderr=subprocess.PIPE,
-            stdout=subprocess.PIPE,
-            check=check,
-            encoding="utf-8",
-        ).stdout.strip()
-    except subprocess.CalledProcessError as exc:
-        raise EnvironmentError(exc.stderr)
-
-    return response
-
-
-def get_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("checkpoints_path", type=str, help="base dir with checkpoints")
-    # parser.add_argument("experiment_name", type=str, help="experiment name as a s3 sub-dir")
-    parser.add_argument("-f", "--force", action="store_true", help="force uploading of all checkpoints")
-    parser.add_argument(
-        "--skip-conversion-check", action="store_true", help="skip checkpoint conversion is done check"
-    )
-    return parser.parse_args()
-
-
-def exit(msg):
-    print(msg)
-    sys.exit()
-
-
-def should_process(path, force, control_file_path, finished_uploading_file_path, args):
-    """Heuristics to decide whether to upload this opt_step-XXX checkpoint or not"""
-
-    # check if checkpoint is fully saved
-    finished_saving_path = path / "finished-saving"  # defined in from trainer.py
-    if not finished_saving_path.exists():
-        print(f"[N] {path} isn't finished saving. Skipping")
-        return False
-
-    if force:
-        print("[Y] Forced to re-process {checkpoint_dir}")
-        return True
-
-    # check if already uploaded
-    if finished_uploading_file_path.exists():
-        print(f"[N] {path} has already been uploaded. Skipping")
-        return False
-
-    # check conversion is completed
-    if not args.skip_conversion_check:
-        converted_model_path_1 = path / "unwrapped_model" / "pytorch_model.bin.index.json"
-        converted_model_path_2 = path / "unwrapped_model" / "pytorch_model.bin"
-        if not converted_model_path_1.exists() and not converted_model_path_2.exists():
-            print(f"[N] {path} doesn't have a converted model. Skipping")
-            return False
-
-    # complicated checks - has another job already started uploading? or did it crash?
-    if control_file_path.exists():
-        if control_file_path.stat().st_mtime < time.time() - reasonable_upload_time_in_secs:
-            print(f"[Y] {path} looks stale - probably aborted job. Re-uploading")
-            return True
-        else:
-            print(
-                f"[N] {path} either another job is uploading it or less than"
-                f" {reasonable_upload_time_in_secs} secs has passed since it was launched. Skipping"
-            )
-            return False
-    else:
-        print(f"[Y] {path} is a new checkpoint. Uploading")
-        return True
-
-
-def main():
-    args = get_args()
-
-    checkpoints_path = Path(args.checkpoints_path)
-    if not (checkpoints_path.exists() and checkpoints_path.is_dir()):
-        raise FileNotFoundError(f"can't find a directory '{checkpoints_path}'")
-
-    checkpoint_dirs = list(checkpoints_path.glob("opt_step-*"))
-    if len(checkpoint_dirs) == 0:
-        exit("No checkpoints found, exiting")
-
-    exp_name = checkpoints_path.name
-
-    # Check each folder in real time to allow for overlapping jobs starting at different times
-    for checkpoint_dir in checkpoint_dirs:
-        print(f"\n*** Checking {checkpoint_dir}")
-
-        control_file_path = checkpoint_dir / control_file_name
-        finished_uploading_file_path = checkpoint_dir / finished_uploading_file_name
-
-        if not should_process(checkpoint_dir, args.force, control_file_path, finished_uploading_file_path, args):
-            continue
-
-        opt_step = checkpoint_dir.name
-        bucket_name = "m4-exps"
-        bucket_path = f"{exp_name}/{opt_step}"
-
-        print(f"Launching upload for {checkpoint_dir} - it could take a long time")
-        cmd = f"s5cmd sync {checkpoint_dir}/ s3://{bucket_name}/{bucket_path}/".split()
-        # we could use flock here, to avoid a race condition, but it'd be pointless since each
-        # cronjob is likely to run on a different node and flock only works within a single node
-        control_file_path.touch()
-        # print(f"mock running {cmd}")
-
-        # s5cmd will fail with an error like this when MD5 checksum doesn't match on upload (it won't retry)
-        # ERROR "cp data4.tar s3://m4-datasets/cm4-test/data4.tar": InvalidDigest: The Content-MD5
-        # you specified was invalid. status code: 400, request id: SZEHBJ4QQ33JSMH7, host id:
-        # XTeMYKd2KECiVKbFnwVbXo3LgnuA2OHWk5S+tHKAOKO95Os/pje2ZEbCfO5pojQtCTFOovvnVME=
-
-        tries = 0
-        while tries < RETRIES:
-            tries += 1
-            try:
-                response = run_cmd(cmd)
-                print(response)
-                break
-            except EnvironmentError as e:
-                if "InvalidDigest" in str(e):
-                    print(f"MD5 checksum failed, upload retry {tries}")
-                    continue
-            except Exception:
-                # some other possible failure?
-                raise
-
-        # for now disable this as large files don't have sha256 checksums
-        # result = integrity_check_recursive(checkpoint_dir, bucket_name, bucket_path)
-        # print(f"Integrity check was {result}")
-
-        control_file_path.unlink()
-        finished_uploading_file_path.touch()
-
-    # now upload non-checkpoint files
-    print("\n*** Uploading non-checkpoint files")
-    upload_dirs = []
-    for pat in include_patterns:
-        upload_dirs += list(checkpoints_path.glob(pat))
-
-    for dir in upload_dirs:
-        print(f"Launching upload for {dir}")
-        cmd = f"s5cmd sync {dir} s3://m4-exps/{exp_name}/".split()
-        print(f"running {cmd}")
-        response = run_cmd(cmd)
-        print(response)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/m4/scripts/s3_checkpoint_download_convert_upload.py b/m4/scripts/s3_checkpoint_download_convert_upload.py
deleted file mode 100644
index 2ba16a7796c0fc9647cad9b51ba1a2b1eff47fa6..0000000000000000000000000000000000000000
--- a/m4/scripts/s3_checkpoint_download_convert_upload.py
+++ /dev/null
@@ -1,171 +0,0 @@
-#!/usr/bin/env python
-
-#
-# This tool converts any deepspeed checkpoints found at given path to hf format
-#
-# Example:
-#
-# ./convert-checkpoints.py checkpoints-path
-#
-
-import argparse
-import subprocess
-import sys
-from pathlib import Path
-
-import boto3
-
-
-def check_s3_directory(directory_path):
-    s3 = boto3.client("s3")
-
-    # Add a trailing slash to the directory path
-    if not directory_path.endswith("/"):
-        directory_path += "/"
-
-    # Check if any objects exist with the given directory prefix
-    response = s3.list_objects_v2(Bucket="m4-exps", Prefix=directory_path)
-
-    # If any objects are found, the directory exists
-    if "Contents" in response:
-        return True
-
-    return False
-
-
-def check_s3_file(file_key):
-    s3 = boto3.client("s3")
-
-    try:
-        s3.head_object(Bucket="m4-exps", Key=file_key)
-        return True
-    except Exception:
-        return False
-
-
-def run_cmd(cmd, check=True):
-    try:
-        response = subprocess.run(
-            cmd,
-            stderr=subprocess.PIPE,
-            stdout=subprocess.PIPE,
-            check=check,
-            encoding="utf-8",
-        ).stdout.strip()
-    except subprocess.CalledProcessError as exc:
-        raise EnvironmentError(exc.stderr)
-
-    return response
-
-
-def get_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("run_name", type=str, help="run name")
-    parser.add_argument("opt_step_num_list", nargs="+", help="list of opt-steps to download")
-    parser.add_argument("repo_path", type=str, help="repo path")
-
-    parser.add_argument("-f", "--force", action="store_true", help="force rebuilding of all checkpoints")
-    return parser.parse_args()
-
-
-def exit(msg):
-    print(msg)
-    sys.exit()
-
-
-def cmd_retry_loop(cmd, max_retries=5):
-    # s5cmd will fail with an error like this when MD5 checksum doesn't match on upload (it won't retry)
-    # ERROR "cp data4.tar s3://m4-datasets/cm4-test/data4.tar": InvalidDigest: The Content-MD5
-    # you specified was invalid. status code: 400, request id: SZEHBJ4QQ33JSMH7, host id:
-    # XTeMYKd2KECiVKbFnwVbXo3LgnuA2OHWk5S+tHKAOKO95Os/pje2ZEbCfO5pojQtCTFOovvnVME=
-
-    tries = 0
-    while tries < max_retries:
-        tries += 1
-        try:
-            response = run_cmd(cmd)
-            print(response)
-            break
-        except EnvironmentError as e:
-            if "InvalidDigest" in str(e):
-                print(f"MD5 checksum failed, download retry {tries}")
-                continue
-        except Exception:
-            # some other possible failure?
-            raise
-    return response
-
-
-def main():
-    args = get_args()
-
-    run_name = args.run_name
-    opt_step_num_list = args.opt_step_num_list
-    repo_path = Path(args.repo_path)
-    zero_checkpoint_to_hf_path = repo_path / "m4/models/zero_checkpoint_to_hf.py"
-    bucket_name = "m4-exps"
-    opt_step_s3_file_keys = [f"{run_name}/opt_step-{opt_step_num}" for opt_step_num in opt_step_num_list]
-
-    check_s3_directory(run_name)
-
-    # Check each folder in real time to allow for overlapping jobs starting at different times
-    for opt_step_s3_file_key in opt_step_s3_file_keys:
-        print(f"\n*** Checking {opt_step_s3_file_key}")
-        if not check_s3_directory(opt_step_s3_file_key):
-            print(f"The checkpoint {opt_step_s3_file_key} does not exist - skipping")
-            continue
-        unwrapped_model_s3_file_key = f"{opt_step_s3_file_key}/unwrapped_model"
-        bin_s3_file_key = f"{unwrapped_model_s3_file_key}/pytorch_model.bin"
-        index_s3_file_key = f"{unwrapped_model_s3_file_key}/pytorch_model.bin.index.json"
-        is_not_converted = not check_s3_file(bin_s3_file_key) and not check_s3_file(index_s3_file_key)
-        if is_not_converted:
-            print(
-                f"The checkpoint hasn't been converted, launching download for {opt_step_s3_file_key} - it could take"
-                " a long time"
-            )
-
-            opt_step_dirname = opt_step_s3_file_key.split("/")[-1]
-            cluster_opt_step_dir = f"/fsx/m4/experiments/local_experiment_dir/s3_async_temporary_checkpoint_folder/{run_name}/{opt_step_dirname}"
-            cmd = f"s5cmd sync s3://{bucket_name}/{opt_step_s3_file_key}/* {cluster_opt_step_dir}".split()
-            download_response_opt_step_dir = cmd_retry_loop(cmd, max_retries=5)
-            print(f"download_response_opt_step_dir: {download_response_opt_step_dir}")
-        else:
-            print(
-                "The checkpoint has been converted already, downloading only the unwrapped checkpoint and"
-                " tokenizer dir"
-            )
-            opt_step_dirname = opt_step_s3_file_key.split("/")[-1]
-            cluster_opt_step_dir = f"/fsx/m4/experiments/local_experiment_dir/s3_async_temporary_checkpoint_folder/{run_name}/{opt_step_dirname}"
-            unwrapped_model_dir = f"{cluster_opt_step_dir}/unwrapped_model"
-            tokenizer_dir = f"{cluster_opt_step_dir}/tokenizer"
-            cmd_model = (
-                f"s5cmd sync s3://{bucket_name}/{opt_step_s3_file_key}/unwrapped_model/* {unwrapped_model_dir}".split()
-            )
-            cmd_tokenizer = f"s5cmd sync s3://{bucket_name}/{opt_step_s3_file_key}/tokenizer/* {tokenizer_dir}".split()
-            download_response_model = cmd_retry_loop(cmd_model, max_retries=5)
-            print(f"download_response_model: {download_response_model}")
-            download_response_tokenizer = cmd_retry_loop(cmd_tokenizer, max_retries=5)
-            print(f"download_response_tokenizer: {download_response_tokenizer}")
-
-        print(f"opt_step_dirname: {opt_step_dirname} downloaded to cluster_opt_step_dir: {cluster_opt_step_dir}")
-
-        if is_not_converted:
-            print(f"Converting {cluster_opt_step_dir}")
-            convert_cmd = [zero_checkpoint_to_hf_path, cluster_opt_step_dir]
-            conversion_response = run_cmd(convert_cmd)
-            print(f"conversion_response: {conversion_response}")
-            print(f"upload converted checkpoint: {cluster_opt_step_dir}")
-            upload_cmd = (
-                f"s5cmd sync {cluster_opt_step_dir}/unwrapped_model/"
-                f" s3://{bucket_name}/{opt_step_s3_file_key}/unwrapped_model/ ".split()
-            )
-            upload_response = cmd_retry_loop(upload_cmd, max_retries=5)
-            print(f"upload_response: {upload_response}")
-            print(
-                f"Uploaded {cluster_opt_step_dir}/unwrapped_model to"
-                f" s3://{bucket_name}/{opt_step_s3_file_key}/unwrapped_model"
-            )
-
-
-if __name__ == "__main__":
-    main()
diff --git a/m4/scripts/s3_checkpoint_download_convert_upload.slurm b/m4/scripts/s3_checkpoint_download_convert_upload.slurm
deleted file mode 100644
index 0b2e019bd111f373f494f6e11a41d1dd49d14f22..0000000000000000000000000000000000000000
--- a/m4/scripts/s3_checkpoint_download_convert_upload.slurm
+++ /dev/null
@@ -1,51 +0,0 @@
-#!/bin/bash
-#SBATCH --job-name=tr_test-s3-download-and-convert-checkpoints
-#SBATCH --ntasks=1
-#SBATCH --nodes=1
-#SBATCH --time=3:00:00
-#SBATCH --partition=production-cluster
-#SBATCH --output=/fsx/m4/experiments/local_experiment_dir/s3_async_temporary_checkpoint_folder/logs/%x-%j.out
-
-
-set -e
-
-# ----------------- Auto-Workdir -----------------
-if [ -n $SLURM_JOB_ID ];  then
-    # check the original location through scontrol and $SLURM_JOB_ID
-    SCRIPT_PATH=$(scontrol show job $SLURM_JOB_ID | awk -F= '/Command=/{print $2}')
-else
-    # otherwise: started with bash. Get the real location.
-    SCRIPT_PATH=$(realpath $0)
-fi
-SCRIPT_DIR=$(dirname ${SCRIPT_PATH})
-M4_REPO_PATH=$(builtin cd $SCRIPT_DIR/../../; pwd)
-
-# --------------------------------------------------
-
-### EDIT ME START ###
-
-CONDA_ENV_NAME=shared-m4
-
-EXPERIMENT_NAME=tr_194_laion_cm4_mix
-
-opt_step_num_list=(
-   "1000"
-   "2000"
-)
-
-### EDIT ME END ###
-
-
-echo "START TIME: $(date)"
-
-source /fsx/m4/start-m4-user
-conda activate base
-conda activate $CONDA_ENV_NAME
-pushd $M4_REPO_PATH
-export PYTHONPATH=$WORKING_DIR:$PYTHONPATH
-
-echo "running checkpoint download, convert, upload for opt-steps: ${opt_step_num_list[@]} of experiment: $EXPERIMENT_NAME"
-
-python $M4_REPO_PATH/m4/scripts/s3_checkpoint_download_convert_upload.py $EXPERIMENT_NAME ${opt_step_num_list[@]} $M4_REPO_PATH
-
-echo "END TIME: $(date)"
diff --git a/m4/scripts/s3_downloaded_checkpoints_cleanup.slurm b/m4/scripts/s3_downloaded_checkpoints_cleanup.slurm
deleted file mode 100644
index 08bf7f4514016b2282d798902c6305947cb2a8b0..0000000000000000000000000000000000000000
--- a/m4/scripts/s3_downloaded_checkpoints_cleanup.slurm
+++ /dev/null
@@ -1,54 +0,0 @@
-#!/bin/bash
-#SBATCH --job-name=tr_test-s3-cleanup-checkpoints
-#SBATCH --ntasks=1
-#SBATCH --nodes=1
-#SBATCH --time=3:00:00
-#SBATCH --partition=production-cluster
-#SBATCH --output=/fsx/m4/experiments/local_experiment_dir/s3_async_temporary_checkpoint_folder/logs/%x-%j.out
-
-
-set -e
-
-# ----------------- Auto-Workdir -----------------
-if [ -n $SLURM_JOB_ID ];  then
-    # check the original location through scontrol and $SLURM_JOB_ID
-    SCRIPT_PATH=$(scontrol show job $SLURM_JOB_ID | awk -F= '/Command=/{print $2}')
-else
-    # otherwise: started with bash. Get the real location.
-    SCRIPT_PATH=$(realpath $0)
-fi
-SCRIPT_DIR=$(dirname ${SCRIPT_PATH})
-M4_REPO_PATH=$(builtin cd $SCRIPT_DIR/../../; pwd)
-
-# --------------------------------------------------
-
-### EDIT ME START ###
-
-CONDA_ENV_NAME=shared-m4
-
-EXPERIMENT_NAME=tr_194_laion_cm4_mix
-
-opt_step_num_list=(
-   "1000"
-   "2000"
-)
-
-### EDIT ME END ###
-
-
-echo "START TIME: $(date)"
-
-source /fsx/m4/start-m4-user
-conda activate base
-conda activate $CONDA_ENV_NAME
-pushd $M4_REPO_PATH
-export PYTHONPATH=$WORKING_DIR:$PYTHONPATH
-
-for opt_step_num in ${opt_step_num_list[@]}
-do
-    OPT_STEP_DIR="/fsx/m4/experiments/local_experiment_dir/s3_async_temporary_checkpoint_folder/${EXPERIMENT_NAME}/opt_step-${opt_step_num}"
-    rm -r $OPT_STEP_DIR
-    echo "Deleted $OPT_STEP_DIR of experiment: $EXPERIMENT_NAME"
-done
-
-echo "END TIME: $(date)"
diff --git a/m4/scripts/schedule-evals.py b/m4/scripts/schedule-evals.py
deleted file mode 100755
index b6380b0912cc6c78706876961211c7d368904544..0000000000000000000000000000000000000000
--- a/m4/scripts/schedule-evals.py
+++ /dev/null
@@ -1,87 +0,0 @@
-#!/usr/bin/env python
-
-#
-# This tool checks if evaluation is needed
-#
-
-import argparse
-import os
-import subprocess
-import sys
-import time
-from pathlib import Path
-
-
-repo_path = Path(__file__).parents[2]
-
-# we have to deal with potentially overlapping slurm jobs running on different nodes, so we can't
-# rely on PIDs of a running process. Will use a control file instead as the filesystem is shared.
-#
-# If that file is there it means:
-#
-# 1. either the eval is still running
-# 2. the eval got aborted (e.g. gpu-oom)
-#
-
-# should fine tune - but surely 9h per checkpoint is plenty
-reasonable_eval_time_in_secs = 9 * 60 * 60
-
-
-def run_cmd(cmd, check=True):
-    try:
-        response = subprocess.run(
-            cmd,
-            stderr=subprocess.PIPE,
-            stdout=subprocess.PIPE,
-            check=check,
-            encoding="utf-8",
-        ).stdout.strip()
-    except subprocess.CalledProcessError as exc:
-        raise EnvironmentError(exc.stderr)
-
-    return response
-
-
-def get_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("checkpoints_path", type=str, help="base dir with checkpoints")
-    return parser.parse_args()
-
-
-def exit(msg):
-    print(msg)
-    sys.exit()
-
-
-def check_eval_crash(path):
-    """Heuristics to decide whether to restart this opt_step-XXX checkpoint evaluation or not"""
-    eval_0_completed_path = path / "start_run_evals_0_shots"
-    eval_4_completed_path = path / "start_run_evals_4_shots"
-    eval_perplexity_path = path / "start_run_evals_perplexity_validation"
-    # complicated checks - has another job already started processing? or did it crash?
-    for eval_start_path in [eval_0_completed_path, eval_4_completed_path, eval_perplexity_path]:
-        if eval_start_path.exists():
-            if eval_start_path.stat().st_mtime < time.time() - reasonable_eval_time_in_secs:
-                print(f"[Y] {path} looks stale - Probably crashed - Restart evals")
-                os.remove(eval_start_path)
-
-
-def main():
-    args = get_args()
-
-    checkpoints_path = Path(args.checkpoints_path)
-    if not (checkpoints_path.exists() and checkpoints_path.is_dir()):
-        raise FileNotFoundError(f"can't find a directory '{checkpoints_path}'")
-
-    checkpoint_dirs = list(checkpoints_path.glob("opt_step-*"))
-    if len(checkpoint_dirs) == 0:
-        exit("No checkpoints found, exiting")
-
-    checkpoint_dirs_sorted = sorted(checkpoint_dirs, key=lambda x: int(str(x).split("-")[-1]))
-    for i, checkpoint_dir in enumerate(checkpoint_dirs_sorted):
-        print(f"\n*** Checking {checkpoint_dir} for evals")
-        check_eval_crash(checkpoint_dir)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/m4/testing_utils.py b/m4/testing_utils.py
deleted file mode 100644
index 70d1e22caf003ca1d8bc3174e896ceb7fd347bae..0000000000000000000000000000000000000000
--- a/m4/testing_utils.py
+++ /dev/null
@@ -1,1116 +0,0 @@
-# Copyright 2020 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# adapted from https://stackoverflow.com/a/59041913/9201239
-import asyncio  # noqa
-import contextlib
-import importlib.util
-import inspect
-import json
-import logging
-import os
-import random
-import re
-import shutil
-import sys
-import tempfile
-import unittest
-from distutils.util import strtobool
-from io import StringIO
-from pathlib import Path
-from typing import Iterator, Union
-from unittest import mock
-from unittest.case import SkipTest
-
-import numpy as np
-from packaging import version
-from parameterized import parameterized
-
-
-try:
-    import torch
-
-    _torch_available = True
-except Exception:
-    _torch_available = False
-
-
-def is_torch_available():
-    return _torch_available
-
-
-def parse_flag_from_env(key, default=False):
-    try:
-        value = os.environ[key]
-    except KeyError:
-        # KEY isn't set, default to `default`.
-        _value = default
-    else:
-        # KEY is set, convert it to True or False.
-        try:
-            _value = strtobool(value)
-        except ValueError:
-            # More values are supported, but let's keep the message simple.
-            raise ValueError(f"If set, {key} must be yes or no.")
-    return _value
-
-
-def parse_int_from_env(key, default=None):
-    try:
-        value = os.environ[key]
-    except KeyError:
-        _value = default
-    else:
-        try:
-            _value = int(value)
-        except ValueError:
-            raise ValueError(f"If set, {key} must be a int.")
-    return _value
-
-
-def require_torch(test_case):
-    """
-    Decorator marking a test that requires PyTorch.
-
-    These tests are skipped when PyTorch isn't installed.
-
-    """
-    if not is_torch_available():
-        return unittest.skip("test requires PyTorch")(test_case)
-    else:
-        return test_case
-
-
-def require_torch_no_gpus(test_case):
-    """
-    Decorator marking a test that requires a setup without GPUs (in PyTorch). These tests are skipped on a machine with GPUs.
-
-    To run *only* the no gpu tests, assuming all test names contain no_gpu: $ pytest -sv ./tests -k "no_gpu"
-    """
-    import torch
-
-    if is_torch_available() and torch.cuda.device_count() > 0:
-        return unittest.skip("test requires an environment w/o GPUs")(test_case)
-    else:
-        return test_case
-
-
-def require_torch_multi_gpu(test_case):
-    """
-    Decorator marking a test that requires a multi-GPU setup (in PyTorch). These tests are skipped on a machine without
-    multiple GPUs.
-
-    To run *only* the multi_gpu tests, assuming all test names contain multi_gpu: $ pytest -sv ./tests -k "multi_gpu"
-    """
-    if not is_torch_available():
-        return unittest.skip("test requires PyTorch")(test_case)
-
-    import torch
-
-    if torch.cuda.device_count() < 2:
-        return unittest.skip("test requires multiple GPUs")(test_case)
-    else:
-        return test_case
-
-
-def require_torch_non_multi_gpu(test_case):
-    """
-    Decorator marking a test that requires 0 or 1 GPU setup (in PyTorch).
-    """
-    if not is_torch_available():
-        return unittest.skip("test requires PyTorch")(test_case)
-
-    import torch
-
-    if torch.cuda.device_count() > 1:
-        return unittest.skip("test requires 0 or 1 GPU")(test_case)
-    else:
-        return test_case
-
-
-def require_torch_up_to_2_gpus(test_case):
-    """
-    Decorator marking a test that requires 0 or 1 or 2 GPU setup (in PyTorch).
-    """
-    if not is_torch_available():
-        return unittest.skip("test requires PyTorch")(test_case)
-
-    import torch
-
-    if torch.cuda.device_count() > 2:
-        return unittest.skip("test requires 0 or 1 or 2 GPUs")(test_case)
-    else:
-        return test_case
-
-
-if is_torch_available():
-    # Set env var CUDA_VISIBLE_DEVICES="" to force cpu-mode
-    torch_device = "cuda" if torch.cuda.is_available() else "cpu"
-else:
-    torch_device = None
-
-
-def require_torch_gpu(test_case):
-    """Decorator marking a test that requires CUDA and PyTorch."""
-    if torch_device != "cuda":
-        return unittest.skip("test requires CUDA")(test_case)
-    else:
-        return test_case
-
-
-def is_deepspeed_available():
-    return importlib.util.find_spec("deepspeed") is not None
-
-
-def require_deepspeed(test_case):
-    """
-    Decorator marking a test that requires deepspeed
-    """
-    if not is_deepspeed_available():
-        return unittest.skip("test requires deepspeed")(test_case)
-    else:
-        return test_case
-
-
-def is_bnb_available():
-    return importlib.util.find_spec("bitsandbytes") is not None
-
-
-def require_bnb(test_case):
-    """
-    Decorator marking a test that requires bitsandbytes
-    """
-    if not is_bnb_available():
-        return unittest.skip("test requires bitsandbytes from https://github.com/facebookresearch/bitsandbytes")(
-            test_case
-        )
-    else:
-        return test_case
-
-
-def require_bnb_non_decorator():
-    """
-    Non-Decorator function that would skip a test if bitsandbytes is missing
-    """
-    if not is_bnb_available():
-        raise SkipTest("Test requires bitsandbytes from https://github.com/facebookresearch/bitsandbytes")
-
-
-def set_seed(seed: int = 42):
-    """
-    Helper function for reproducible behavior to set the seed in ``random``, ``numpy``, ``torch``
-
-    Args:
-        seed (:obj:`int`): The seed to set.
-    """
-    random.seed(seed)
-    np.random.seed(seed)
-    if is_torch_available():
-        torch.manual_seed(seed)
-        torch.cuda.manual_seed_all(seed)
-        # ^^ safe to call this function even if cuda is not available
-
-
-def get_gpu_count():
-    """
-    Return the number of available gpus (regardless of whether torch or tf is used)
-    """
-    if is_torch_available():
-        import torch
-
-        return torch.cuda.device_count()
-    else:
-        return 0
-
-
-def torch_assert_equal(actual, expected, **kwargs):
-    """
-    compare two tensors or non-tensor numbers for their equality
-    """
-    # assert_close was added around pt-1.9, it does better checks - e.g will check dimensions match
-    return torch.testing.assert_close(actual, expected, rtol=0.0, atol=0.0, **kwargs)
-
-
-def torch_assert_close(actual, expected, **kwargs):
-    """
-    compare two tensors or non-tensor numbers for their closeness.
-    """
-    # assert_close was added around pt-1.9, it does better checks - e.g will check dimensions match
-    return torch.testing.assert_close(actual, expected, **kwargs)
-
-
-def is_torch_bf16_available():
-    # from https://github.com/huggingface/transformers/blob/26eb566e43148c80d0ea098c76c3d128c0281c16/src/transformers/file_utils.py#L301
-    if is_torch_available():
-        import torch
-
-        if not torch.cuda.is_available() or torch.version.cuda is None:
-            return False
-        if torch.cuda.get_device_properties(torch.cuda.current_device()).major < 8:
-            return False
-        if int(torch.version.cuda.split(".")[0]) < 11:
-            return False
-        if not version.parse(torch.__version__) >= version.parse("1.09"):
-            return False
-        return True
-    else:
-        return False
-
-
-def require_torch_bf16(test_case):
-    """Decorator marking a test that requires CUDA hardware supporting bf16 and PyTorch >= 1.9."""
-    if not is_torch_bf16_available():
-        return unittest.skip("test requires CUDA hardware supporting bf16 and PyTorch >= 1.9")(test_case)
-    else:
-        return test_case
-
-
-def get_tests_dir(append_path=None):
-    """
-    Args:
-        append_path: optional path to append to the tests dir path
-
-    Return:
-        The full path to the `tests` dir, so that the tests can be invoked from anywhere. Optionally `append_path` is
-        joined after the `tests` dir the former is provided.
-
-    """
-    # this function caller's __file__
-    caller__file__ = inspect.stack()[1][1]
-    tests_dir = os.path.abspath(os.path.dirname(caller__file__))
-    if append_path:
-        return os.path.join(tests_dir, append_path)
-    else:
-        return tests_dir
-
-
-def parameterized_custom_name_func_join_params(func, param_num, param):
-    """
-    customize the test name generator function as we want all params to appear in the sub-test
-    name, as by default it shows only the first param or for multiple params it just uses a unique sequence of ids and no params at all.
-
-    Usage:
-
-    @parameterized.expand(
-        [
-            (0, True),
-            (0, False),
-            (1, True),
-        ],
-        name_func=parameterized_custom_name_func_join_params,
-    )
-    def test_determinism_wrt_rank(self, num_workers, pad_dataset):
-
-    which gives:
-
-    test_determinism_wrt_rank_0_true
-    test_determinism_wrt_rank_0_false
-    test_determinism_wrt_rank_1_true
-
-    """
-    param_based_name = parameterized.to_safe_name("_".join(str(x) for x in param.args))
-    return f"{func.__name__}_{param_based_name}"
-
-
-#
-# Helper functions for dealing with testing text outputs
-# The original code came from:
-# https://github.com/fastai/fastai/blob/master/tests/utils/text.py
-
-
-# When any function contains print() calls that get overwritten, like progress bars,
-# a special care needs to be applied, since under pytest -s captured output (capsys
-# or contextlib.redirect_stdout) contains any temporary printed strings, followed by
-# \r's. This helper function ensures that the buffer will contain the same output
-# with and without -s in pytest, by turning:
-# foo bar\r tar mar\r final message
-# into:
-# final message
-# it can handle a single string or a multiline buffer
-def apply_print_resets(buf):
-    return re.sub(r"^.*\r", "", buf, 0, re.M)
-
-
-def assert_screenout(out, what):
-    out_pr = apply_print_resets(out).lower()
-    match_str = out_pr.find(what.lower())
-    assert match_str != -1, f"expecting to find {what} in output: f{out_pr}"
-
-
-class CaptureStd:
-    """
-    Context manager to capture:
-
-    - stdout: replay it, clean it up and make it available via ``obj.out``
-    - stderr: replay it and make it available via ``obj.err``
-    - combined: combined the chosen streams and make it available via ``obj.combined``
-
-    init arguments:
-
-    - out - capture stdout:`` True``/``False``, default ``True``
-    - err - capture stdout: ``True``/``False``, default ``True``
-    - replay - whether to replay or not: ``True``/``False``, default ``True``. By default each
-    captured stream gets replayed back on context's exit, so that one can see what the test was
-    doing. If this is a not wanted behavior and the captured data shouldn't be replayed, pass
-    ``replay=False`` to disable this feature.
-
-    Examples::
-
-        # to capture stdout only with auto-replay
-        with CaptureStdout() as cs:
-            print("Secret message")
-        assert "message" in cs.out
-
-        # to capture stderr only with auto-replay
-        import sys
-        with CaptureStderr() as cs:
-            print("Warning: ", file=sys.stderr)
-        assert "Warning" in cs.err
-
-        # to capture both streams with auto-replay
-        with CaptureStd() as cs:
-            print("Secret message")
-            print("Warning: ", file=sys.stderr)
-        assert "message" in cs.out
-        assert "Warning" in cs.err
-
-        # to capture just one of the streams, and not the other, with auto-replay
-        with CaptureStd(err=False) as cs:
-            print("Secret message")
-        assert "message" in cs.out
-        # but best use the stream-specific subclasses
-
-        # to capture without auto-replay
-        with CaptureStd(replay=False) as cs:
-            print("Secret message")
-        assert "message" in cs.out
-
-        # sometimes it's easier to not try to figure out if it's stdout or stderr, and yet at
-        # other times the software may send the same output to stderr or stdout depending on
-        # environment, so to make the test robust a combined entry of both streams is available
-
-    """
-
-    def __init__(self, out=True, err=True, replay=True):
-        self.replay = replay
-
-        if out:
-            self.out_buf = StringIO()
-            self.out = "error: CaptureStd context is unfinished yet, called too early"
-        else:
-            self.out_buf = None
-            self.out = "not capturing stdout"
-
-        if err:
-            self.err_buf = StringIO()
-            self.err = "error: CaptureStd context is unfinished yet, called too early"
-        else:
-            self.err_buf = None
-            self.err = "not capturing stderr"
-
-            self.combined = "error: CaptureStd context is unfinished yet, called too early"
-
-    def __enter__(self):
-        if self.out_buf is not None:
-            self.out_old = sys.stdout
-            sys.stdout = self.out_buf
-
-        if self.err_buf is not None:
-            self.err_old = sys.stderr
-            sys.stderr = self.err_buf
-
-        self.combined = ""
-
-        return self
-
-    def __exit__(self, *exc):
-        if self.out_buf is not None:
-            sys.stdout = self.out_old
-            captured = self.out_buf.getvalue()
-            if self.replay:
-                sys.stdout.write(captured)
-            self.out = apply_print_resets(captured)
-            self.combined += self.out
-
-        if self.err_buf is not None:
-            sys.stderr = self.err_old
-            captured = self.err_buf.getvalue()
-            if self.replay:
-                sys.stderr.write(captured)
-            self.err = captured
-            self.combined += self.err
-
-    def __repr__(self):
-        msg = ""
-        if self.out_buf:
-            msg += f"stdout: {self.out}\n"
-        if self.err_buf:
-            msg += f"stderr: {self.err}\n"
-        return msg
-
-
-# in tests it's the best to capture only the stream that's wanted, otherwise
-# it's easy to miss things, so unless you need to capture both streams, use the
-# subclasses below (less typing). Or alternatively, configure `CaptureStd` to
-# disable the stream you don't need to test.
-
-
-class CaptureStdout(CaptureStd):
-    """Same as CaptureStd but captures only stdout"""
-
-    def __init__(self, replay=True):
-        super().__init__(err=False, replay=replay)
-
-
-class CaptureStderr(CaptureStd):
-    """Same as CaptureStd but captures only stderr"""
-
-    def __init__(self, replay=True):
-        super().__init__(out=False, replay=replay)
-
-
-class CaptureLogger:
-    """
-    Context manager to capture `logging` streams
-
-    Args:
-
-    - logger: 'logging` logger object
-
-    Results:
-        The captured output is available via `self.out`
-
-    Example::
-
-        >>> from transformers import logging
-        >>> from transformers.testing_utils import CaptureLogger
-
-        >>> msg = "Testing 1, 2, 3"
-        >>> logging.set_verbosity_info()
-        >>> logger = logging.get_logger("transformers.models.bart.tokenization_bart")
-        >>> with CaptureLogger(logger) as cl:
-        ...     logger.info(msg)
-        >>> assert cl.out, msg+"\n"
-    """
-
-    def __init__(self, logger):
-        self.logger = logger
-        self.io = StringIO()
-        self.sh = logging.StreamHandler(self.io)
-        self.out = ""
-
-    def __enter__(self):
-        self.logger.addHandler(self.sh)
-        return self
-
-    def __exit__(self, *exc):
-        self.logger.removeHandler(self.sh)
-        self.out = self.io.getvalue()
-
-    def __repr__(self):
-        return f"captured: {self.out}\n"
-
-
-@contextlib.contextmanager
-# adapted from https://stackoverflow.com/a/64789046/9201239
-def ExtendSysPath(path: Union[str, os.PathLike]) -> Iterator[None]:
-    """
-    Temporary add given path to `sys.path`.
-
-    Usage ::
-
-       with ExtendSysPath('/path/to/dir'):
-           mymodule = importlib.import_module('mymodule')
-
-    """
-
-    path = os.fspath(path)
-    try:
-        sys.path.insert(0, path)
-        yield
-    finally:
-        sys.path.remove(path)
-
-
-class TestCasePlus(unittest.TestCase):
-    """This class extends `unittest.TestCase` with additional features.
-
-    Feature 1: A set of fully resolved important file and dir path accessors.
-
-    In tests often we need to know where things are relative to the current test file, and it's not trivial since the
-    test could be invoked from more than one directory or could reside in sub-directories with different depths. This
-    class solves this problem by sorting out all the basic paths and provides easy accessors to them:
-
-    * ``pathlib`` objects (all fully resolved):
-
-       - ``test_file_path`` - the current test file path (=``__file__``)
-       - ``test_file_dir`` - the directory containing the current test file
-       - ``tests_dir`` - the directory of the ``tests`` test suite
-       - ``data_dir`` - the directory of the ``tests/data`` test suite
-       - ``repo_root_dir`` - the directory of the repository
-       - ``src_dir`` - the directory where the ``m4`` sub-dir resides (same as repo_root_dir in this case)
-
-    * stringified paths---same as above but these return paths as strings, rather than ``pathlib`` objects:
-
-       - ``test_file_path_str``
-       - ``test_file_dir_str``
-       - ``tests_dir_str``
-       - ``data_dir_str``
-       - ``repo_root_dir_str``
-       - ``src_dir_str``
-
-    Feature 2: Flexible auto-removable temporary dirs which are guaranteed to get removed at the end of test.
-
-    1. Create a unique temporary dir:
-
-    ::
-
-        def test_whatever(self):
-            tmp_dir = self.get_auto_remove_tmp_dir()
-
-    ``tmp_dir`` will contain the pathlib path to the created temporary dir. It will be automatically
-    removed at the end of the test.
-
-
-    2. Create a temporary dir of my choice, ensure it's empty before the test starts and don't
-    empty it after the test.
-
-    ::
-
-        def test_whatever(self):
-            tmp_dir = self.get_auto_remove_tmp_dir("./xxx")
-
-    This is useful for debug when you want to monitor a specific directory and want to make sure the previous tests
-    didn't leave any data in there.
-
-    3. You can override the first two options by directly overriding the ``before`` and ``after`` args, leading to the
-       following behavior:
-
-    ``before=True``: the temporary dir will always be cleared at the beginning of the test.
-
-    ``before=False``: if the temporary dir already existed, any existing files will remain there.
-
-    ``after=True``: the temporary dir will always be deleted at the end of the test.
-
-    ``after=False``: the temporary dir will always be left intact at the end of the test.
-
-    Use `self.get_auto_remove_tmp_dir_str()` instead if you want the returned value to be a non-pathlib version.
-
-    Note 1: In order to run the equivalent of ``rm -r`` safely, only subdirs of the project repository checkout are
-    allowed if an explicit ``tmp_dir`` is used, so that by mistake no ``/tmp`` or similar important part of the
-    filesystem will get nuked. i.e. please always pass paths that start with ``./``
-
-    Note 2: Each test can register multiple temporary dirs and they all will get auto-removed, unless requested
-    otherwise.
-
-    Feature 3: Get a copy of the ``os.environ`` object that sets up ``PYTHONPATH`` specific to the current test suite.
-    This is useful for invoking external programs from the test suite - e.g. distributed training.
-
-
-    ::
-        def test_whatever(self):
-            env = self.get_env()
-
-    """
-
-    def setUp(self):
-        # get_auto_remove_tmp_dir feature:
-        self.teardown_tmp_dirs = []
-
-        # figure out the resolved paths for repo_root, tests,  etc.
-        self._test_file_path = inspect.getfile(self.__class__)
-        path = Path(self._test_file_path).resolve()
-        self._test_file_dir = path.parents[0]
-        for up in [1, 2, 3]:
-            tmp_dir = path.parents[up]
-            if (tmp_dir / "m4").is_dir() and (tmp_dir / "tests").is_dir():
-                break
-        if tmp_dir:
-            self._repo_root_dir = tmp_dir
-        else:
-            raise ValueError(f"can't figure out the root of the repo from {self._test_file_path}")
-        self._tests_dir = self._repo_root_dir / "tests"
-        self._data_dir = self._repo_root_dir / "tests" / "test_data"
-        self._src_dir = self._repo_root_dir  # m4 doesn't use "src/" prefix in the repo
-
-    @property
-    def test_file_path(self):
-        return self._test_file_path
-
-    @property
-    def test_file_path_str(self):
-        return str(self._test_file_path)
-
-    @property
-    def test_file_dir(self):
-        return self._test_file_dir
-
-    @property
-    def test_file_dir_str(self):
-        return str(self._test_file_dir)
-
-    @property
-    def tests_dir(self):
-        return self._tests_dir
-
-    @property
-    def tests_dir_str(self):
-        return str(self._tests_dir)
-
-    @property
-    def data_dir(self):
-        return self._data_dir
-
-    @property
-    def data_dir_str(self):
-        return str(self._data_dir)
-
-    @property
-    def repo_root_dir(self):
-        return self._repo_root_dir
-
-    @property
-    def repo_root_dir_str(self):
-        return str(self._repo_root_dir)
-
-    @property
-    def src_dir(self):
-        return self._src_dir
-
-    @property
-    def src_dir_str(self):
-        return str(self._src_dir)
-
-    def get_env(self):
-        """
-        Return a copy of the ``os.environ`` object that sets up ``PYTHONPATH`` correctly. This is useful
-        for invoking external programs from the test suite - e.g. distributed training.
-
-        It always inserts ``.`` first, then ``./tests`` depending on the test suite type and
-        finally the preset ``PYTHONPATH`` if any (all full resolved paths).
-
-        """
-        env = os.environ.copy()
-        paths = [self.src_dir_str]
-        paths.append(self.tests_dir_str)
-        paths.append(env.get("PYTHONPATH", ""))
-
-        env["PYTHONPATH"] = ":".join(paths)
-        return env
-
-    def get_auto_remove_tmp_dir(self, tmp_dir=None, before=None, after=None):
-        """
-        Args:
-            tmp_dir (:obj:`string`, `optional`):
-                if :obj:`None`:
-
-                   - a unique temporary path will be created
-                   - sets ``before=True`` if ``before`` is :obj:`None`
-                   - sets ``after=True`` if ``after`` is :obj:`None`
-                else:
-
-                   - :obj:`tmp_dir` will be created
-                   - sets ``before=True`` if ``before`` is :obj:`None`
-                   - sets ``after=False`` if ``after`` is :obj:`None`
-            before (:obj:`bool`, `optional`):
-                If :obj:`True` and the :obj:`tmp_dir` already exists, make sure to empty it right away if :obj:`False`
-                and the :obj:`tmp_dir` already exists, any existing files will remain there.
-            after (:obj:`bool`, `optional`):
-                If :obj:`True`, delete the :obj:`tmp_dir` at the end of the test if :obj:`False`, leave the
-                :obj:`tmp_dir` and its contents intact at the end of the test.
-
-        Returns:
-            tmp_dir(:obj:`string`): either the same value as passed via `tmp_dir` or the path to the auto-selected tmp
-            dir
-        """
-        if tmp_dir is not None:
-            # defining the most likely desired behavior for when a custom path is provided.
-            # this most likely indicates the debug mode where we want an easily locatable dir that:
-            # 1. gets cleared out before the test (if it already exists)
-            # 2. is left intact after the test
-            if before is None:
-                before = True
-            if after is None:
-                after = False
-
-            # to avoid nuking parts of the filesystem, only relative paths are allowed
-            if not tmp_dir.startswith("./"):
-                raise ValueError(
-                    f"`tmp_dir` can only be a relative path, i.e. `./some/path`, but received `{tmp_dir}`"
-                )
-
-            # using provided path
-            tmp_dir = Path(tmp_dir).resolve()
-
-            # ensure the dir is empty to start with
-            if before is True and tmp_dir.exists():
-                shutil.rmtree(tmp_dir, ignore_errors=True)
-
-            tmp_dir.mkdir(parents=True, exist_ok=True)
-
-        else:
-            # defining the most likely desired behavior for when a unique tmp path is auto generated
-            # (not a debug mode), here we require a unique tmp dir that:
-            # 1. is empty before the test (it will be empty in this situation anyway)
-            # 2. gets fully removed after the test
-            if before is None:
-                before = True
-            if after is None:
-                after = True
-
-            # using unique tmp dir (always empty, regardless of `before`)
-            tmp_dir = Path(tempfile.mkdtemp())
-
-        if after is True:
-            # register for deletion
-            self.teardown_tmp_dirs.append(tmp_dir)
-
-        return tmp_dir
-
-    def get_auto_remove_tmp_dir_str(self, *args, **kwargs):
-        return str(self.get_auto_remove_tmp_dir(*args, **kwargs))
-
-    def tearDown(self):
-        # get_auto_remove_tmp_dir feature: remove registered temp dirs
-        for path in self.teardown_tmp_dirs:
-            shutil.rmtree(path, ignore_errors=True)
-        self.teardown_tmp_dirs = []
-
-
-def mockenv(**kwargs):
-    """
-    this is a convenience wrapper, that allows this ::
-
-    @mockenv(RUN_SLOW=True, USE_TF=False)
-    def test_something():
-        run_slow = os.getenv("RUN_SLOW", False)
-        use_tf = os.getenv("USE_TF", False)
-
-    """
-    return mock.patch.dict(os.environ, kwargs)
-
-
-# from https://stackoverflow.com/a/34333710/9201239
-@contextlib.contextmanager
-def mockenv_context(*remove, **update):
-    """
-    Temporarily updates the ``os.environ`` dictionary in-place. Similar to mockenv
-
-    The ``os.environ`` dictionary is updated in-place so that the modification is sure to work in all situations.
-
-    Args:
-      remove: Environment variables to remove.
-      update: Dictionary of environment variables and values to add/update.
-    """
-    env = os.environ
-    update = update or {}
-    remove = remove or []
-
-    # List of environment variables being updated or removed.
-    stomped = (set(update.keys()) | set(remove)) & set(env.keys())
-    # Environment variables and values to restore on exit.
-    update_after = {k: env[k] for k in stomped}
-    # Environment variables and values to remove on exit.
-    remove_after = frozenset(k for k in update if k not in env)
-
-    try:
-        env.update(update)
-        [env.pop(k, None) for k in remove]
-        yield
-    finally:
-        env.update(update_after)
-        [env.pop(k) for k in remove_after]
-
-
-# --- test network helper functions --- #
-
-
-def get_xdist_worker_id():
-    """
-    when run under pytest-xdist returns the worker id (int), otherwise returns 0
-    """
-    worker_id_string = os.environ.get("PYTEST_XDIST_WORKER", "gw0")
-    return int(worker_id_string[2:])  # strip "gw"
-
-
-DEFAULT_MASTER_PORT = 10999
-
-
-def get_unique_port_number():
-    """
-    When the test suite runs under pytest-xdist we need to make sure that concurrent tests won't use
-    the same port number. We can accomplish that by using the same base and always adding the xdist
-    worker id to it, or 0 if not running under pytest-xdist
-    """
-    return DEFAULT_MASTER_PORT + get_xdist_worker_id()
-
-
-# --- test IO helper functions --- #
-
-
-def write_file(file, content):
-    with open(file, "w") as f:
-        f.write(content)
-
-
-def read_json_file(file):
-    with open(file, "r") as fh:
-        return json.load(fh)
-
-
-def replace_str_in_file(file, text_to_search, replacement_text):
-    file = Path(file)
-    text = file.read_text()
-    text = text.replace(text_to_search, replacement_text)
-    file.write_text(text)
-
-
-# --- pytest conf functions --- #
-
-# to avoid multiple invocation from tests/conftest.py and examples/conftest.py - make sure it's called only once
-pytest_opt_registered = {}
-
-
-def pytest_addoption_shared(parser):
-    """
-    This function is to be called from `conftest.py` via `pytest_addoption` wrapper that has to be defined there.
-
-    It allows loading both `conftest.py` files at once without causing a failure due to adding the same `pytest`
-    option.
-
-    """
-    option = "--make-reports"
-    if option not in pytest_opt_registered:
-        parser.addoption(
-            option,
-            action="store",
-            default=False,
-            help="generate report files. The value of this option is used as a prefix to report names",
-        )
-        pytest_opt_registered[option] = 1
-
-
-def pytest_terminal_summary_main(tr, id):
-    """
-    Generate multiple reports at the end of test suite run - each report goes into a dedicated file in the current
-    directory. The report files are prefixed with the test suite name.
-
-    This function emulates --duration and -rA pytest arguments.
-
-    This function is to be called from `conftest.py` via `pytest_terminal_summary` wrapper that has to be defined
-    there.
-
-    Args:
-    - tr: `terminalreporter` passed from `conftest.py`
-    - id: unique id like `tests` or `examples` that will be incorporated into the final reports filenames - this is
-      needed as some jobs have multiple runs of pytest, so we can't have them overwrite each other.
-
-    NB: this functions taps into a private _pytest API and while unlikely, it could break should pytest do internal
-    changes - also it calls default internal methods of terminalreporter which can be hijacked by various `pytest-`
-    plugins and interfere.
-
-    """
-    from _pytest.config import create_terminal_writer
-
-    if not len(id):
-        id = "tests"
-
-    config = tr.config
-    orig_writer = config.get_terminal_writer()
-    orig_tbstyle = config.option.tbstyle
-    orig_reportchars = tr.reportchars
-
-    dir = f"reports/{id}"
-    Path(dir).mkdir(parents=True, exist_ok=True)
-    report_files = {
-        k: f"{dir}/{k}.txt"
-        for k in [
-            "durations",
-            "errors",
-            "failures_long",
-            "failures_short",
-            "failures_line",
-            "passes",
-            "stats",
-            "summary_short",
-            "warnings",
-        ]
-    }
-
-    # custom durations report
-    # note: there is no need to call pytest --durations=XX to get this separate report
-    # adapted from https://github.com/pytest-dev/pytest/blob/897f151e/src/_pytest/runner.py#L66
-    dlist = []
-    for replist in tr.stats.values():
-        for rep in replist:
-            if hasattr(rep, "duration"):
-                dlist.append(rep)
-    if dlist:
-        dlist.sort(key=lambda x: x.duration, reverse=True)
-        with open(report_files["durations"], "w") as f:
-            durations_min = 0.05  # sec
-            f.write("slowest durations\n")
-            for i, rep in enumerate(dlist):
-                if rep.duration < durations_min:
-                    f.write(f"{len(dlist)-i} durations < {durations_min} secs were omitted")
-                    break
-                f.write(f"{rep.duration:02.2f}s {rep.when:<8} {rep.nodeid}\n")
-
-    def summary_failures_short(tr):
-        # expecting that the reports were --tb=long (default) so we chop them off here to the last frame
-        reports = tr.getreports("failed")
-        if not reports:
-            return
-        tr.write_sep("=", "FAILURES SHORT STACK")
-        for rep in reports:
-            msg = tr._getfailureheadline(rep)
-            tr.write_sep("_", msg, red=True, bold=True)
-            # chop off the optional leading extra frames, leaving only the last one
-            longrepr = re.sub(r".*_ _ _ (_ ){10,}_ _ ", "", rep.longreprtext, 0, re.M | re.S)
-            tr._tw.line(longrepr)
-            # note: not printing out any rep.sections to keep the report short
-
-    # use ready-made report funcs, we are just hijacking the filehandle to log to a dedicated file each
-    # adapted from https://github.com/pytest-dev/pytest/blob/897f151e/src/_pytest/terminal.py#L814
-    # note: some pytest plugins may interfere by hijacking the default `terminalreporter` (e.g.
-    # pytest-instafail does that)
-
-    # report failures with line/short/long styles
-    config.option.tbstyle = "auto"  # full tb
-    with open(report_files["failures_long"], "w") as f:
-        tr._tw = create_terminal_writer(config, f)
-        tr.summary_failures()
-
-    # config.option.tbstyle = "short" # short tb
-    with open(report_files["failures_short"], "w") as f:
-        tr._tw = create_terminal_writer(config, f)
-        summary_failures_short(tr)
-
-    config.option.tbstyle = "line"  # one line per error
-    with open(report_files["failures_line"], "w") as f:
-        tr._tw = create_terminal_writer(config, f)
-        tr.summary_failures()
-
-    with open(report_files["errors"], "w") as f:
-        tr._tw = create_terminal_writer(config, f)
-        tr.summary_errors()
-
-    with open(report_files["warnings"], "w") as f:
-        tr._tw = create_terminal_writer(config, f)
-        tr.summary_warnings()  # normal warnings
-        tr.summary_warnings()  # final warnings
-
-    tr.reportchars = "wPpsxXEf"  # emulate -rA (used in summary_passes() and short_test_summary())
-
-    # Skip the `passes` report, as it starts to take more than 5 minutes, and sometimes it timeouts on CircleCI if it
-    # takes > 10 minutes (as this part doesn't generate any output on the terminal).
-    # (also, it seems there is no useful information in this report, and we rarely need to read it)
-    # with open(report_files["passes"], "w") as f:
-    #     tr._tw = create_terminal_writer(config, f)
-    #     tr.summary_passes()
-
-    with open(report_files["summary_short"], "w") as f:
-        tr._tw = create_terminal_writer(config, f)
-        tr.short_test_summary()
-
-    with open(report_files["stats"], "w") as f:
-        tr._tw = create_terminal_writer(config, f)
-        tr.summary_stats()
-
-    # restore:
-    tr._tw = orig_writer
-    tr.reportchars = orig_reportchars
-    config.option.tbstyle = orig_tbstyle
-
-
-# --- distributed testing functions --- #
-
-
-class _RunOutput:
-    def __init__(self, returncode, stdout, stderr):
-        self.returncode = returncode
-        self.stdout = stdout
-        self.stderr = stderr
-
-
-async def _read_stream(stream, callback):
-    while True:
-        line = await stream.readline()
-        if line:
-            callback(line)
-        else:
-            break
-
-
-async def _stream_subprocess(cmd, env=None, stdin=None, timeout=None, quiet=False, echo=False) -> _RunOutput:
-    if echo:
-        print("\nRunning: ", " ".join(cmd))
-
-    p = await asyncio.create_subprocess_exec(
-        cmd[0],
-        *cmd[1:],
-        stdin=stdin,
-        stdout=asyncio.subprocess.PIPE,
-        stderr=asyncio.subprocess.PIPE,
-        env=env,
-    )
-
-    # note: there is a warning for a possible deadlock when using `wait` with huge amounts of data in the pipe
-    # https://docs.python.org/3/library/asyncio-subprocess.html#asyncio.asyncio.subprocess.Process.wait
-    #
-    # If it starts hanging, will need to switch to the following code. The problem is that no data
-    # will be seen until it's done and if it hangs for example there will be no debug info.
-    # out, err = await p.communicate()
-    # return _RunOutput(p.returncode, out, err)
-
-    out = []
-    err = []
-
-    def tee(line, sink, pipe, label=""):
-        line = line.decode("utf-8").rstrip()
-        sink.append(line)
-        if not quiet:
-            print(label, line, file=pipe)
-
-    # XXX: the timeout doesn't seem to make any difference here
-    await asyncio.wait(
-        [
-            _read_stream(p.stdout, lambda line: tee(line, out, sys.stdout, label="stdout:")),
-            _read_stream(p.stderr, lambda line: tee(line, err, sys.stderr, label="stderr:")),
-        ],
-        timeout=timeout,
-    )
-    return _RunOutput(await p.wait(), out, err)
-
-
-def execute_subprocess_async(cmd, env=None, stdin=None, timeout=180, quiet=False, echo=True) -> _RunOutput:
-    loop = asyncio.get_event_loop()
-    result = loop.run_until_complete(
-        _stream_subprocess(cmd, env=env, stdin=stdin, timeout=timeout, quiet=quiet, echo=echo)
-    )
-
-    cmd_str = " ".join(cmd)
-    if result.returncode > 0:
-        stderr = "\n".join(result.stderr)
-        raise RuntimeError(
-            f"'{cmd_str}' failed with returncode {result.returncode}\n\n"
-            f"The combined stderr from workers follows:\n{stderr}"
-        )
-
-    # check that the subprocess actually did run and produced some output, should the test rely on
-    # the remote side to do the testing
-    if not result.stdout and not result.stderr:
-        raise RuntimeError(f"'{cmd_str}' produced no output.")
-
-    return result
diff --git a/m4/training/__init__.py b/m4/training/__init__.py
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/m4/training/config.py b/m4/training/config.py
deleted file mode 100644
index e67a052aad257e634701793020fa283288155602..0000000000000000000000000000000000000000
--- a/m4/training/config.py
+++ /dev/null
@@ -1,545 +0,0 @@
-import json
-import logging
-import time
-from dataclasses import InitVar, asdict, dataclass
-from pathlib import Path
-from typing import Any, Dict, List, Optional
-
-import git
-import yaml
-from simple_parsing import ArgumentParser, Serializable
-from simple_parsing.helpers import dict_field, list_field
-
-from m4.training.types import DatasetNames, DatasetTypes
-from m4.training.utils import FAKE_TOKEN_AROUND_IMAGE_V2, IMAGE_TOKEN, LoggingTypes
-
-
-logger = logging.getLogger(__name__)
-
-
-@dataclass
-class CfgFileConfig:
-    """Config file args"""
-
-    # path to config file
-    config: Optional[Path] = None
-    # set to false if you don't want to save config automatically
-    save_config: bool = True
-
-
-@dataclass
-class GlobalBatchSizeRampUp:
-    """These are init variables that are used to set up the GBS ramp up protocol"""
-
-    # global batch size ramp up protocol:
-    #
-    # 1. start with global batch size `start`
-    # 2. every time the number of `samples` is consumed increment global batch size by `increment`
-    # 3. repeat step 2 until global batch size reaches `finish`
-    start: Optional[int] = None
-    finish: Optional[int] = None
-    increment: Optional[int] = None
-    samples: Optional[int] = None
-
-
-@dataclass
-class GlobalBatchSizeRampUpRunningParams:
-    """The are running variables that are used to tell when to increment GBS and when to stop doing
-    that, they are never set directly in the config file, but are calculated when the training starts.
-    """
-
-    global_seen_samples: int = 0
-    global_batch_size_current: int = 0
-    next_goal_samples: int = 0
-    grad_acc_size_current: int = 1
-
-
-@dataclass
-class Hparams:
-    """General Hyperparameters"""
-
-    # --------------------
-    # General parameters
-    # --------------------
-
-    seed: int = 13
-    # If set to True, the sole purpose of the job is to pre-process the dataset (i.e. the map
-    # operations). The job will exit as soon as the dataset is pre-processed.
-    just_preprocess: bool = False
-    jz_job_time_sec: Optional[float] = None
-    jz_start_time: float = time.time()
-    job_id: Optional[int] = None
-    timeout: int = 1800  # 30 min
-    # set to False to ignore the optimizer states when loading from a checkpoint
-    load_optimizer_states: Optional[bool] = True
-    # set to False to disable this gpu memory saving method
-    gradient_checkpointing: Optional[bool] = True
-
-    # --------------------
-    # Model-related hparams
-    # --------------------
-    tokenizer_name: str = "gpt2"
-    # The value of the string will evaluated (i.e. interpreted) and must be a dict
-    tokenizer_params: str = '{"use_fast":True}'
-    tokenizer_add_tokens: str = (
-        f'[AddedToken("{FAKE_TOKEN_AROUND_IMAGE_V2}", rstrip=False, lstrip=False), AddedToken("{IMAGE_TOKEN}",'
-        " rstrip=False, lstrip=False)]"
-    )
-    # The value of the string will evaluated (i.e. interpreted). Unnecessary if tokenizer has a `pad_token`.
-    tokenizer_add_special_tokens: str = '{"pad_token": tokenizer.eos_token}'
-    model_name: str = "gpt2"
-    revision: str = "main"
-    model_params: Dict[str, Any] = dict_field(
-        dict(
-            vision_embed_dim=768,
-            vision_image_size=224,
-            vision_model_name="google/vit-base-patch16-224",
-            # The value of the string will evaluated (i.e. interpreted) and must be a dict
-            vision_model_params="{}",
-            # Ties the word embedding with LM head's weights
-            # Since word embedding is frozen, use in conjuncation with freeze_lm_head=True
-            tie_word_embeddings=False,
-            # Freeze different parts of the model
-            freeze_lm_head=False,
-            freeze_text_layers=True,
-            freeze_text_module_exceptions=[],
-            freeze_vision_layers=True,
-            freeze_vision_module_exceptions=[],
-            # Perceiver Resampler Parameters
-            use_resampler=False,
-            resampler_n_latents=64,
-            resampler_depth=6,
-            resampler_n_heads=16,
-            resampler_head_dim=96,
-        )
-    )
-
-    # --------------------
-    # Training parameters
-    # --------------------
-    resume_run: Optional[bool] = None
-    do_validation: bool = True
-
-    # deprecated in favor of batch_size_per_gpu
-    batch_size: Optional[int] = None
-    batch_size_per_gpu: int = 1
-    global_batch_size: Optional[int] = None
-
-    global_batch_size_ramp_up: GlobalBatchSizeRampUp = GlobalBatchSizeRampUp()
-    grad_acc_size: Optional[int] = 1
-
-    grad_clip: float = 1.0
-
-    # weights by which to multiply the loss of each dataset when accumulating gradients over datasets
-    loss_weights_per_dataset: Optional[List[float]] = None
-    # int(max_num_tokens / (batch_size * max_seq_len * grad_acc_size * num_processes))
-    max_num_opt_steps: Optional[int] = 500_000
-    max_num_opt_steps_this_run: Optional[int] = None
-    max_num_epochs: Optional[int] = None
-
-    # If the path appears the program will stop after finishing the current training step
-    kill_switch_path: Optional[Path] = None
-
-    # If the path appears the program will save a checkpoint and immediately delete this flag
-    save_switch_path: Optional[Path] = None
-
-    # --------------------
-    # Logging parameters
-    # --------------------
-    train_logging_opt_steps: int = 50
-    train_logging_per_dataset_suffix: str = ""
-
-    # If a specific logging type is specified, per dataset information will be inserted inside
-    # those logs.
-    train_logging_per_dataset_info: List[LoggingTypes] = list_field(LoggingTypes.JSONL, LoggingTypes.WANDB)
-
-    # If `train_logging_activations` is not empty, hooks will be inserted to the model to track
-    # the min/max/std/norm of the activations and weights. This will slow down training.
-    # See https://huggingface.co/docs/transformers/main/en/debugging#underflow-and-overflow-detection
-    train_logging_activations: List[LoggingTypes] = list_field()
-    train_logging_activations_opt_steps: Optional[int] = 25
-    train_logging_grad_param_deepspeed: List[LoggingTypes] = list_field()
-    train_logging_grad_param_deepspeed_opt_steps: int = 50
-    val_logging_opt_steps: int = train_logging_opt_steps * 5
-    val_inline_logging_opt_steps: int = train_logging_opt_steps
-    train_saving_opt_steps: int = train_logging_opt_steps * 5
-    save_dir: Optional[Path] = None
-    upload_to_s3: bool = False
-    train_log_mem_usage: bool = False
-    timing_break_down: bool = False
-
-    save_batch_max_idx: Optional[int] = None
-    save_batch_min_idx: Optional[int] = None
-
-    # ----------------------
-    # Wandb Parameters
-    # ----------------------
-    wandb_enable: bool = False
-    # name of the project
-    wandb_project: str = "VLOOM"
-    wandb_entity: str = "huggingfacem4"
-    # name of the wandb entity
-    wandb_log_freq: int = 50
-    wandb_run_id: str = ""
-    wandb_tags: Optional[List[str]] = None
-
-    repo_commit_id: Optional[str] = None
-
-    # ----------------------
-    # Debug Parameters
-    # ----------------------
-    use_torch_profiler: bool = False
-
-
-@dataclass
-class ResumeParams:
-    # ----------------------
-    # Resume run Parameters
-    # ----------------------
-    # Need to make sure that resume_run is True to give an input here
-    opt_step_dir: Optional[Path] = None
-    accelerator_state_dir: Optional[Path] = None
-    model_file: Optional[Path] = None
-    model_config_file: Optional[Path] = None
-    # Automatically resumes last run of the save_dir. Set to False to choose a specific run
-    resume_last: bool = True
-    train_logs: Dict = dict_field()
-    resume_opt_step: int = 0
-    resume_epoch: int = 0
-    resume_dataset_state: List = list_field()
-
-    gbs_running: GlobalBatchSizeRampUpRunningParams = GlobalBatchSizeRampUpRunningParams()
-
-
-@dataclass
-class DatasetParams:
-    # This always need to be specified as it is needed by dataset utils down the line
-    dataset_name: DatasetNames
-    # max number of images per sample
-    max_num_images: int = 5
-    # maximum sequence length
-    max_seq_len: int = 256
-    training_datasets_paths: List[Path] = list_field()
-    validation_datasets_paths: List[Path] = list_field()
-    # if True, instead of split and pack, each instance in sample will be
-    # either truncated or padded to the same length.
-    pad_dataset: bool = False
-    map_batch_size: int = 64
-    # Preprocessing number of processes in map (not useful for processing on the fly)
-    map_num_proc: Optional[int] = None
-    # Decides how many number of samples/subsequence should be extracted from the
-    # CM4 corpus when the dataset is to be padded irrelavent otherwise as full packing
-    # is used
-    max_num_samples_per_document: int = 10
-
-    # Strategy for detecting blur, laplacian or fft
-    blur_strategy: str = "fft"
-    # Threshold for blur detection, 0.0 means disabled. Set 32 for "laplacian" and
-    # 10 for "fft" for starters
-    blur_threshold: float = 0.0
-
-    add_begin_of_doc_token: bool = False
-    add_end_of_doc_token: bool = True
-
-    shuffle_after_packing: bool = False
-
-    # Parameters for T5 MLM
-    t5_mlm_noise_density: float = 0.15
-    t5_mlm_mean_noise_span_length: int = 3
-
-    dataset_type: Optional[DatasetTypes] = None
-
-    # Parameters for webdataset pipeline
-    shuffle_initial_urls_list: bool = False
-    shuffle_before_split_by_node_buffer_size: Optional[int] = None
-    shuffle_before_split_by_worker_buffer_size: Optional[int] = None
-    shuffle_after_tarfile_to_samples_buffer_size: Optional[int] = None
-    shuffle_after_batching_buffer_size: Optional[int] = None
-
-
-@dataclass
-class ImageCaptionPairedDatasetParams(DatasetParams):
-    # PMD only: This value decides the probability of the image token being at the start
-    # of the text or at the end of the text. Set to 0.5 for equal probability.
-    # Set to 0 for the image always at start.
-    prob_image_at_end: float = 0.5
-    # PMD only: Specifies the tolerance for the amount of padding in a sequence. If set
-    # to -1, then all padding will be tolerated. If set to 0, then no padding will be tolerated.
-    # Continuously increase this value to allow more padding in the sequence.
-    padding_tolerance: int = -1
-    dataset_type: DatasetTypes = DatasetTypes.IMAGE_CAPTION_PAIRS
-
-
-@dataclass
-class WebDocumentsDatasetParams(DatasetParams):
-    # Decide how often should the image attention mask is such that the
-    # the text attends to next image. Set to 0 for just perceding images
-    # NOTE: For PMD, this option doesn't apply anymore. Use `prob_image_at_end`
-    # to control the position of the image and corresponding image.
-    p_next: float = 0.5
-    dataset_type: DatasetTypes = DatasetTypes.WEB_DOCUMENTS
-
-
-@dataclass
-class DataParams(Serializable):
-    """Data Parameters"""
-
-    # what software to use for the dataset
-    use_webdataset: bool = False
-
-    # number of workers for dataloaders int
-    num_workers: int = 1
-    # allow async faster data transfer to GPUs (only make sense when CUDA GPUs are available)
-    # known to cause memory issues
-    pin_memory: bool = False
-    # Whether to use persistent workers for the dataloaders
-    persistent_workers: bool = True
-    realtime_processing: bool = False
-
-    train_seed: int = 1
-    val_seed: int = 2
-
-    # can use one config for both train + validation or specific ones if need to be different
-    select_n_examples: Optional[int] = None
-    select_n_examples_train: Optional[int] = None
-    select_n_examples_validation: Optional[int] = None
-
-    # TODO: Move to per dataset params as it makes more sense there
-    proba_interleaving_dataset: Optional[List[float]] = None
-
-    pmd: ImageCaptionPairedDatasetParams = ImageCaptionPairedDatasetParams(dataset_name=DatasetNames.PMD)
-    laion: ImageCaptionPairedDatasetParams = ImageCaptionPairedDatasetParams(dataset_name=DatasetNames.LAION)
-    cm4: WebDocumentsDatasetParams = WebDocumentsDatasetParams(dataset_name=DatasetNames.CM4)
-    wiki: WebDocumentsDatasetParams = WebDocumentsDatasetParams(dataset_name=DatasetNames.WIKI)
-
-
-@dataclass
-class OptimizerParams:
-    """Optimization parameters"""
-
-    # --------------------
-    # vl optim parameters
-    # --------------------
-    vl_optim: str = "AdamW"
-    vl_optim_params: Dict[str, Any] = dict_field(
-        dict(
-            # learning rate
-            lr=1e-4,
-            # betas for adam
-            betas=(0.9, 0.999),
-            weight_decay=0.1,
-            no_decay=["bias", "alpha", "layernorm", "ln", "layer_norm", "perceiver_resampler"],
-        )
-    )
-
-    vl_lr_scheduler: str = "get_constant_schedule_with_warmup"
-    # number of warmup steps for the learning rate
-    vl_lr_scheduler_params: Dict[str, Any] = dict_field(dict(num_warmup_steps=5_000, last_epoch=-1))
-    z_loss: float = 0.0
-
-
-@dataclass
-class Parameters(Serializable):
-    """base options."""
-
-    hparams: Hparams = Hparams()
-    optim_param: OptimizerParams = OptimizerParams()
-    data_param: DataParams = DataParams()
-    resume_param: ResumeParams = ResumeParams()
-    should_verify: InitVar[bool] = True
-
-    def verify(self, should_verify: bool):
-        if not should_verify:
-            return
-
-        dict_rep = vars(self)
-        expected = vars(self.__class__(should_verify=False))
-        for key, value in dict_rep.items():
-            if isinstance(value, dict):
-                diff = set(value.keys()) - set(asdict(expected[key]).keys())
-                raise TypeError(
-                    f"{key} in {self.__class__.__name__} has extra keys: {diff}. Please fix your config if you are"
-                    " using one."
-                )
-            if key not in expected:
-                raise ValueError(f"{key} is not a valid parameter for {self.__class__.__name__}")
-
-    def __post_init__(self, should_verify: bool = True):
-        """Post-initialization code"""
-        self.verify(should_verify=should_verify)
-
-        # copy select_n_examples to the more specific ones if the latter haven't been preset
-        if self.data_param.select_n_examples is not None:
-            if self.data_param.select_n_examples_train is None:
-                self.data_param.select_n_examples_train = self.data_param.select_n_examples
-            if self.data_param.select_n_examples_validation is None:
-                self.data_param.select_n_examples_validation = self.data_param.select_n_examples
-
-        # Get commit id
-        if self.hparams.repo_commit_id is None:
-            self.hparams.repo_commit_id = git.Repo(search_parent_directories=True).head.object.hexsha
-
-        # If processing on the fly, with the current implementation, we can't have `num_workers=0`
-        if self.data_param.realtime_processing and self.data_param.num_workers == 0:
-            raise ValueError(
-                "If doing processing on the fly (and thus using the `IterableDataset`), you can't have `num_workers`"
-                " equal to 0."
-            )
-
-        # batch_size deprecation
-        if self.hparams.batch_size is not None:
-            if self.hparams.batch_size_per_gpu > 1:
-                raise ValueError(
-                    "as hparams.batch_size is deprecated - don't know how to proceed with both hparams.batch_size>1"
-                    " and hparams.batch_size_per_gpu > 1"
-                )
-            else:
-                logger.warning(
-                    "will use the deprecated hparams.batch_size, but transition to hparams.batch_size_per_gpu instead"
-                )
-                self.hparams.batch_size_per_gpu = self.hparams.batch_size
-        self.hparams.batch_size = None
-
-        # Assign batch size to data_param as well for dataloaders
-        self.data_param.batch_size = self.hparams.batch_size_per_gpu
-
-        # note: all global batch_size-related configs including hparams.grad_acc_size will be
-        # checked/set in trainer's setup_batch_size_related_configs since we need to know the value
-        # of num_processes
-
-        # Assign loggingtypes given values
-        self.hparams.train_logging_activations = [LoggingTypes(val) for val in self.hparams.train_logging_activations]
-
-        # Check that proba_interleaving_dataset is mutually exclusive to loss_weights_per_dataset
-        if self.data_param.proba_interleaving_dataset and self.hparams.loss_weights_per_dataset:
-            raise ValueError(
-                "Can't have hparams.loss_weights_per_dataset and proba_interleaving_dataset. If we have"
-                " loss_weights_per_dataset, it means the gradients are accumulated over datasets. Therefore a batch of"
-                " each given at each update and there is no use of proba_interleaving_dataset"
-            )
-
-        if (
-            self.data_param.proba_interleaving_dataset is not None
-            and sum(self.data_param.proba_interleaving_dataset) != 1
-        ):
-            raise ValueError("proba_interleaving_dataset must sum to 1")
-
-        self.hparams.train_logging_grad_param_deepspeed = [
-            LoggingTypes(val) for val in self.hparams.train_logging_grad_param_deepspeed
-        ]
-
-        # Resume run if there is already an existing folder for this run
-        if self.hparams.save_dir is not None and self.hparams.save_dir.exists():
-            save_dir_has_checkpoints = (
-                len([dir for dir in self.hparams.save_dir.iterdir() if (dir.is_dir() and "opt_step" in str(dir))]) > 0
-            )
-            if self.hparams.resume_run is not None and not self.hparams.resume_run and save_dir_has_checkpoints:
-                logger.warning(
-                    "`resume_run` was explicitely set to False (i.e. starting from scratch), but the experiment"
-                    " folder already has been populated with previous runs.\nAlready saved checkpoints will be"
-                    " overwritten (at best, when `train_saving_opt_steps` is the same) or will be mixed with the new"
-                    " checkpoints of a potentially brand new experiment. Would it make sense to create a new"
-                    " `save_dir`?"
-                )
-            self.hparams.resume_run = save_dir_has_checkpoints
-
-        # Setup all args needed to resume a run
-        if self.hparams.resume_run:
-            # Get last step directory
-            if self.resume_param.opt_step_dir is None and not self.resume_param.resume_last:
-                raise ValueError(
-                    "`opt_step_dir` cannot be None while `resume_last` is False. Choose which dir you want to resume"
-                    " from..."
-                )
-            if self.resume_param.resume_last:
-                if self.resume_param.opt_step_dir is not None:
-                    raise ValueError(
-                        "`resume_last` cannot be True while `opt_step_dir` is not None. Choose which dir you want to"
-                        " resume from..."
-                    )
-                latest_path = self.hparams.save_dir / "latest_opt_step_dir"
-                with open(latest_path, "r") as fd:
-                    self.resume_param.opt_step_dir = Path(fd.read().strip())
-                if not (self.resume_param.opt_step_dir.exists() and self.resume_param.opt_step_dir.is_dir()):
-                    raise ValueError(
-                        f"It appears that the path in the `latest_opt_step_dir` file {latest_path} is invalid. It's"
-                        " either does not exist or is not a directory. Please fix that."
-                    )
-
-            with open(self.resume_param.opt_step_dir / "resume_run_infos.json", "r") as f:
-                resume_infos = json.load(f)
-            logger.info(f"Resuming from {self.resume_param.opt_step_dir}")
-            self.resume_param.accelerator_state_dir = self.resume_param.opt_step_dir / "accelerator_state"
-            self.resume_param.model_file = self.resume_param.opt_step_dir / "unwrapped_model"
-            self.resume_param.model_config_file = self.resume_param.opt_step_dir / "unwrapped_model/config.json"
-            self.resume_param.tokenizer = self.resume_param.opt_step_dir / "tokenizer"
-
-            self.resume_param.train_logs = resume_infos["train_logs"]
-            self.resume_param.resume_opt_step = resume_infos["resume_opt_step"]
-            self.resume_param.resume_epoch = resume_infos["resume_epoch"]
-            self.resume_param.resume_dataset_state = resume_infos.get("resume_dataset_state", list())
-
-            gbs_running = resume_infos["gbs_running"]
-            self.resume_param.gbs_running.global_batch_size_current = gbs_running["global_batch_size_current"]
-            self.resume_param.gbs_running.global_seen_samples = gbs_running["global_seen_samples"]
-            self.resume_param.gbs_running.next_goal_samples = gbs_running["next_goal_samples"]
-            self.resume_param.gbs_running.grad_acc_size_current = gbs_running["grad_acc_size_current"]
-
-            self.hparams.wandb_run_id = resume_infos["wandb_run_id"]
-            self.hparams.seed = resume_infos["seed"]
-
-            # Should not happen, but this is in case there is a run mixing
-            # wandb_enable = True and wandb_enable = False between jobs
-            if not self.hparams.wandb_enable:
-                self.hparams.wandb_run_id = ""
-
-    @classmethod
-    def parse(cls):
-        cfgfile_parser = ArgumentParser(add_help=False)
-        cfgfile_parser.add_arguments(CfgFileConfig, dest="cfgfile")
-        cfgfile_args, rest = cfgfile_parser.parse_known_args()
-
-        cfgfile: CfgFileConfig = cfgfile_args.cfgfile
-
-        file_config: Optional[Parameters] = None
-        if cfgfile.config is not None:
-            file_config = Parameters.load(cfgfile.config, load_fn=yaml.safe_load)
-
-        parser = ArgumentParser()
-
-        # add cfgfile args so they appear in the help message
-        parser.add_arguments(CfgFileConfig, dest="cfgfile")
-        parser.add_arguments(Parameters, dest="parameters", default=file_config)
-
-        # XXX: currently when called from tests we don't want to parse pytest arguments, so either
-        # this whole logic needs to be rewritten to not always call `parser.parse_args` but only
-        # when needed, for now as a workaround using `parse_known_args` and ignoring the args which
-        # don't belong to this program
-        args, unknown = parser.parse_known_args()
-
-        parameters: Parameters = args.parameters
-
-        parameters.save_config = cfgfile.save_config
-
-        return parameters
-
-    def save_config_state(self):
-        if self.save_config:
-            self.hparams.save_dir.mkdir(parents=True, exist_ok=True)
-            if self.hparams.job_id is not None:
-                config_file_name = f"{self.hparams.job_id}_config.yaml"
-            else:
-                config_file_name = "config.yaml"
-            self.save(self.hparams.save_dir / config_file_name, indent=4)
-
-
-def get_config(print_config: bool = True):
-    parameters: Parameters = Parameters.parse()
-    if print_config:
-        print(parameters)
-    return parameters
-
-
-if __name__ == "__main__":
-    config = get_config()
-    config.save_config_state()
diff --git a/m4/training/dataset_utils.py b/m4/training/dataset_utils.py
deleted file mode 100644
index 81e2037a68b39bcdc89dc8c0413135c95167cff9..0000000000000000000000000000000000000000
--- a/m4/training/dataset_utils.py
+++ /dev/null
@@ -1,352 +0,0 @@
-import logging
-import random
-
-import webdataset as wds
-from webdataset.tariterators import group_by_keys, tar_file_expander, url_opener
-
-from m4.training.types import DatasetTypes
-
-
-meta_prefix = "__"
-meta_suffix = "__"
-
-logger = logging.getLogger(__name__)
-trace = False
-
-
-def webdoc_valid_sample(sample):
-    """Check whether a sample is valid.
-
-    :param sample: sample to be checked
-    """
-    return (
-        sample is not None
-        and isinstance(sample, dict)
-        and len(list(sample.keys())) > 0
-        and not sample.get("__bad__", False)
-        and sample_has_all_files(sample)
-    )
-
-
-def sample_has_all_files(current_sample):
-    meta = current_sample.get("metadata.value", None)
-    if meta is None:
-        return False
-    meta = meta.decode("utf-8")
-    if len(meta) == 0:
-        return False
-    target_file_list = meta.split("\n")
-    fname_keys = [key for key in current_sample.keys() if key.endswith(".fname")]
-    fnames = [current_sample[key] for key in fname_keys]
-    check = all([fname in fnames for fname in target_file_list])
-    if not check:
-        return False
-    return True
-
-
-class ImageDecoder:
-    def __call__(self, bytes_):
-        import io
-
-        import PIL.Image
-
-        img = PIL.Image.open(io.BytesIO(bytes_))
-        img.load()
-        return img
-
-
-# Taken from https://github.com/mlfoundations/open_clip/blob/c48111dacac55db24878af229d8a5662c03e6f1c/src/training/data.py#L180-L183
-def log_and_continue(exn):
-    """Call in an exception handler to ignore any exception, issue a warning, and continue."""
-    logging.warning(f"Handling webdataset error ({repr(exn)}). Ignoring.")
-    return True
-
-
-# Adapt group_by_keys to our webdocument format in which each samples contains several text and image files
-# https://github.com/webdataset/webdataset/blob/039d74319ae55e5696dcef89829be9671802cf70/webdataset/tariterators.py#L195-L250
-def group_by_keys_interleaved(data, handler=log_and_continue):
-    """Return function over iterator that groups key, value pairs into samples."""
-    current_sample = None
-    for filesample in data:
-        try:
-            assert isinstance(filesample, dict)
-            fname, value = filesample["fname"], filesample["data"]
-            fname = fname.strip("./")
-            if fname.endswith(".metadata.txt"):
-                prefix, data_type, extension = fname.split(".")
-                suffix = data_type
-            else:
-                prefix, idx, data_type, extension = fname.split(".")
-                if data_type not in ["text", "image"]:
-                    raise ValueError(f"{fname}: unknown data type {data_type}")
-                suffix = idx
-            if trace:
-                print(
-                    f"prefix: {prefix}, idx: {idx}, data_type: {data_type}, extension: {extension}, keys:"
-                    f" {current_sample.keys() if isinstance(current_sample, dict) else None}"
-                )
-            if prefix is None:
-                continue
-            if current_sample is None or prefix != current_sample["__key__"]:
-                valid = webdoc_valid_sample(current_sample)
-                if valid:
-                    yield current_sample
-                elif current_sample is not None:
-                    logging.warning(f"{fname}: invalid sample {current_sample} ignored")
-                current_sample = dict(__key__=prefix, __url__=filesample["__url__"])
-            if suffix in current_sample:
-                raise ValueError(f"{fname}: duplicate file name in tar file {suffix} {current_sample.keys()}")
-            current_sample[f"{suffix}.value"] = value
-            current_sample[f"{suffix}.type"] = data_type
-            current_sample[f"{suffix}.fname"] = fname
-        except Exception as exn:
-            exn.args = exn.args + (filesample.get("stream"), filesample.get("url"))
-            if handler(exn):
-                continue
-            else:
-                break
-
-    if webdoc_valid_sample(current_sample):
-        yield current_sample
-
-
-def _tarfile_to_webdocument_samples(src, handler=log_and_continue):
-    streams = url_opener(src, handler=handler)
-    files = tar_file_expander(streams, handler=handler)
-    samples = group_by_keys_interleaved(files, handler=handler)
-    return samples
-
-
-tarfile_to_webdocument_samples = wds.filters.pipelinefilter(_tarfile_to_webdocument_samples)
-
-
-def _collate_texts_and_images_webdocument(data, handler=log_and_continue):
-    for sample in data:
-        try:
-            max_example_indices = max(
-                [int(key.split(".")[0]) for key in sample.keys() if key.endswith(".value") and key != "metadata.value"]
-            )
-            texts = [None for _ in range(max_example_indices + 1)]
-            images = [None for _ in range(max_example_indices + 1)]
-            for idx in range(max_example_indices + 1):
-                if f"{idx}.value" not in sample:
-                    continue
-                if "text" in sample[f"{idx}.type"]:
-                    texts[idx] = sample[f"{idx}.value"]
-                elif "image" in sample[f"{idx}.type"]:
-                    images[idx] = sample[f"{idx}.value"]
-                else:
-                    raise ValueError(f"Unknown data type: {sample[f'{idx}.type']}")
-            example = {"__key__": sample["__key__"], "__url__": sample["__url__"], "texts": texts, "images": images}
-            yield example
-        except Exception as exn:
-            exn.args = exn.args + (sample.get("stream"), sample.get("url"))
-            if handler(exn):
-                continue
-            else:
-                break
-
-
-collate_texts_and_images_webdocument = wds.filters.pipelinefilter(_collate_texts_and_images_webdocument)
-
-
-def _decode_image_and_text_webdocument(data, handler=log_and_continue):
-    image_decoder = ImageDecoder()
-    for sample in data:
-        try:
-            sample["images"] = [image_decoder(image) if image is not None else None for image in sample["images"]]
-            sample["texts"] = [text.decode("utf-8") if text is not None else None for text in sample["texts"]]
-            yield sample
-        except Exception as exn:
-            exn.args = exn.args + (sample.get("stream"), sample.get("url"))
-            if handler(exn):
-                continue
-            else:
-                break
-
-
-decode_image_and_text_webdocument = wds.filters.pipelinefilter(_decode_image_and_text_webdocument)
-
-
-def collate_dicts(samples):
-    keys = samples[0].keys()
-    batched_samples = {key: [sample[key] for sample in samples] for key in keys}
-    return batched_samples
-
-
-def get_webdocuments_webdataset(
-    urls,
-    batch_size,
-    shuffle_initial_urls_list=False,
-    shuffle_before_split_by_node_buffer_size=100,
-    shuffle_before_split_by_worker_buffer_size=100,
-    shuffle_after_tarfile_to_samples_buffer_size=100,
-    shuffle_after_batching_buffer_size=1000,
-):
-    if shuffle_initial_urls_list:
-        random.shuffle(urls)
-
-    pipeline_list = [wds.SimpleShardList(urls)]
-
-    if shuffle_before_split_by_node_buffer_size is not None:
-        pipeline_list.append(wds.shuffle(shuffle_before_split_by_node_buffer_size))
-
-    pipeline_list.append(wds.split_by_node)
-
-    if shuffle_before_split_by_worker_buffer_size is not None:
-        pipeline_list.append(wds.shuffle(shuffle_before_split_by_worker_buffer_size))
-
-    pipeline_list.extend(
-        [
-            wds.split_by_worker,
-            tarfile_to_webdocument_samples(),
-        ]
-    )
-
-    if shuffle_after_tarfile_to_samples_buffer_size is not None:
-        pipeline_list.append(wds.shuffle(shuffle_after_tarfile_to_samples_buffer_size))
-
-    pipeline_list.extend(
-        [
-            collate_texts_and_images_webdocument(),
-            decode_image_and_text_webdocument(),
-            wds.batched(batch_size, collation_fn=collate_dicts, partial=True),
-        ]
-    )
-
-    if shuffle_after_batching_buffer_size is not None:
-        pipeline_list.append(wds.shuffle(shuffle_after_batching_buffer_size))
-
-    dataset = wds.DataPipeline(pipeline_list)
-    return dataset
-
-
-def split_keep_2(x):
-    x = x.strip("./")
-    x_splitter = x.split(".")
-    return x_splitter[0], x_splitter[1]
-
-
-def _tarfile_to_pair_samples(src, handler=log_and_continue):
-    streams = url_opener(src, handler=handler)
-    files = tar_file_expander(streams, handler=handler)
-    samples = group_by_keys(files, keys=split_keep_2, handler=handler)
-    return samples
-
-
-tarfile_to_pair_samples = wds.filters.pipelinefilter(_tarfile_to_pair_samples)
-
-
-def _decode_image_and_text_pairs(data, handler=log_and_continue):
-    image_decoder = ImageDecoder()
-    for sample in data:
-        try:
-            sample["image"] = image_decoder(sample["image"])
-            sample["text"] = sample["text"].decode("utf-8")
-            yield sample
-        except Exception as exn:
-            exn.args = exn.args + (sample.get("stream"), sample.get("url"))
-            if handler(exn):
-                continue
-            else:
-                break
-
-
-decode_image_and_text_pairs = wds.filters.pipelinefilter(_decode_image_and_text_pairs)
-
-
-def get_image_caption_pairs_webdataset(
-    urls,
-    batch_size,
-    shuffle_initial_urls_list=False,
-    shuffle_before_split_by_node_buffer_size=100,
-    shuffle_before_split_by_worker_buffer_size=100,
-    shuffle_after_tarfile_to_samples_buffer_size=100,
-    shuffle_after_batching_buffer_size=1000,
-):
-    if shuffle_initial_urls_list:
-        random.shuffle(urls)
-
-    pipeline_list = [wds.SimpleShardList(urls)]
-
-    if shuffle_before_split_by_node_buffer_size is not None:
-        pipeline_list.append(wds.shuffle(shuffle_before_split_by_node_buffer_size))
-
-    pipeline_list.append(wds.split_by_node)
-
-    if shuffle_before_split_by_worker_buffer_size is not None:
-        pipeline_list.append(wds.shuffle(shuffle_before_split_by_worker_buffer_size))
-
-    pipeline_list.extend(
-        [
-            wds.split_by_worker,
-            tarfile_to_pair_samples(handler=log_and_continue),
-        ]
-    )
-
-    if shuffle_after_tarfile_to_samples_buffer_size is not None:
-        pipeline_list.append(wds.shuffle(shuffle_after_tarfile_to_samples_buffer_size))
-
-    pipeline_list.extend(
-        [
-            decode_image_and_text_pairs(),
-            wds.batched(batch_size, collation_fn=collate_dicts, partial=True),  # todo: check if partial is needed
-        ]
-    )
-
-    if shuffle_after_batching_buffer_size is not None:
-        pipeline_list.append(wds.shuffle(shuffle_after_batching_buffer_size))
-
-    dataset = wds.DataPipeline(pipeline_list)
-    return dataset
-
-
-def get_webdataset(
-    urls,
-    ds_type: DatasetTypes,
-    batch_size: int,
-    shuffle_initial_urls_list,
-    shuffle_before_split_by_node_buffer_size,
-    shuffle_before_split_by_worker_buffer_size,
-    shuffle_after_tarfile_to_samples_buffer_size,
-    shuffle_after_batching_buffer_size,
-):
-    if ds_type == DatasetTypes.WEB_DOCUMENTS:
-        return get_webdocuments_webdataset(
-            urls,
-            batch_size,
-            shuffle_initial_urls_list,
-            shuffle_before_split_by_node_buffer_size,
-            shuffle_before_split_by_worker_buffer_size,
-            shuffle_after_tarfile_to_samples_buffer_size,
-            shuffle_after_batching_buffer_size,
-        )
-    elif ds_type == DatasetTypes.IMAGE_CAPTION_PAIRS:
-        return get_image_caption_pairs_webdataset(
-            urls,
-            batch_size,
-            shuffle_initial_urls_list,
-            shuffle_before_split_by_node_buffer_size,
-            shuffle_before_split_by_worker_buffer_size,
-            shuffle_after_tarfile_to_samples_buffer_size,
-            shuffle_after_batching_buffer_size,
-        )
-    else:
-        raise ValueError(f"Unknown dataset type: {ds_type}")
-
-
-def check_webdataset_command(command):
-    if "s3:/" not in command:
-        return True
-
-    command = command.strip()
-    if not command.startswith("pipe:bash"):
-        return False
-
-    if not command.endswith(".tar"):
-        return False
-
-    if "get_file.sh" not in command:
-        return False
-
-    return True
diff --git a/m4/training/debug_utils.py b/m4/training/debug_utils.py
deleted file mode 100644
index e354f3d2dc6e5c1cc961913f6dd0f8c0a6996842..0000000000000000000000000000000000000000
--- a/m4/training/debug_utils.py
+++ /dev/null
@@ -1,34 +0,0 @@
-""" Trainer debug utils """
-
-
-def dump_optim_states(self):
-    """dumps basic information about the state of the optimizer"""
-
-    print("*** Optim States Dump:")
-    param_groups_cnt = len(self.vl_optim.param_groups)
-    # state dict has more than param_groups info, so extract only the param groups
-    param_group_states = list(self.vl_optim.state.values())[:param_groups_cnt]
-    for i, state in enumerate(param_group_states):
-        print(f"param group: {i}")
-        print(f"  step={state['step']}")
-        print(f"  exp_avg    all_zero={all(state['exp_avg'] == 0)}")
-        print(f"  exp_avg_sq all_zero={all(state['exp_avg_sq'] == 0)}")
-
-    # can also dump LR state if need be
-    # print(f"LR={self.vl_scheduler.get_last_lr()}")
-
-
-def validate_optim_states_are_reset(self):
-    """
-    for a new or fully reset optimizer we expect all zeros `exp_avg` and `exp_avg_sq` state tensors and step=1
-    """
-
-    param_groups_cnt = len(self.vl_optim.param_groups)
-    param_group_states = list(self.vl_optim.state.values())[:param_groups_cnt]
-    for i, state in enumerate(param_group_states):
-        if state["step"] != 1:
-            raise ValueError(f"optimizer reset didn't seem to work: state={i} step={state['step']}")
-        if not all(state["exp_avg"] == 0):
-            raise ValueError(f"optimizer reset didn't seem to work: state={i} step={state['exp_avg']}")
-        if not all(state["exp_avg_sq"] == 0):
-            raise ValueError(f"optimizer reset didn't seem to work: state={i} step={state['exp_avg_sq']}")
diff --git a/m4/training/packing.py b/m4/training/packing.py
deleted file mode 100644
index 8dfb5aa3b71c4196e25a37a9eb54273a4ac2a066..0000000000000000000000000000000000000000
--- a/m4/training/packing.py
+++ /dev/null
@@ -1,755 +0,0 @@
-import logging
-from bisect import bisect_left
-from collections import OrderedDict
-
-import cv2
-import numpy as np
-import torch
-
-from m4.training.utils import FAKE_TOKEN_AROUND_IMAGE_V2, IMAGE_TOKEN, _convert_to_rgb
-
-
-logger = logging.getLogger(__name__)
-
-
-# Hyper-parameters
-_IMAGE_BONUS_VALUE = 2  # The bonus value for tokens preceding the image token
-_MIN_LENGTH_DOCUMENTS_TO_PACK = (
-    5  # Minimum lengths of documents to pack together (lenghts is measures in number of tokens)
-)
-
-
-def incremental_to_binary_attention_mask(incremental_mask, num_classes=-1):
-    # This function converts: [-1, 0, 1] => [[0, 0], [1, 0], [0, 1]]
-
-    # If any of images index are more than num_classes, set them to -1.
-    # Words after the max number of images allowed have been seen don't attend on anything
-    if num_classes != -1:
-        incremental_mask[incremental_mask >= num_classes] = -1
-
-    negatives = incremental_mask == -1
-    incremental_mask[negatives] = 0
-    attn_mask = torch.nn.functional.one_hot(incremental_mask, num_classes=num_classes)
-    attn_mask[negatives, :] = 0
-    return attn_mask
-
-
-def image_attention_mask_for_packed_input_ids(input_ids, tokenizer):
-    image_attention_mask = torch.full_like(input_ids, fill_value=-1)
-    next_image_attention_mask = torch.full_like(input_ids, fill_value=-1)
-    image_token_id = tokenizer.convert_tokens_to_ids(IMAGE_TOKEN)
-    eod_token_id = tokenizer.eos_token_id
-    for batch_idx in range(input_ids.size(0)):
-        count = -1
-        seen_eod = False
-        for idx, token_id in enumerate(input_ids[batch_idx]):
-            if token_id == image_token_id:
-                count += 1
-                image_attention_mask[batch_idx][idx] = count
-                seen_eod = False
-            else:
-                image_attention_mask[batch_idx][idx] = count
-
-            if seen_eod:
-                image_attention_mask[batch_idx][idx] = -1
-
-            if token_id == eod_token_id:
-                seen_eod = True
-
-    for batch_idx in range(input_ids.size(0)):
-        count = -1
-        seen_eod = False
-        for idx in range(input_ids[batch_idx].size(0) - 1, -1, -1):
-            token_id = input_ids[batch_idx][idx]
-            if token_id == image_token_id:
-                count += 1
-                next_image_attention_mask[batch_idx][idx] = count
-                seen_eod = False
-            else:
-                next_image_attention_mask[batch_idx][idx] = count
-
-            if token_id == eod_token_id:
-                seen_eod = True
-
-            if seen_eod:
-                next_image_attention_mask[batch_idx][idx] = -1
-
-        non_negative_indices = next_image_attention_mask[batch_idx] != -1
-        next_image_attention_mask[batch_idx][non_negative_indices] -= count
-        next_image_attention_mask[batch_idx][non_negative_indices] *= -1
-
-    return image_attention_mask, next_image_attention_mask
-
-
-def laplacian_blur_detection(image, threshold=0.0):
-    # compute the Laplacian of the image and then return the focus
-    # measure, which is simply the variance of the Laplacian
-    if threshold == 0.0:
-        return False
-
-    image = np.array(image)
-
-    if len(image.shape) == 3 and image.shape[2] == 3:
-        gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
-        return cv2.Laplacian(gray, cv2.CV_64F).var() < threshold
-    else:
-        # Don't remove grayscale images
-        return False
-
-
-def fft_blur_detection(image, size=50, threshold=0.0):
-    if threshold == 0.0:
-        return False
-    (h, w) = image.shape
-    (cX, cY) = (int(w / 2.0), int(h / 2.0))
-    fft = np.fft.fft2(image)
-    fftShift = np.fft.fftshift(fft)
-    fftShift[cY - size : cY + size, cX - size : cX + size] = 0
-    fftShift = np.fft.ifftshift(fftShift)
-    recon = np.fft.ifft2(fftShift)
-    magnitude = 20 * np.log(np.abs(recon))
-    mean = np.mean(magnitude)
-    return mean < threshold
-
-
-def split_pack_and_pad(
-    sample,
-    tokenizer,
-    max_seq_len,
-    image_transform,
-    max_num_images,
-    max_num_samples_per_document=10,
-    prefix_seed=(0, 0),
-    is_blurred_fn=None,
-    blur_threshold=0.0,
-    add_begin_of_doc_token=False,
-    add_end_of_doc_token=True,
-    max_num_images_per_document=None,
-):
-    """
-    Return a batch of samples in the format expected by the model which
-    includes `input_ids`, `pixel_values`, `attention_mask`, `image_attention_mask`,
-    and `next_image_attention_mask`. The `input_ids` are sampled from the document to
-    ensure it has `max_seq_len` tokens otherwise, the shorter documents are packed together.
-    For each document, we sample a maximum of `max_num_samples_per_document` or `max_num_samples_for_curr_document`
-    (where the latter is proportional to the length of the document and inversely proportional to the length of subsequences)
-    `input_ids` with sequence length `max_seq_len` from the document. This means that
-    each sample sampled can have different start index. Based on the start index of sample that
-    has been sampled, we also sample a maximum of `max_num_images` images from the document.
-    If there are less than `max_num_images` images in the document, we pad the images with zeros.
-    The start indexes are skewed towards subsequences that contain images.
-
-    Args:
-        sample (Dict): A sample object containing the document with images and text.
-        tokenizer (PretrainedTokenizer): Text tokenizer to be used.
-        max_seq_len (int): Maximum sequence length of the returned text tokens.
-        image_transform (Callable): Transform to be applied on the images
-        max_num_images (int): Maximum number of images to be sampled per sample. If less, they are padded with zeros.
-        max_num_samples_per_document (int, optional): Maximum number of samples per document to be sampled. Defaults to 10.
-        prefix_seed: Prefix seed sequence for "reproducible randomness" in calls to `np.random.choice`
-
-    Returns:
-        _type_: _description_
-    """
-    text_batch = sample["texts"]
-
-    image_batch = sample.get("image_embeddings", None)
-    is_raw_images = False
-    if image_batch is None:
-        image_batch = sample.get("images", None)
-        is_raw_images = True
-    if image_batch is None:
-        raise ValueError("Either image_embeddings or images must be present in the sample")
-
-    image_token_id = tokenizer.convert_tokens_to_ids(IMAGE_TOKEN)
-    last_was_image = False
-
-    if is_blurred_fn is None:
-        is_blurred_fn = fft_blur_detection
-
-    all_images = []
-    all_texts = []
-    for raw_images, raw_texts in zip(image_batch, text_batch):
-        # Filter ones that don't have either one image and one text word
-        if not any(raw_images) or not any(raw_texts):
-            continue
-
-        if max_num_images_per_document:
-            num_images = sum([1 if image is not None else 0 for image in raw_images])
-            if num_images > max_num_images_per_document:
-                continue
-
-        any_blurred = False
-
-        if is_raw_images and blur_threshold > 0.0:
-            for image in raw_images:
-                if image is not None:
-                    image = _convert_to_rgb(image)
-                    any_blurred = any_blurred or is_blurred_fn(image, threshold=blur_threshold)
-                    if any_blurred:
-                        break
-
-        if any_blurred:
-            continue
-
-        inds_of_texts_to_split = [
-            i
-            for i, text in enumerate(raw_texts)
-            if text is not None and isinstance(text, str) and "END_OF_DOCUMENT_TOKEN_TO_BE_REPLACED" in text
-        ]
-        if inds_of_texts_to_split:
-            splitted_raw_images, splitted_raw_texts = [], []
-            previous_i = 0
-            for i in inds_of_texts_to_split:
-                splitting = raw_texts[i].split("END_OF_DOCUMENT_TOKEN_TO_BE_REPLACED")
-                part1, part2 = splitting[0], splitting[-1]
-
-                sub_doc_images = raw_images[previous_i:i] + [None]
-                sub_doc_texts = raw_texts[previous_i:i] + [part1.strip()]
-                if not any(sub_doc_images):  # This can happen if all images in raw_images[0:i] are all None
-                    continue
-
-                splitted_raw_images.append(sub_doc_images)
-                splitted_raw_texts.append(sub_doc_texts)
-
-                if part2.strip() == "":
-                    previous_i = i + 1
-                else:
-                    raw_texts[i] = part2.strip()
-                    previous_i = i
-
-            if previous_i < len(raw_images) and any(raw_images[previous_i:]):
-                splitted_raw_images.append(raw_images[previous_i:])
-                splitted_raw_texts.append(raw_texts[previous_i:])
-
-        else:
-            splitted_raw_images, splitted_raw_texts = [raw_images], [raw_texts]
-
-        # Sanity check
-        if [len(ims) for ims in splitted_raw_images] != [len(txts) for txts in splitted_raw_texts]:
-            raise ValueError(
-                "Number of images and texts don't match after splitting on `END_OF_DOCUMENT_TOKEN_TO_BE_REPLACED`."
-                " Something core went wrong during the splitting and needs to be fixed."
-            )
-
-        for s_r_ims, s_r_txts in zip(splitted_raw_images, splitted_raw_texts):
-            images, web_text = [], ""
-            for image, text in zip(s_r_ims, s_r_txts):
-                if text is None and image is None:
-                    continue
-
-                if image is not None:
-                    web_text += f"{FAKE_TOKEN_AROUND_IMAGE_V2}{IMAGE_TOKEN}"
-                    if is_raw_images:
-                        images.append(image_transform(image))
-                    else:
-                        images.append(torch.tensor(image))
-                    last_was_image = True
-                elif text is not None:
-                    if last_was_image:
-                        web_text += f"{FAKE_TOKEN_AROUND_IMAGE_V2}{text}"
-                        last_was_image = False
-                    else:
-                        web_text += f" {text}" if web_text != "" else text
-
-            if last_was_image:
-                web_text += f"{FAKE_TOKEN_AROUND_IMAGE_V2}"
-
-            web_text = web_text.strip(" ")
-
-            # This is mostly a sanity check. Cases like that should not happen at that point.
-            if web_text == "" or len(images) == 0:
-                continue
-
-            images = torch.stack(images)
-            all_images.append(images)
-
-            web_text_ids = tokenizer.encode(web_text, add_special_tokens=False)
-            if add_end_of_doc_token:
-                web_text_ids += [tokenizer.eos_token_id]
-
-            if add_begin_of_doc_token:
-                web_text_ids = [tokenizer.bos_token_id] + web_text_ids
-            all_texts.append(web_text_ids)
-
-    output_input_ids = []
-    output_images = []
-    output_attention_masks = []
-    output_num_images = []
-    output_num_text_tokens = []
-
-    input_ids_to_pack = []
-    images_to_pack = []
-    for images, text in zip(all_images, all_texts):
-        # We save all the documents which are shorter than the max_seq_len to pack them together.
-        if len(text) <= max_seq_len:
-            if len(text) < _MIN_LENGTH_DOCUMENTS_TO_PACK:  # Filter out extremely short sequences
-                continue
-            input_ids_to_pack.extend(text)
-            images_to_pack.extend(images)
-        else:
-            # Computing the bonus scores for tokens near images to skew the sampling towards them
-            # The main idea is to give a bonus to tokens that are closely before an image token, so that these tokens have more chance to be sampled.
-            # Bonuses are computed for each image, which means a given token can receive bonuses from multiple images if this token is closely preceding multiple images.
-            # We sum all the bonuses and L1 normalized along the seq_len axis to get a probability distribution.
-            # Each token start with a regular bonus of 1, which corresponds to the uniform distribution over the sequence when there are no bonuses added.
-
-            # Now the remaining question is which precedding tokens do we distribue bonuses to.
-            # We first observe that for the sampled sub-sequence to be considered valid (i.e. sub-sequence contains an image), the start index can only be among [image_idx - max_seq_len + 1, image_idx].
-            # For the sake of the explanation, let's split the [image_idx - max_seq_len + 1, image_idx] interval in 3 parts: left, middle and right (in increasing order).
-            # If we give bonuses to the tokens just before the image (right part), then we are favoring p_next=0 because only the tokens after the image have an image to attend to.
-            # In practice, images will tend to be at the beginning of the sampled sub-sequence.
-            # If we give bonuses very far before the image (left part), then we are favoring p_next=1 because only the tokens before the image gave an image to attend to.
-            # In practice, images will tend to be at the end of the sampled sub-sequence.
-            # To avoid choosing favoring p_next=0 or p_next=1, we can give bonuses to the tokens in the middle part.
-            # In practise, images will tend to be in the middle of the sampled sequence.
-
-            # Ultimately, we don't want to skew the distribution fed to model in that way (i.e. whether images are in the beginning, middle or end of the sampled sub-sequence),
-            # and have all these cases represented equally in the data. So the easiest is to distribute a bonus to all of the max_seq_len tokens preceding the image.
-            all_scores = np.array([1] * len(text))
-            for img_token_idx in np.where(np.array(text) == image_token_id)[0]:
-                all_scores[max(0, img_token_idx - max_seq_len) : img_token_idx + 1] += _IMAGE_BONUS_VALUE
-            # all_scores = np.clip(all_scores, a_min=1, a_max=3 * _IMAGE_BONUS_VALUE * max_num_images + 1) # We can optionally clip the bonuses to avoid having too high values (i.e. outliers documents)
-            all_scores = all_scores[:-_MIN_LENGTH_DOCUMENTS_TO_PACK]
-
-            # The number of samples is proportional to the length of the text and inversely proportional to the maximum sequence length
-            max_num_samples_for_curr_document = len(text) // max_seq_len
-            # Set "reproducible randomness" by creating an np.default_rng seeded by (main seed, epoch, rank_idx, worker_idx, mapped_batch_index, text len)
-            choices = np.random.default_rng(seed=list(prefix_seed) + [len(text)]).choice(
-                range(len(text) - _MIN_LENGTH_DOCUMENTS_TO_PACK),  # shorter sub-sequences are reserved for packing
-                min(
-                    len(text) - max_seq_len, 2 * max_num_samples_per_document
-                ),  # Sampling more than necessary and then breaking out of the for loop once we have enough samples
-                p=all_scores / np.linalg.norm(all_scores, ord=1),
-                replace=False,
-            )
-
-            nb_effective_sequences_out_of_sampling = 0
-            for start_index in choices:
-                image_start_index = text[:start_index].count(image_token_id)
-                text_sub_sequence = text[start_index : start_index + max_seq_len]
-                image_count = text_sub_sequence.count(image_token_id)
-                if image_count == 0:
-                    # Skip if there are no images in the sequence
-                    continue
-
-                if len(text_sub_sequence) < max_seq_len:
-                    # If the sub-sequence is shorter than max_seq_len, we reserve it for packing
-                    # It necessarily mean that the sub-sequence was sampled towards the end of the document,
-                    # which implies that we only need the `image_start_index` and not the `image_end_index`
-                    if text_sub_sequence.count(image_token_id) != len(images[image_start_index:]):
-                        # A safeguard for this
-                        logger.warning(
-                            "Skipping this sample because of mismatch in actual number of images and "
-                            "the '<image>' tokens in the text"
-                        )
-                        continue
-                    input_ids_to_pack.extend(text_sub_sequence)
-                    images_to_pack.extend(images[image_start_index:])
-                    continue
-
-                current_images = images[image_start_index : image_start_index + min(max_num_images, image_count)]
-                if len(current_images) != min(max_num_images, image_count):
-                    # A safeguard for something off about this document, maybe `<image>` tag that
-                    # by there from before or some issue in parsing the image?
-                    logger.warning(
-                        "Skipping this sample because of mismatch in actual number of images and "
-                        "the '<image>' tokens in the text"
-                    )
-                    break
-                padded_image_tensor = torch.zeros(max_num_images, *images.size()[1:])
-                padded_image_tensor[: min(max_num_images, image_count)] = current_images
-                output_images.append(padded_image_tensor)
-                output_num_images.append(min(max_num_images, image_count))
-
-                output_input_ids.append(torch.tensor(text_sub_sequence))
-                output_num_text_tokens.append(len(text_sub_sequence))
-
-                attention_mask = torch.ones((max_seq_len,), dtype=torch.long)
-                output_attention_masks.append(attention_mask)
-
-                nb_effective_sequences_out_of_sampling += 1
-                if nb_effective_sequences_out_of_sampling >= min(
-                    max_num_samples_for_curr_document, max_num_samples_per_document
-                ):
-                    # We got all the samples we need for this document, so breaking out
-                    break
-
-    # Pack the remaining sequences from `input_ids_to_pack` x `images_to_pack`
-    if input_ids_to_pack:
-        image_counter = 0
-        for i in range(0, len(input_ids_to_pack), max_seq_len):
-            current_input_ids = input_ids_to_pack[i : i + max_seq_len]
-            unpadded_seq_len = len(current_input_ids)
-            num_images = current_input_ids.count(image_token_id)
-            if num_images == 0:
-                continue
-            current_images = images_to_pack[image_counter : image_counter + num_images]
-            image_counter += num_images
-            if unpadded_seq_len < max_seq_len:
-                padded_input_ids = [tokenizer.pad_token_id] * max_seq_len
-                padded_input_ids[:unpadded_seq_len] = current_input_ids
-                current_input_ids = padded_input_ids
-            elif unpadded_seq_len > max_seq_len:
-                # This case has no purpose other than safeguard
-                continue
-            try:
-                current_images = torch.stack(current_images)[:max_num_images]
-            except Exception:
-                continue
-            padded_image_tensor = torch.zeros(max_num_images, *current_images.size()[1:])
-            padded_image_tensor[: current_images.size(0)] = current_images
-            attention_mask = torch.zeros((max_seq_len,), dtype=torch.long)
-            attention_mask[:unpadded_seq_len] = 1
-
-            output_images.append(padded_image_tensor)
-            output_input_ids.append(torch.tensor(current_input_ids))
-            output_num_text_tokens.append(unpadded_seq_len)
-            output_num_images.append(min(max_num_images, num_images))
-
-            output_attention_masks.append(attention_mask)
-
-    if len(output_images) == 0 or len(output_input_ids) == 0:
-        result = {
-            "input_ids": torch.tensor([], dtype=torch.long),
-            "attention_mask": torch.tensor([], dtype=torch.bool),
-            "image_attention_mask": torch.tensor([], dtype=torch.bool),
-            "next_image_attention_mask": torch.tensor([], dtype=torch.bool),
-            "num_images": torch.tensor([], dtype=torch.long),
-            "num_text_tokens": torch.tensor([], dtype=torch.long),
-        }
-        if is_raw_images:
-            result["pixel_values"] = torch.tensor([], dtype=torch.float32)
-        else:
-            result["image_embeddings"] = torch.tensor([], dtype=torch.float32)
-        return result
-
-    output_input_ids = torch.stack(output_input_ids)
-    output_images = torch.stack(output_images)
-    output_attention_masks = torch.stack(output_attention_masks)
-
-    image_attention_mask, next_image_attention_mask = image_attention_mask_for_packed_input_ids(
-        output_input_ids, tokenizer
-    )
-    image_attention_mask = incremental_to_binary_attention_mask(image_attention_mask, num_classes=max_num_images)
-    next_image_attention_mask = incremental_to_binary_attention_mask(
-        next_image_attention_mask, num_classes=max_num_images
-    )
-
-    result = {
-        "input_ids": output_input_ids,
-        "attention_mask": output_attention_masks,
-        "image_attention_mask": image_attention_mask,
-        "next_image_attention_mask": next_image_attention_mask,
-        "num_images": torch.tensor(output_num_images),
-        "num_text_tokens": torch.tensor(output_num_text_tokens),
-    }
-    if is_raw_images:
-        result["pixel_values"] = output_images
-    else:
-        result["image_embeddings"] = output_images
-    return result
-
-
-def split_and_pad_pmd(
-    sample,
-    tokenizer,
-    max_seq_len,
-    image_transform,
-    max_num_images,
-    prefix_seed=(0, 0),
-    is_blurred_fn=None,
-    blur_threshold=0.0,
-    prob_image_at_end=0.5,  # If 1, the <image> token is always added at the end of the text
-    # If set to -1, all padding will be tolerated. If set to 0, no padding will be tolerated.
-    padding_tolerance=-1,
-    add_begin_of_doc_token=False,
-    add_end_of_doc_token=True,
-):
-    if is_blurred_fn is None:
-        is_blurred_fn = fft_blur_detection
-
-    text_batch = sample["text"]
-    image_batch = sample.get("image_embedding", None)
-    is_raw_images = False
-    if image_batch is None:
-        image_batch = sample.get("image", None)
-        is_raw_images = True
-
-    filtered_image_batch = []
-    filtered_input_ids = []
-
-    # Define whether for the current PMD batch whether the images will be at the start or at the end.
-    rng = np.random.default_rng(seed=list(prefix_seed))
-    is_image_at_end = False
-
-    # rng.random is between 0 and 1, so if prob_image_at_end is 1, random value will
-    # always be less than `prob_image_at_end` and `is_image_at_end` will always be True.
-    # This means that images will always be at the end of the text.
-    if rng.random() < prob_image_at_end:
-        is_image_at_end = True
-
-    for image, text in zip(image_batch, text_batch):
-        if text is None or image is None:
-            continue
-
-        if is_raw_images and is_blurred_fn(image, threshold=blur_threshold):
-            continue
-
-        sample_text = f"{FAKE_TOKEN_AROUND_IMAGE_V2}{IMAGE_TOKEN}{FAKE_TOKEN_AROUND_IMAGE_V2}"
-
-        # Remove trailing and leading whitespaces, including newlines and tabs
-        text = text.strip()
-
-        if is_image_at_end:
-            sample_text = f"{text}{sample_text}"
-        else:
-            sample_text = f"{sample_text}{text}"
-
-        sample_input_ids = tokenizer.encode(sample_text, add_special_tokens=False)
-        if add_end_of_doc_token:
-            sample_input_ids += [tokenizer.eos_token_id]
-
-        if add_begin_of_doc_token:
-            sample_input_ids = [tokenizer.bos_token_id] + sample_input_ids
-
-        filtered_image_batch.append(image)
-        filtered_input_ids.append(sample_input_ids)
-
-    # sort by length of text and save same length elements in a mapping so we
-    # can retrieve candidates later.
-    filtered_image_batch, filtered_input_ids = zip(
-        *sorted(zip(filtered_image_batch, filtered_input_ids), key=lambda x: len(x[1]))
-    )
-    mapping_by_len = OrderedDict()
-    for i, sample_input_ids in enumerate(filtered_input_ids):
-        if len(sample_input_ids) not in mapping_by_len:
-            mapping_by_len[len(sample_input_ids)] = []
-        mapping_by_len[len(sample_input_ids)].append((filtered_image_batch[i], sample_input_ids))
-
-    all_images = []
-    all_texts = []
-    all_attention_masks = []
-    all_num_images = []
-    all_num_text_tokens = []
-    current_text = []
-    current_images = []
-
-    while True:
-        current_lens = list(mapping_by_len.keys())
-        if len(current_text) > 0:
-            # Now we try to do a binary search to find the biggest sequence that
-            # we can fit into the current sequence.
-            # This will eventually use up bigger sequences faster which is good
-            # and leave smaller sequences to pack with each other later.
-            diff = max_seq_len - len(current_text)
-            if len(current_lens) == 0:
-                possible_index = -1
-            else:
-                possible_index = bisect_left(current_lens, diff)
-                if possible_index == len(current_lens) or current_lens[possible_index] != diff:
-                    possible_index -= 1
-
-            if possible_index >= 0:
-                best_possible_length = current_lens[possible_index]
-                image, sample_input_ids = mapping_by_len[best_possible_length].pop(0)
-
-                # If we have used up all the samples of a certain length, remove
-                # that length from the mapping.
-                if len(mapping_by_len[best_possible_length]) == 0:
-                    del mapping_by_len[best_possible_length]
-                current_text.extend(sample_input_ids)
-                if is_raw_images:
-                    current_images.append(image_transform(image))
-                else:
-                    current_images.append(torch.tensor(image))
-            elif diff > padding_tolerance and padding_tolerance != -1:
-                # If we are here, it means that we still have padding left
-                # and we have exhausted our current unique options that will allow us to
-                # fill this sequence completely.
-                # So, we will try to fill the sequence with whatever we get from the unchanged
-                # copy of all sequences.
-                while diff > padding_tolerance:
-                    # Find a random sequence to fit
-                    # Why we need to add more stuff to prefix seed?
-                    # prefix_seed will be same in the same batch which means that it might sample
-                    # same thing again and again if there are multiple cases of padding in the
-                    # same batch which means we need to make this part as random as possible.
-                    rng = np.random.default_rng(
-                        prefix_seed
-                        + (
-                            diff,
-                            len(current_text),
-                            len(all_texts),
-                            all_num_images,
-                        )
-                    )
-                    choice = rng.choice(range(len(filtered_input_ids)))
-                    image, sample_input_ids = filtered_image_batch[choice], filtered_input_ids[choice]
-                    current_text.extend(sample_input_ids)
-                    if is_raw_images:
-                        current_images.append(image_transform(image))
-                    else:
-                        current_images.append(torch.tensor(image))
-                    diff = max_seq_len - len(current_text)
-                # In the next top-level while loop iteration, this should go into the else
-                # clause which should also handle the sequences longer than max_seq_len
-            else:
-                current_images = torch.stack(current_images)
-                padded_image_tensor = torch.zeros(max_num_images, *current_images.size()[1:])
-                padded_image_tensor[: current_images.size(0)] = current_images[
-                    : min(max_num_images, current_images.size(0))
-                ]
-                all_num_images.append(min(max_num_images, current_images.size(0)))
-                all_images.append(padded_image_tensor)
-
-                padded_input_ids = torch.full((max_seq_len,), tokenizer.pad_token_id)
-                current_max_len = min(max_seq_len, len(current_text))
-                padded_input_ids[:current_max_len] = torch.tensor(current_text)[:current_max_len]
-                all_num_text_tokens.append(current_max_len)
-                all_texts.append(padded_input_ids)
-
-                attention_mask = torch.zeros((max_seq_len,), dtype=torch.long)
-                attention_mask[: len(current_text)] = 1
-                all_attention_masks.append(attention_mask)
-
-                # Make sure to reset the current text and images.
-                current_images = []
-                current_text = []
-                if len(current_lens) == 0:
-                    break
-        else:
-            # A case where we might not have any samples left over after the initial filtering step.
-            if len(current_lens) == 0:
-                break
-            image, sample_input_ids = mapping_by_len[current_lens[-1]].pop(0)
-            if len(mapping_by_len[current_lens[-1]]) == 0:
-                del mapping_by_len[current_lens[-1]]
-            current_text = sample_input_ids[:max_seq_len]
-            if is_raw_images:
-                current_images = [image_transform(image)]
-            else:
-                current_images = [torch.tensor(image)]
-
-    if len(all_images) == 0 or len(all_texts) == 0:
-        result = {
-            "input_ids": torch.tensor([], dtype=torch.long),
-            "attention_mask": torch.tensor([], dtype=torch.bool),
-            "image_attention_mask": torch.tensor([], dtype=torch.bool),
-            "num_images": torch.tensor([], dtype=torch.long),
-            "num_text_tokens": torch.tensor([], dtype=torch.long),
-        }
-        if is_raw_images:
-            result["pixel_values"] = torch.tensor([], dtype=torch.float32)
-        else:
-            result["image_embeddings"] = torch.tensor([], dtype=torch.float32)
-        return result
-
-    all_texts = torch.stack(all_texts)
-    all_images = torch.stack(all_images)
-    all_attention_masks = torch.stack(all_attention_masks)
-
-    image_attention_mask, next_image_attention_mask = image_attention_mask_for_packed_input_ids(all_texts, tokenizer)
-    image_attention_mask = incremental_to_binary_attention_mask(image_attention_mask, num_classes=max_num_images)
-    next_image_attention_mask = incremental_to_binary_attention_mask(
-        next_image_attention_mask, num_classes=max_num_images
-    )
-
-    output = {
-        "input_ids": all_texts,
-        "attention_mask": all_attention_masks,
-        "image_attention_mask": image_attention_mask,
-        "num_images": torch.tensor(all_num_images),
-        "num_text_tokens": torch.tensor(all_num_text_tokens),
-    }
-    if is_raw_images:
-        output["pixel_values"] = all_images
-    else:
-        output["image_embeddings"] = all_images
-
-    if is_image_at_end:
-        # Set the correct attention mask based on whether the image is at the start
-        # or not. When it is at the end, we need next image attention mask.
-        output["image_attention_mask"] = next_image_attention_mask
-
-    return output
-
-
-# Copied from https://github.com/google-research/text-to-text-transfer-transformer/blob/main/t5/data/preprocessors.py
-def random_spans_helper(
-    inputs_length,
-    noise_density,
-    mean_noise_span_length,
-    extra_tokens_per_span_inputs,
-    extra_tokens_per_span_targets,
-    verbose=False,
-):
-    """Training parameters to avoid padding with random_spans_noise_mask.
-
-    When training a model with random_spans_noise_mask, we would like to set the
-    other training hyperparmeters in a way that avoids padding.  This function
-    helps us compute these hyperparameters.
-
-    We assume that each noise span in the input is replaced by
-    extra_tokens_per_span_inputs sentinel tokens, and each non-noise span in the
-    targets is replaced by extra_tokens_per_span_targets sentinel tokens.
-
-    This function tells us the required number of tokens in the raw example (for
-    split_tokens()) as well as the length of the encoded targets.
-
-    Note that this function assumes the inputs and targets will have EOS appended
-    and includes that in the reported length.
-
-    Args:
-      inputs_length: an integer - desired length of the tokenized inputs sequence
-      noise_density: a float
-      mean_noise_span_length: a float
-      extra_tokens_per_span_inputs: an integer
-      extra_tokens_per_span_targets: an integer
-      verbose: a bool indicating whether to log sequence lengths
-    Returns:
-      tokens_length: length of original text in tokens
-      targets_length: an integer - length in tokens of encoded targets sequence
-    """
-
-    if extra_tokens_per_span_inputs != 1:
-        raise NotImplementedError(
-            "extra_tokens_per_span_inputs != 1 not supported yet. You need to check"
-            " `get_model_tflops_per_batch_per_gpu` of `VT5ForConditionalGeneration` if you change it."
-        )
-    if extra_tokens_per_span_targets != 1:
-        raise NotImplementedError(
-            "extra_tokens_per_span_targets != 1 not supported yet. You need to check"
-            " `get_model_tflops_per_batch_per_gpu` of `VT5ForConditionalGeneration` if you change it."
-        )
-
-    def _tokens_length_to_inputs_length_targets_length(tokens_length):
-        num_noise_tokens = int(round(tokens_length * noise_density))
-        num_nonnoise_tokens = tokens_length - num_noise_tokens
-        num_noise_spans = int(round(num_noise_tokens / mean_noise_span_length))
-        # inputs contain all nonnoise tokens, sentinels for all noise spans
-        # and one EOS token.
-        return (
-            num_nonnoise_tokens + num_noise_spans * extra_tokens_per_span_inputs + 1,
-            num_noise_tokens + num_noise_spans * extra_tokens_per_span_targets + 1,
-        )
-
-    tokens_length = inputs_length - 1
-    while _tokens_length_to_inputs_length_targets_length(tokens_length + 1)[0] <= inputs_length:
-        tokens_length += 1
-    inputs_length, targets_length = _tokens_length_to_inputs_length_targets_length(tokens_length)
-    # minor hack to get the targets length to be equal to inputs length
-    # which is more likely to have been set to a nice round number.
-    if noise_density == 0.5 and targets_length > inputs_length:
-        tokens_length -= 1
-        targets_length -= 1
-    if verbose:
-        logging.info(
-            "tokens_length=%s inputs_length=%s targets_length=%s noise_density=%s mean_noise_span_length=%s ",
-            tokens_length,
-            inputs_length,
-            targets_length,
-            noise_density,
-            mean_noise_span_length,
-        )
-    return tokens_length, targets_length
diff --git a/m4/training/setup_language_model.py b/m4/training/setup_language_model.py
deleted file mode 100644
index b556b8e96b5b1438df7fede8157772016719e0cd..0000000000000000000000000000000000000000
--- a/m4/training/setup_language_model.py
+++ /dev/null
@@ -1,38 +0,0 @@
-import re
-
-from m4.models.vbloom.configuration_vbloom import VBloomConfig
-from m4.models.vbloom.modeling_vbloom import VBloomForCausalLM
-from m4.models.vgpt2.configuration_vgpt2 import VGPT2Config
-from m4.models.vgpt2.modeling_vgpt2 import VGPT2LMHeadModel
-from m4.models.vgpt_neo.configuration_vgpt_neo import VGPTNeoConfig
-from m4.models.vgpt_neo.modeling_vgpt_neo import VGPTNeoForCausalLM
-from m4.models.vllama.configuration_vllama import VLlamaConfig
-from m4.models.vllama.modeling_vllama import VLlamaForCausalLM
-from m4.models.vopt.configuration_vopt import VOPTConfig
-from m4.models.vopt.modeling_vopt import VOPTForCausalLM
-from m4.models.vt5.configuration_vt5 import VT5Config
-from m4.models.vt5.modeling_vt5 import VT5ForConditionalGeneration
-
-
-model_name2classes = {
-    r"bloom|bigscience-small-testing": [VBloomConfig, VBloomForCausalLM],
-    r"gpt-neo|gptneo": [VGPTNeoConfig, VGPTNeoForCausalLM],
-    r"gpt2": [VGPT2Config, VGPT2LMHeadModel],
-    r"opt": [VOPTConfig, VOPTForCausalLM],
-    r"t5": [VT5Config, VT5ForConditionalGeneration],
-    r"llama": [VLlamaConfig, VLlamaForCausalLM],
-}
-
-
-def model_name_to_classes(model_name_or_path):
-    """returns config_class, model_class for a given model name or path"""
-
-    model_name_lowcase = model_name_or_path.lower()
-    for rx, classes in model_name2classes.items():
-        if re.search(rx, model_name_lowcase):
-            return classes
-    else:
-        raise ValueError(
-            f"Unknown type of backbone LM. Got {model_name_or_path}, supported regexes:"
-            f" {list(model_name2classes.keys())}."
-        )
diff --git a/m4/training/setup_vision_model.py b/m4/training/setup_vision_model.py
deleted file mode 100644
index ec2aa949780d9f05f7ce3859c5dc67e530f4928f..0000000000000000000000000000000000000000
--- a/m4/training/setup_vision_model.py
+++ /dev/null
@@ -1,33 +0,0 @@
-import re
-
-from transformers import AutoModel
-
-
-# map to check the supported cv archs and also how to extract the model - in some arch, we want to
-# go through a specific prefix to get to the model as in `model.vision_model` for clip
-vision_model_name2model = {
-    r"clip": lambda model: model.vision_model,
-    r"vit": lambda model: model,
-}
-
-
-def vision_model_name_to_model(model_name_or_path, model):
-    """returns the model if supported, asserts otherwise"""
-
-    model_name_lowcase = model_name_or_path.lower()
-    for rx, lookup in vision_model_name2model.items():
-        if re.search(rx, model_name_lowcase):
-            return lookup(model)
-    else:
-        raise ValueError(
-            f"Unknown type of backbone vision model. Got {model_name_or_path}, supported regexes:"
-            f" {list(vision_model_name2model.keys())}."
-        )
-
-
-def get_vision_model(config):
-    vision_model_name = config.vision_model_name
-    vision_model_params = eval(config.vision_model_params)
-
-    model = AutoModel.from_pretrained(vision_model_name, **vision_model_params)
-    return vision_model_name_to_model(vision_model_name, model)
diff --git a/m4/training/types.py b/m4/training/types.py
deleted file mode 100644
index 391f6a72cf14a15d77c04f29eb74a8e587e61dd8..0000000000000000000000000000000000000000
--- a/m4/training/types.py
+++ /dev/null
@@ -1,13 +0,0 @@
-from enum import Enum
-
-
-class DatasetNames(Enum):
-    PMD = "pmd"
-    LAION = "laion"
-    CM4 = "cm4"
-    WIKI = "wiki"
-
-
-class DatasetTypes(Enum):
-    WEB_DOCUMENTS = "wd"
-    IMAGE_CAPTION_PAIRS = "icp"
diff --git a/m4/training/utils.py b/m4/training/utils.py
deleted file mode 100644
index 16e2705b9f9c131c4905c331be3ecd93f2bc8774..0000000000000000000000000000000000000000
--- a/m4/training/utils.py
+++ /dev/null
@@ -1,539 +0,0 @@
-import dataclasses
-import gc
-import json
-import logging
-from contextlib import contextmanager
-from enum import Enum
-
-import accelerate
-import psutil
-import pynvml
-import torch
-import torch.nn as nn
-import torchvision.transforms as transforms
-from accelerate.state import AcceleratorState
-from PIL import Image
-from transformers import (  # AddedToken is needed for the eval of the tokenizer params # noqa: F401
-    AddedToken,
-    AutoTokenizer,
-)
-
-
-IMAGE_TOKEN = "<image>"
-FAKE_TOKEN_AROUND_IMAGE_V2 = "<fake_token_around_image>"
-FAKE_TOKEN_AROUND_IMAGE_V1 = "\n\n"
-# Originally taken from the values used in OpenCLIP
-IMAGE_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073)
-IMAGE_DATASET_STD = (0.26862954, 0.26130258, 0.27577711)
-logger = logging.getLogger(__name__)
-
-
-class LoggingTypes(Enum):
-    """Types of logging to use for the gradient and parameter statistics"""
-
-    JSONL = "jsonl"
-    WANDB = "wandb"
-    PRINT = "print"
-
-
-class JSONEncoderForDataclasses(json.JSONEncoder):
-    """
-    Use to serialize dataclass object, like so:
-    json.dump(data, fp, indent=2, cls=JSONEncoderForDataclasses)
-    """
-
-    def default(self, obj):
-        if dataclasses.is_dataclass(obj):
-            return dataclasses.asdict(obj)
-        return super().default(obj)
-
-
-def freeze_model(model, module_exceptions=[]):
-    mapping = {
-        "LayerNorm": nn.LayerNorm,
-        "Linear": nn.Linear,
-        "Embedding": nn.Embedding,
-    }
-    module_exceptions_mapped = [mapping[m] for m in module_exceptions]
-    for module in model.modules():
-        if module_exceptions and any([isinstance(module, t) for t in module_exceptions_mapped]):
-            module.requires_grad_(True)  # Explicitly setting it to true to avoid any mistakes
-        else:
-            module.requires_grad_(False)
-    return model
-
-
-def _convert_to_rgb(image):
-    # `image.convert("RGB")` would only work for .jpg images, as it creates
-    # a wrong background for transparent images. The call to `alpha_composite`
-    # handles this case
-    if image.mode == "RGB":
-        return image
-    image_rgba = image.convert("RGBA")
-    background = Image.new("RGBA", image_rgba.size, (255, 255, 255))
-    alpha_composite = Image.alpha_composite(background, image_rgba)
-    alpha_composite = alpha_composite.convert("RGB")
-    return alpha_composite
-
-
-# TODO(aps): Take parameters from config
-def build_image_transform(image_size=224, eval=False):
-    return transforms.Compose(
-        [
-            _convert_to_rgb,
-            (
-                transforms.Resize((image_size, image_size), interpolation=transforms.InterpolationMode.BICUBIC)
-                if eval
-                else transforms.RandomResizedCrop(
-                    (image_size, image_size), scale=(0.9, 1.0), interpolation=transforms.InterpolationMode.BICUBIC
-                )
-            ),
-            transforms.ToTensor(),
-            transforms.Normalize(mean=IMAGE_DATASET_MEAN, std=IMAGE_DATASET_STD),
-        ]
-    )
-
-
-def get_tokenizer(
-    tokenizer_name: str,
-    tokenizer_add_tokens,
-    tokenizer_add_special_tokens,
-    tokenizer_params,
-    additional_vocab_size,
-    model_vocab_size=None,
-):
-    """
-    We artificially separate `tokenizer_add_tokens` and `tokenizer_add_special_tokens` is a dictionary whose keys only takes into account special tokens (eos, pad, cls, etc.).
-    On the contrary, `tokenizer_add_tokens` is a list of string of `AddedToken`.
-    In practise, we use `tokenizer.add_special_tokens` to add all of these new special tokens or update the existing ones.
-
-    NB: we constraint to tokenizer to be a fast tokenizer because with the slow tokenizer, we can't set the arguments of the added tokens (cf `.add_tokens`) and by default, the separators are stripped.
-    """
-    tokenizer_params = eval(tokenizer_params)
-    assert isinstance(tokenizer_params, dict)
-
-    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, **tokenizer_params)
-
-    if model_vocab_size is not None:
-        if model_vocab_size > len(tokenizer):
-            logger.warning(
-                f"The model vocabulary size ({model_vocab_size}) is larger than the tokenizer vocabulary size "
-                f"({len(tokenizer)}). Updating the tokenizer to match."
-            )
-            if "additional_special_tokens" in tokenizer_params:
-                raise ValueError(
-                    "You can't use `additional_special_tokens` in `tokenizer_params` with a model vocab "
-                    "size > tokenizer vocab size. We need to adjust tokenizer before adding special "
-                    "tokens. Please use `tokenizer_add_tokens` instead."
-                )
-            # We need to pad the tokenizer vocab with fake tokens
-            tokenizer.add_tokens(["<fake_token_{}>".format(i) for i in range(model_vocab_size - len(tokenizer))])
-
-    assert str(eval(tokenizer_add_tokens)[-1]) == IMAGE_TOKEN
-    assert str(eval(tokenizer_add_tokens)[-2]) == FAKE_TOKEN_AROUND_IMAGE_V2
-    # This check ensures that the image token and the fake token around it will be in the `DecoupledEmbedding.additional_weight`.
-    existing_special_tokens = (
-        [*tokenizer.special_tokens_map_extended["additional_special_tokens"]]
-        if "additional_special_tokens" in tokenizer.special_tokens_map_extended
-        else []
-    )
-    add_special_tokens_dict = {"additional_special_tokens": existing_special_tokens + eval(tokenizer_add_tokens)}
-    if tokenizer_add_special_tokens is not None:
-        add_special_tokens_dict.update(eval(tokenizer_add_special_tokens))
-
-    tokenizer.add_special_tokens(add_special_tokens_dict)
-
-    assert IMAGE_TOKEN in tokenizer.convert_ids_to_tokens(
-        [idx for idx in range(len(tokenizer) - additional_vocab_size, len(tokenizer))]
-    )
-    assert FAKE_TOKEN_AROUND_IMAGE_V2 in tokenizer.convert_ids_to_tokens(
-        [idx for idx in range(len(tokenizer) - additional_vocab_size, len(tokenizer))]
-    )
-    # This verifies that `<image>` was correctly added to the tokenizer vocabulary
-    # XXX: opt-1.3b fails here
-    # assert tokenizer.is_fast == tokenizer_params.get("use_fast", True)
-
-    return tokenizer
-
-
-def pynmvl_handle(accelerator):
-    if not torch.cuda.is_available():
-        return None
-
-    pynvml.nvmlInit()
-    return pynvml.nvmlDeviceGetHandleByIndex(accelerator.local_process_index)
-
-
-def pynvml_get_total_energy_in_joules(handle):
-    if not torch.cuda.is_available():
-        return 0
-    return pynvml.nvmlDeviceGetTotalEnergyConsumption(handle) / 1000
-
-
-def compute_tflops_per_batch_per_gpu(
-    num_layers,
-    batch_size,
-    q_seq_len,
-    k_seq_len,
-    hidden_size,
-    kv_in_dim,
-    ff_exp_factor=None,
-    grad_acc_size=1,
-    swiglu=False,
-    vocab_size=None,
-    count_backward=False,
-    use_grad_checkpointing=False,
-):
-    multiply_add_factor = torch.tensor(2)
-    query_transformation = multiply_add_factor * batch_size * q_seq_len * hidden_size**2
-    # k_seq_len == v_seq_len
-    key_value_transformation = multiply_add_factor * batch_size * k_seq_len * (2 * hidden_size * kv_in_dim)
-    attention_matrix_computation = multiply_add_factor * batch_size * q_seq_len * k_seq_len * hidden_size
-    attention_softmax = multiply_add_factor * q_seq_len * k_seq_len
-    att_over_values_computation = multiply_add_factor * batch_size * q_seq_len * k_seq_len * hidden_size
-    post_attention_linear_proj = multiply_add_factor * batch_size * q_seq_len * hidden_size**2
-
-    # There are usually 2 expansion_linear_layers because first one expands, and second one retracts back to hidden_size
-    # When using a classic decoder, some blocks don't have those feed-forward layers
-    # Swiglu duplicates the first linear layer, so we have to account for 3 of them instead of 2
-    if ff_exp_factor and swiglu:
-        expansion_linear_layers = 3 * (
-            multiply_add_factor * batch_size * q_seq_len * (hidden_size * ff_exp_factor) * hidden_size
-        )
-    elif ff_exp_factor:
-        expansion_linear_layers = 2 * (
-            multiply_add_factor * batch_size * q_seq_len * (hidden_size * ff_exp_factor) * hidden_size
-        )
-    else:
-        expansion_linear_layers = torch.tensor(0)
-
-    transformer_block_flops = (
-        query_transformation
-        + key_value_transformation
-        + attention_matrix_computation
-        + attention_softmax
-        + att_over_values_computation
-        + post_attention_linear_proj
-        + expansion_linear_layers
-    )
-
-    # This computation should only be added if the model has a language head
-    if vocab_size:
-        language_head_computation = multiply_add_factor * batch_size * q_seq_len * hidden_size * vocab_size
-    else:
-        language_head_computation = torch.tensor(0)
-
-    forward_fact = 1
-    backward_factor = 2 if count_backward else 0
-    grad_checkpointing_factor = 1 if use_grad_checkpointing else 0
-    model_flops = (forward_fact + backward_factor + grad_checkpointing_factor) * (
-        num_layers * transformer_block_flops + language_head_computation
-    )
-    model_tflops = model_flops / (10**12)
-
-    return model_tflops
-
-
-def compute_perceiver_tflops_per_batch_per_gpu(
-    num_layers,
-    batch_size,
-    q_seq_len,
-    vision_embed_seq_len,
-    q_k_v_input_dim,
-    attention_hidden_size,
-    ff_exp_factor=None,
-    count_backward=False,
-    use_grad_checkpointing=False,
-):
-    multiply_add_factor = torch.tensor(2)
-    query_transformation = multiply_add_factor * batch_size * q_seq_len * q_k_v_input_dim * attention_hidden_size
-    # k_seq_len == v_seq_len
-    key_value_transformation = (
-        multiply_add_factor * batch_size * vision_embed_seq_len * (2 * attention_hidden_size * q_k_v_input_dim)
-    )
-
-    k_seq_len = vision_embed_seq_len + q_seq_len
-    attention_matrix_computation = multiply_add_factor * batch_size * q_seq_len * k_seq_len * attention_hidden_size
-    attention_softmax = multiply_add_factor * q_seq_len * k_seq_len
-    att_over_values_computation = multiply_add_factor * batch_size * q_seq_len * k_seq_len * attention_hidden_size
-    post_attention_linear_proj = multiply_add_factor * batch_size * q_seq_len * attention_hidden_size * q_k_v_input_dim
-
-    # There are usually 2 expansion_linear_layers because first one expands, and second one retracts back to hidden_size
-    # When using a classic decoder, some blocks don't have those feed-forward layers
-    if ff_exp_factor:
-        expansion_linear_layers = 2 * (
-            multiply_add_factor * batch_size * q_seq_len * (q_k_v_input_dim * ff_exp_factor) * q_k_v_input_dim
-        )
-    else:
-        expansion_linear_layers = torch.tensor(0)
-
-    transformer_block_flops = (
-        query_transformation
-        + key_value_transformation
-        + attention_matrix_computation
-        + attention_softmax
-        + att_over_values_computation
-        + post_attention_linear_proj
-        + expansion_linear_layers
-    )
-
-    forward_fact = 1
-    backward_factor = 2 if count_backward else 0
-    grad_checkpointing_factor = 1 if use_grad_checkpointing else 0
-    model_flops = (forward_fact + backward_factor + grad_checkpointing_factor) * (num_layers * transformer_block_flops)
-    model_tflops = model_flops / (10**12)
-
-    return model_tflops
-
-
-def mem_usage_formatted(logging_type=LoggingTypes.PRINT):
-    # adapted from deepspeed's see_memory_usage
-
-    torch.cuda.empty_cache()
-
-    # python doesn't do real-time garbage collection so do it explicitly to get the correct usage reports
-    gc.collect()
-    vm_stats = psutil.virtual_memory()
-
-    mem = {
-        "gpu mem alloc": f"{torch.cuda.memory_allocated()/2**30:0.2f}GB",
-        "max alloc": f"{torch.cuda.max_memory_allocated()/2**30:0.2f}GB",
-        "reserv": f"{torch.cuda.memory_reserved()/2**30:0.2f}GB",
-        "max reserv": f"{torch.cuda.max_memory_reserved()/2**30:0.2f}GB",
-        "cpu vm used": f"{(vm_stats.total-vm_stats.available)/2**30:0.2f}GB {vm_stats.percent}%",
-    }
-
-    if logging_type == LoggingTypes.PRINT:
-        mem = " | ".join([f"{k}: {v}" for k, v in mem.items()]) + " | "
-
-    # get the peak memory to report correct data, so reset the max_memory_allocated counter for the next call
-    torch.cuda.reset_peak_memory_stats()
-
-    return mem
-
-
-def is_deepspeed_used():
-    deepspeed_plugin = get_deepspeed_plugin()
-    return deepspeed_plugin is not None
-
-
-def get_deepspeed_stage():
-    deepspeed_plugin = get_deepspeed_plugin()
-    if deepspeed_plugin is None:
-        return 0
-    ds_config = deepspeed_plugin.deepspeed_config
-    stage = ds_config.get("zero_optimization", {}).get("stage", 0)
-    # from accelerate>=0.17.1 can do instead:
-    # stage = deepspeed_plugin.zero_stage
-    return stage
-
-
-def is_deepspeed_zero3_used():
-    return get_deepspeed_stage() == 3
-
-
-def accelerate_torch_dtype():
-    """
-    derive and return `torch_dtype` to be used in `from_pretrained` from either Deepspeed config or if
-    Deepspeed isn't used than accelerator state
-    """
-    if not is_accelerate_initialized():
-        return None
-
-    accelerator_state = AcceleratorState()
-
-    if is_deepspeed_used():
-        deepspeed_plugin = accelerator_state.deepspeed_plugin
-        ds_config = deepspeed_plugin.deepspeed_config
-        if ds_config.get("fp16", {}).get("enabled", False):
-            torch_dtype = torch.float16
-        elif ds_config.get("bf16", {}).get("enabled", False):
-            torch_dtype = torch.bfloat16
-        else:
-            torch_dtype = None
-    else:  # no Deepspeed
-        if accelerator_state.mixed_precision == "fp16":
-            torch_dtype = torch.float16
-        elif accelerator_state.mixed_precision == "bf16":
-            torch_dtype = torch.bfloat16
-        else:
-            torch_dtype = None
-
-    return torch_dtype
-
-
-def is_accelerate_initialized():
-    return accelerate.state.is_initialized()
-
-
-def get_deepspeed_plugin():
-    if is_accelerate_initialized():
-        return AcceleratorState().deepspeed_plugin
-    else:
-        return None
-
-
-def get_deepspeed_engine(accelerator):
-    return accelerator.deepspeed_engine_wrapped.engine
-
-
-def is_deepspeed_zero_init_enabled():
-    deepspeed_plugin = get_deepspeed_plugin()
-    if deepspeed_plugin is not None:
-        return deepspeed_plugin.is_zero3_init_enabled()
-    else:
-        return False
-
-
-@contextmanager
-def hf_trainer_disable_zero3_init_context_manager():
-    # monkey patch hack to emulate a context that has zero_init disabled as it's used in
-    # modeling_utils.py in transformers for from_config and from_pretrained.
-    import transformers.modeling_utils  # noqa
-
-    orig = transformers.modeling_utils.is_deepspeed_zero3_enabled
-    transformers.modeling_utils.is_deepspeed_zero3_enabled = lambda: False
-    yield
-    transformers.modeling_utils.is_deepspeed_zero3_enabled = orig
-
-
-def deepspeed_zero_init_disabled_context_manager():
-    """
-    returns either a context list that includes one that will disable zero.Init or an empty context list
-    """
-    deepspeed_plugin = get_deepspeed_plugin()
-    if deepspeed_plugin is not None:
-        return [deepspeed_plugin.zero3_init_context_manager(enable=False)]
-    else:
-        return [hf_trainer_disable_zero3_init_context_manager()]
-
-
-def deepspeed_gathered_parameters_context_manager(params, modify=True):
-    """
-    Under zero.Init returns a context manager that will gather the sharded param, otherwise returns an empty list
-
-    If `modify` is `True`, gather the shards and once the context exits update the shards with the
-    modified data - one wants that when modifying the gathered param. If one wants to just gather
-    the shards in order to read the param and no modifications are done to it, use `modify=False` as
-    it's more efficient.
-
-    `params` - can be a single parameter, a list, or a tuple of parameters to collect.
-
-    Example:
-
-    from transformers.utils import ContextManagers
-    from m4.training.utils import deepspeed_gathered_parameters_context_manager
-    with ContextManagers(deepspeed_gathered_parameters_context_manager(module.weight, modify=True)):
-        module.weight.data.normal_(mean=0.0, std=std)
-        if module.padding_idx is not None:
-            module.weight.data[module.padding_idx].zero_()
-
-
-    """
-    if is_deepspeed_zero_init_enabled():
-        import deepspeed
-
-        # 0 is for updating `params` shards after modifying it, `None` is for read-only (only gather)
-        modifier_rank = 0 if modify else None
-        return [deepspeed.zero.GatheredParameters(params, modifier_rank=modifier_rank)]
-    else:
-        return []
-
-
-# adapted from https://github.com/huggingface/transformers/blob/a081f292ca8479eaf66d7396186021268f128829/src/transformers/modeling_utils.py#L438-L496
-# as it appears to be a private function
-def load_state_dict_into_model(model_to_load, state_dict, start_prefix):
-    # Convert old format to new format if needed from a PyTorch state_dict
-    old_keys = []
-    new_keys = []
-    for key in state_dict.keys():
-        new_key = None
-        if "gamma" in key:
-            new_key = key.replace("gamma", "weight")
-        if "beta" in key:
-            new_key = key.replace("beta", "bias")
-        if new_key:
-            old_keys.append(key)
-            new_keys.append(new_key)
-    for old_key, new_key in zip(old_keys, new_keys):
-        state_dict[new_key] = state_dict.pop(old_key)
-
-    # copy state_dict so _load_from_state_dict can modify it
-    metadata = getattr(state_dict, "_metadata", None)
-    state_dict = state_dict.copy()
-    if metadata is not None:
-        state_dict._metadata = metadata
-
-    error_msgs = []
-
-    # PyTorch's `_load_from_state_dict` does not copy parameters in a module's descendants
-    # so we need to apply the function recursively.
-    def load(module: torch.nn.Module, state_dict, prefix=""):
-        local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
-        args = (state_dict, prefix, local_metadata, True, [], [], error_msgs)
-        # Parameters of module and children will start with prefix. We can exit early if there are none in this
-        # state_dict
-        if len([key for key in state_dict if key.startswith(prefix)]) > 0:
-            if is_deepspeed_zero_init_enabled():
-                import deepspeed
-
-                # In sharded models, each shard has only part of the full state_dict, so only gather
-                # parameters that are in the current state_dict.
-                named_parameters = dict(module.named_parameters(prefix=prefix[:-1], recurse=False))
-                params_to_gather = [named_parameters[k] for k in state_dict.keys() if k in named_parameters]
-                if len(params_to_gather) > 0:
-                    # because zero3 puts placeholders in model params, this context
-                    # manager gathers (unpartitions) the params of the current layer, then loads from
-                    # the state dict and then re-partitions them again
-                    with deepspeed.zero.GatheredParameters(params_to_gather, modifier_rank=0):
-                        if torch.distributed.get_rank() == 0:
-                            module._load_from_state_dict(*args)
-            else:
-                module._load_from_state_dict(*args)
-
-        for name, child in module._modules.items():
-            if child is not None:
-                load(child, state_dict, prefix + name + ".")
-
-    load(model_to_load, state_dict, prefix=start_prefix)
-    # Delete `state_dict` so it could be collected by GC earlier. Note that `state_dict` is a copy of the argument, so
-    # it's safe to delete it.
-    del state_dict
-
-    return error_msgs
-
-
-def get_stats(var, ctx):
-    if var is None:
-        return {}
-    var = var.float()
-    abs_var = var.abs()
-    return {
-        f"{ctx}_var_min": var.min().item(),
-        f"{ctx}_var_max": var.max().item(),
-        f"{ctx}_var_mean": var.mean().item(),
-        f"{ctx}_var_std": var.std().item(),
-        f"{ctx}_abs_var_min": abs_var.min().item(),
-        f"{ctx}_abs_var_max": abs_var.max().item(),
-        f"{ctx}_abs_var_mean": abs_var.mean().item(),
-        f"{ctx}_abs_var_std": abs_var.std().item(),
-        f"{ctx}_var_norm_2": (var.norm(p=2) / var.numel()).item(),
-        f"{ctx}_var_norm_1": (var.norm(p=1) / var.numel()).item(),
-        f"{ctx}_nonzero": (var != 0).sum().item(),
-    }
-
-
-def get_stats_format(ctx):
-    return {
-        f"{ctx}_var_min": "e",
-        f"{ctx}_var_max": "e",
-        f"{ctx}_var_mean": "e",
-        f"{ctx}_var_std": "e",
-        f"{ctx}_abs_var_min": "e",
-        f"{ctx}_abs_var_max": "e",
-        f"{ctx}_abs_var_mean": "e",
-        f"{ctx}_abs_var_std": "e",
-        f"{ctx}_var_norm_2": "e",
-        f"{ctx}_var_norm_1": "e",
-        f"{ctx}_nonzero": "",
-    }
diff --git a/m4/utils/__init__.py b/m4/utils/__init__.py
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/m4/utils/activation_tracker.py b/m4/utils/activation_tracker.py
deleted file mode 100644
index 77b897ad136f211cc5157ad4079435e2061fa936..0000000000000000000000000000000000000000
--- a/m4/utils/activation_tracker.py
+++ /dev/null
@@ -1,235 +0,0 @@
-# Copyright 2020 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""
-Adapted from https://github.com/huggingface/transformers/blob/f93c90d21749b61bd89152a7fe99a839df29ed94/src/transformers/debug_utils.py
-"""
-
-import json
-
-from transformers.utils import ExplicitEnum, is_torch_available, logging
-
-from m4.training.utils import get_stats
-
-
-if is_torch_available():
-    import torch
-
-
-logger = logging.get_logger(__name__)
-
-
-class ActivationTracker:
-    """
-    This debug class helps detect and understand where the model starts getting very large or very small, and more
-    importantly `nan` or `inf` activation elements.
-
-    This class will plug hooks into the model and record the activation values of the model into a list of dictionaries: `jsonl_stats`.
-
-    Recording is only active during training, not during validation, and when `trace_activation` is set to True.
-    In practise, since this tracking requires additional computation, we only track activations every X steps.
-
-    In the case of gradient accumulation, all the batches being accumulated are being recorded and identified by the `batch_idx` key.
-
-    Args:
-        model (`nn.Module`):
-            The model to debug.
-        abort_after_batch_num  (`int``, *optional*):
-            Whether to abort after a certain batch number has finished
-    """
-
-    def __init__(
-        self,
-        model,
-        abort_after_batch_num=None,
-    ):
-        self.model = model
-        self.is_validation = False
-        self.abort_after_batch_num = abort_after_batch_num
-
-        self.jsonl_stats = []
-        self.batch_number = 0
-        self.detected_overflow = False
-        self.analyse_model()
-
-        self.register_forward_hook()
-
-    def analyse_model(self):
-        # extract the fully qualified module names, to be able to report at run time. e.g.:
-        # encoder.block.2.layer.0.SelfAttention.o
-        #
-        # for shared weights only the first shared module name will be registered
-        self.module_names = {m: name for name, m in self.model.named_modules()}
-
-    def analyse_variable(self, var, ctx, current_module_stats):
-        if torch.is_tensor(var):
-            dict_stats = get_stats(var, ctx)
-            current_module_stats.update(dict_stats)
-            # self.expand_frame(text_stats)
-            if detect_overflow(var, ctx):
-                self.detected_overflow = True
-        return current_module_stats
-
-    def create_frame(self, module, input, output):
-        module_name = f"{self.module_names[module]}"
-        module_type = f"{module.__class__.__name__}"
-        current_module_stats = {}
-
-        # inputs
-        if isinstance(input, tuple):
-            for i, x in enumerate(input):
-                current_module_stats = self.analyse_variable(x, f"input[{i}]", current_module_stats)
-        else:
-            current_module_stats = self.analyse_variable(input, "input", current_module_stats)
-
-        # outputs
-        if isinstance(output, tuple):
-            for i, x in enumerate(output):
-                # possibly a tuple of tuples
-                if isinstance(x, tuple):
-                    for j, y in enumerate(x):
-                        current_module_stats = self.analyse_variable(y, f"output[{i}][{j}]", current_module_stats)
-                else:
-                    current_module_stats = self.analyse_variable(x, f"output[{i}]", current_module_stats)
-        else:
-            current_module_stats = self.analyse_variable(output, "output", current_module_stats)
-        if current_module_stats:
-            # When we activate gradient checkpointing, the forward hook will be called twice for some (not all) modules.
-            # That will lead to double (repeated) entries in the list.
-            # This is a hack to avoid these double entries.
-            if (module_name, module_type) not in [(x["name"], x["type"]) for x in self.jsonl_stats]:
-                self.jsonl_stats.append(
-                    {
-                        "name": module_name,
-                        "type": module_type,
-                        **current_module_stats,
-                    }
-                )
-
-    def register_forward_hook(self):
-        self.model.apply(self._register_forward_hook)
-
-    def _register_forward_hook(self, module):
-        module.register_forward_hook(self.forward_hook)
-
-    def forward_hook(self, module, input, output):
-        # - input is a tuple of packed inputs (could be non-Tensors)
-        # - output could be a Tensor or a tuple of Tensors and non-Tensors
-
-        trace_activation = self.trace_activation
-
-        # count batch numbers - the very first forward hook of the batch will be called when the
-        # batch completes - i.e. it gets called very last - we know this batch has finished
-        if module == self.model:
-            self.batch_number += 1
-
-        if trace_activation and not self.is_validation:
-            self.create_frame(module, input, output)
-
-        if self.detected_overflow:
-            # now we can abort, as it's pointless to continue running
-            raise ValueError(
-                "DebugUnderflowOverflow: inf/nan detected, aborting as there is no point running further. "
-                "Please scroll up above this traceback to see the activation values prior to this event."
-            )
-
-        # abort after certain batch if requested to do so
-        if self.abort_after_batch_num is not None and self.batch_number > self.abort_after_batch_num:
-            raise ValueError(
-                f"DebugUnderflowOverflow: aborting after {self.batch_number} batches due to"
-                f" `abort_after_batch_num={self.abort_after_batch_num}` arg"
-            )
-
-    def fill_in_batch_idx(self, batch_idx):
-        if not self.jsonl_stats:
-            return
-        for r in self.jsonl_stats:
-            if "batch_idx" not in r:
-                r["batch_idx"] = batch_idx
-            else:
-                if not (r["batch_idx"] <= batch_idx):
-                    raise ValueError("`batch_idx` should be increasing")
-
-    def dump_stats(self, log_activations_filename, curr_opt_step):
-        with open(log_activations_filename, "a") as file:
-            # append stats to file
-            for r in self.jsonl_stats:
-                r["step"] = curr_opt_step
-                file.write(json.dumps(r) + "\n")
-
-    def reset_jsonl_stats(self):
-        self.jsonl_stats = []
-
-    def activate_hooks(self):
-        self.trace_activation = True
-
-    def deactivate_hooks(self):
-        self.trace_activation = False
-
-    def is_eval(self):
-        self.is_validation = True
-
-    def is_train(self):
-        self.is_validation = False
-
-
-def detect_overflow(var, ctx):
-    """
-    Report whether the tensor contains any `nan` or `inf` entries.
-
-    This is useful for detecting overflows/underflows and best to call right after the function that did some math that
-    modified the tensor in question.
-
-    This function contains a few other helper features that you can enable and tweak directly if you want to track
-    various other things.
-
-    Args:
-        var: the tensor variable to check
-        ctx: the message to print as a context
-
-    Return:
-        `True` if `inf` or `nan` was detected, `False` otherwise
-    """
-    detected = False
-    if torch.isnan(var).any().item():
-        detected = True
-        print(f"{ctx} has nans")
-    if torch.isinf(var).any().item():
-        detected = True
-        print(f"{ctx} has infs")
-
-    # if needed to monitor large elements can enable the following
-    if 0:  # and detected:
-        n100 = var[torch.ge(var.abs(), 100)]
-        if n100.numel() > 0:
-            print(f"{ctx}:  n100={n100.numel()}")
-        n1000 = var[torch.ge(var.abs(), 1000)]
-        if n1000.numel() > 0:
-            print(f"{ctx}: n1000={n1000.numel()}")
-        n10000 = var[torch.ge(var.abs(), 10000)]
-        if n10000.numel() > 0:
-            print(f"{ctx}: n10000={n10000.numel()}")
-
-    if 0:
-        print(f"min={var.min():9.2e} max={var.max():9.2e}")
-
-    if 0:
-        print(f"min={var.min():9.2e} max={var.max():9.2e} var={var.var():9.2e} mean={var.mean():9.2e} ({ctx})")
-
-    return detected
-
-
-class DebugOption(ExplicitEnum):
-    UNDERFLOW_OVERFLOW = "underflow_overflow"
-    TPU_METRICS_DEBUG = "tpu_metrics_debug"
diff --git a/m4/utils/debug.py b/m4/utils/debug.py
deleted file mode 100644
index 26c092ffe8d3ef312b3fd2f685d4abfc1bf3de14..0000000000000000000000000000000000000000
--- a/m4/utils/debug.py
+++ /dev/null
@@ -1,35 +0,0 @@
-import builtins
-import fcntl
-
-
-def printflock(*args, **kwargs):
-    """
-    This is a wrapper around the built-in Python `print` which calls `flock` before calling
-    `print` and unlocks it immediately after. This wrapper is useful for when each rank needs to
-    print a message without getting it interleaved with prints from other ranks.
-    The lock file is the file this wrapper is defined in.
-    The output order will be random per rank.
-
-    Example:
-        >>> # assuming 4 GPUs
-        >>> world_size = dist.get_world_size()
-        >>> rank = dist.get_rank()
-        >>> printflock(f"This is a very long message from rank {rank}/{world_size}")
-       This is a very long message from rank 0/4
-       This is a very long message from rank 2/4
-       This is a very long message from rank 3/4
-       This is a very long message from rank 1/4
-
-    It can also be used to override normal `print` for an easier multi-gpu debug:
-
-        from m4.utils.debug import printflock as print
-
-    and then you don't need to change anything in your code, the normal `print` calls will all be non-interleaved
-    """
-
-    with open(__file__, "r") as fh:
-        fcntl.flock(fh, fcntl.LOCK_EX)
-        try:
-            builtins.print(*args, **kwargs)
-        finally:
-            fcntl.flock(fh, fcntl.LOCK_UN)
diff --git a/m4/utils/logging.py b/m4/utils/logging.py
deleted file mode 100644
index 81ba6c96db36d29b95cb741c26e426e8d49a8f86..0000000000000000000000000000000000000000
--- a/m4/utils/logging.py
+++ /dev/null
@@ -1,262 +0,0 @@
-import functools
-import logging
-import os
-import sys
-import threading
-from logging import CRITICAL  # NOQA
-from logging import DEBUG  # NOQA
-from logging import ERROR  # NOQA
-from logging import FATAL  # NOQA
-from logging import INFO  # NOQA
-from logging import NOTSET  # NOQA
-from logging import WARN  # NOQA
-from logging import WARNING  # NOQA
-from typing import Optional
-
-
-_lock = threading.Lock()
-_default_handler: Optional[logging.Handler] = None
-
-log_levels = {
-    "debug": logging.DEBUG,
-    "info": logging.INFO,
-    "warning": logging.WARNING,
-    "error": logging.ERROR,
-    "critical": logging.CRITICAL,
-}
-
-_default_log_level = logging.WARNING
-
-_tqdm_active = True
-
-
-def _get_default_logging_level():
-    """
-    If M4_VERBOSITY env var is set to one of the valid choices return that as the new default level. If it is
-    not - fall back to `_default_log_level`
-    """
-    env_level_str = os.getenv("M4_VERBOSITY", None)
-    if env_level_str:
-        if env_level_str in log_levels:
-            return log_levels[env_level_str]
-        else:
-            logging.getLogger().warning(
-                f"Unknown option M4_VERBOSITY={env_level_str}, has to be one of: { ', '.join(log_levels.keys()) }"
-            )
-    return _default_log_level
-
-
-def _get_library_name() -> str:
-    return __name__.split(".")[0]
-
-
-def _get_library_root_logger() -> logging.Logger:
-    return logging.getLogger(_get_library_name())
-
-
-def _configure_library_root_logger() -> None:
-    global _default_handler
-
-    with _lock:
-        if _default_handler:
-            # This library has already configured the library root logger.
-            return
-        _default_handler = logging.StreamHandler()  # Set sys.stderr as stream.
-        _default_handler.flush = sys.stderr.flush
-
-        # Apply our default configuration to the library root logger.
-        library_root_logger = _get_library_root_logger()
-        library_root_logger.addHandler(_default_handler)
-        library_root_logger.setLevel(_get_default_logging_level())
-        library_root_logger.propagate = False
-
-
-def _reset_library_root_logger() -> None:
-    global _default_handler
-
-    with _lock:
-        if not _default_handler:
-            return
-
-        library_root_logger = _get_library_root_logger()
-        library_root_logger.removeHandler(_default_handler)
-        library_root_logger.setLevel(logging.NOTSET)
-        _default_handler = None
-
-
-def get_log_levels_dict():
-    return log_levels
-
-
-def get_logger(name: Optional[str] = None) -> logging.Logger:
-    """
-    Return a logger with the specified name.
-
-    This function is not supposed to be directly accessed unless you are writing a custom m4 module.
-    """
-
-    if name is None:
-        name = _get_library_name()
-
-    _configure_library_root_logger()
-    return logging.getLogger(name)
-
-
-def get_verbosity() -> int:
-    """
-    Return the current level for the 🤗 M4's root logger as an int.
-
-    Returns:
-        `int`: The logging level.
-
-    <Tip>
-
-    🤗 M4 has following logging levels:
-
-    - 50: `m4.logging.CRITICAL` or `m4.logging.FATAL`
-    - 40: `m4.logging.ERROR`
-    - 30: `m4.logging.WARNING` or `m4.logging.WARN`
-    - 20: `m4.logging.INFO`
-    - 10: `m4.logging.DEBUG`
-
-    </Tip>"""
-
-    _configure_library_root_logger()
-    return _get_library_root_logger().getEffectiveLevel()
-
-
-def set_verbosity(verbosity: int) -> None:
-    """
-    Set the verbosity level for the 🤗 M4's root logger.
-
-    Args:
-        verbosity (`int`):
-            Logging level, e.g., one of:
-
-            - `m4.logging.CRITICAL` or `m4.logging.FATAL`
-            - `m4.logging.ERROR`
-            - `m4.logging.WARNING` or `m4.logging.WARN`
-            - `m4.logging.INFO`
-            - `m4.logging.DEBUG`
-    """
-
-    _configure_library_root_logger()
-    _get_library_root_logger().setLevel(verbosity)
-
-
-def set_verbosity_info():
-    """Set the verbosity to the `INFO` level."""
-    return set_verbosity(INFO)
-
-
-def set_verbosity_warning():
-    """Set the verbosity to the `WARNING` level."""
-    return set_verbosity(WARNING)
-
-
-def set_verbosity_debug():
-    """Set the verbosity to the `DEBUG` level."""
-    return set_verbosity(DEBUG)
-
-
-def set_verbosity_error():
-    """Set the verbosity to the `ERROR` level."""
-    return set_verbosity(ERROR)
-
-
-def disable_default_handler() -> None:
-    """Disable the default handler of the HuggingFace M4's root logger."""
-
-    _configure_library_root_logger()
-
-    assert _default_handler is not None
-    _get_library_root_logger().removeHandler(_default_handler)
-
-
-def enable_default_handler() -> None:
-    """Enable the default handler of the HuggingFace M4's root logger."""
-
-    _configure_library_root_logger()
-
-    assert _default_handler is not None
-    _get_library_root_logger().addHandler(_default_handler)
-
-
-def add_handler(handler: logging.Handler) -> None:
-    """adds a handler to the HuggingFace M4's root logger."""
-
-    _configure_library_root_logger()
-
-    assert handler is not None
-    _get_library_root_logger().addHandler(handler)
-
-
-def remove_handler(handler: logging.Handler) -> None:
-    """removes given handler from the HuggingFace M4's root logger."""
-
-    _configure_library_root_logger()
-
-    assert handler is not None and handler not in _get_library_root_logger().handlers
-    _get_library_root_logger().removeHandler(handler)
-
-
-def disable_propagation() -> None:
-    """
-    Disable propagation of the library log outputs. Note that log propagation is disabled by default.
-    """
-
-    _configure_library_root_logger()
-    _get_library_root_logger().propagate = False
-
-
-def enable_propagation() -> None:
-    """
-    Enable propagation of the library log outputs. Please disable the HuggingFace M4's default handler to
-    prevent double logging if the root logger has been configured.
-    """
-
-    _configure_library_root_logger()
-    _get_library_root_logger().propagate = True
-
-
-def enable_explicit_format() -> None:
-    """
-    Enable explicit formatting for every HuggingFace M4's logger. The explicit formatter is as follows:
-    ```
-        [LEVELNAME|FILENAME|LINE NUMBER] TIME >> MESSAGE
-    ```
-    All handlers currently bound to the root logger are affected by this method.
-    """
-    handlers = _get_library_root_logger().handlers
-
-    for handler in handlers:
-        formatter = logging.Formatter("[%(levelname)s|%(filename)s:%(lineno)s] %(asctime)s >> %(message)s")
-        handler.setFormatter(formatter)
-
-
-def reset_format() -> None:
-    """
-    Resets the formatting for HuggingFace M4's loggers.
-
-    All handlers currently bound to the root logger are affected by this method.
-    """
-    handlers = _get_library_root_logger().handlers
-
-    for handler in handlers:
-        handler.setFormatter(None)
-
-
-@functools.lru_cache(None)
-def warning_once(self, *args, **kwargs):
-    """
-    This method is identical to `logger.warning()`, but will emit the warning with the same message only once
-
-    Note: The cache is for the function arguments, so 2 different callers using the same arguments
-    will hit the cache. The assumption here is that all warning messages are unique across the code.
-    If they aren't then need to switch to another type of cache that includes the caller frame
-    information in the hashing function.
-    """
-    self.warning(*args, **kwargs)
-
-
-logging.Logger.warning_once = warning_once
diff --git a/m4/utils/progress.py b/m4/utils/progress.py
deleted file mode 100644
index 19da7e2e6a88ad82dbc7018a2b136bbaf98870ea..0000000000000000000000000000000000000000
--- a/m4/utils/progress.py
+++ /dev/null
@@ -1,79 +0,0 @@
-import os
-import time
-
-
-M4_DISABLE_RICH = False
-if os.environ.get("M4_DISABLE_RICH", "") == "1":
-    M4_DISABLE_RICH = True
-else:
-    try:
-        import rich  # noqa
-    except ModuleNotFoundError:
-        M4_DISABLE_RICH = True
-
-if not M4_DISABLE_RICH:
-    from rich.progress import BarColumn, MofNCompleteColumn, Progress, TaskProgressColumn, TimeElapsedColumn
-
-else:
-    # This is a simple equivalent of some of the `rich`'s classes we use but which doesn't use
-    # `rich` or any formatting. We use it if there is no `rich` installed or during HPC training
-    # where we don't use a live console and log to a file instead - so we want easy to read logs and
-    # `rich`'s output mangling causes more trouble than it helps.
-
-    class BarColumn:
-        def render(self, task):
-            return ""
-
-    class MofNCompleteColumn:
-        def render(self, task):
-            if task.total_steps is not None:
-                total_steps = task.total_steps
-            else:
-                total_steps = "UNK"
-            return f"{task.completed}/{total_steps}"
-
-    class TaskProgressColumn:
-        def render(self, task):
-            if task.total_steps is not None:
-                percent = int(task.completed / task.total_steps * 100)
-                return f"{percent:>3}%"
-            else:
-                return "UNK%"
-
-    class TimeElapsedColumn:
-        def render(self, task):
-            time_diff = time.gmtime(time.time() - task.start_time)
-            days = int(time.strftime("%j", time_diff)) - 1
-            time_str = time.strftime("%H:%M:%S", time_diff)
-            return f"{days}:{time_str}"
-
-    class Task:
-        def __init__(self, description, total_steps, *args, **kwargs):
-            self.description = description
-            self.total_steps = total_steps
-
-            self.completed = 0
-            self.start_time = time.time()
-
-        def step(self, advance_steps):
-            self.completed += advance_steps
-
-    class Progress:
-        def __init__(self, *args, **kwargs):
-            self.tasks = []
-            self.description = "Progress"
-
-        def __enter__(self):
-            return self
-
-        def __exit__(self, *args):
-            pass
-
-        def update(self, task, advance):
-            task.step(advance)
-            return self
-
-        def add_task(self, description, total, *args, **kwargs):
-            task = Task(description, total)
-            self.tasks.append(task)
-            return task