ydshieh committed on
Commit
5306066
1 Parent(s): 283180e

upload debug.py

Files changed (1)
  1. debug.py +1343 -0
debug.py ADDED
@@ -0,0 +1,1343 @@
+ #!/usr/bin/env python
+ # coding=utf-8
+ # Copyright 2021 The HuggingFace Team All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """
+ Fine-tuning the library vision-encoder-decoder models for image captioning.
+ """
+ # You can also adapt this script on your own sequence to sequence task. Pointers for this are left as comments.
+ import json
+ import logging
+ import os
+ import sys
+ import time
+ from dataclasses import dataclass, field
+ import datetime
+ from functools import partial
+ from pathlib import Path
+ from typing import Callable, Optional
+
+ import datasets
+ import nltk  # Here to have a nice missing dependency error message early on
+ import numpy as np
+ from datasets import Dataset, load_dataset, load_metric
+ from tqdm import tqdm
+ from PIL import Image
+
+ import jax
+ import jax.numpy as jnp
+ import optax
+ import transformers
+ from filelock import FileLock
+ from flax import jax_utils, traverse_util
+ from flax.jax_utils import unreplicate
+ from flax.training import train_state
+ from flax.training.common_utils import get_metrics, onehot, shard, shard_prng_key
+ from huggingface_hub import Repository
+ from transformers import (
+     CONFIG_MAPPING,
+     FLAX_MODEL_FOR_VISION_2_SEQ_MAPPING,
+     AutoConfig,
+     AutoFeatureExtractor,
+     AutoTokenizer,
+     HfArgumentParser,
+     TrainingArguments,
+     is_tensorboard_available,
+     FlaxAutoModelForVision2Seq,
+ )
+ from transformers.file_utils import get_full_repo_name, is_offline_mode
+
+
+ logger = logging.getLogger(__name__)
+
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+
+ try:
+     nltk.data.find("tokenizers/punkt")
+ except (LookupError, OSError):
+     if is_offline_mode():
+         raise LookupError(
+             "Offline mode: run this script without TRANSFORMERS_OFFLINE first to download nltk data files"
+         )
+     with FileLock(".lock") as lock:
+         nltk.download("punkt", quiet=True)
+
+
+ MODEL_CONFIG_CLASSES = list(FLAX_MODEL_FOR_VISION_2_SEQ_MAPPING.keys())
+ MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
+
+
+ # Copied from transformers.models.bart.modeling_flax_bart.shift_tokens_right
+ def shift_tokens_right(input_ids: np.ndarray, pad_token_id: int, decoder_start_token_id: int) -> np.ndarray:
+     """
+     Shift input ids one token to the right.
+     """
+     shifted_input_ids = np.zeros_like(input_ids)
+     shifted_input_ids[:, 1:] = input_ids[:, :-1]
+     shifted_input_ids[:, 0] = decoder_start_token_id
+
+     shifted_input_ids = np.where(shifted_input_ids == -100, pad_token_id, shifted_input_ids)
+     return shifted_input_ids
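+ # Worked example: with pad_token_id=0 and decoder_start_token_id=2,
+ # shift_tokens_right(np.array([[5, 6, 7]]), 0, 2) -> array([[2, 5, 6]])
+ # (the start token is prepended and the last input token is dropped).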
+
+
+ @dataclass
+ class CustomTrainingArguments(TrainingArguments):
+
+     do_predict_during_training: bool = field(default=None, metadata={"help": "Whether to run prediction on the test set during training."})
+     do_predict_after_evaluation: bool = field(default=None, metadata={"help": "Whether to run prediction right after each evaluation instead of only at the end of each epoch."})
+     block_size: int = field(default=None, metadata={"help": "The number of examples per dataset chunk used to build data loaders sequentially."})
+
+
+ @dataclass
+ class ModelArguments:
+     """
+     Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch.
+     """
+
+     model_name_or_path: Optional[str] = field(
+         default=None,
+         metadata={
+             "help": "The model checkpoint for weights initialization. "
+             "Don't set if you want to train a model from scratch."
+         },
+     )
+     encoder_model_name_or_path: Optional[str] = field(
+         default=None,
+         metadata={
+             "help": "The encoder model checkpoint for weights initialization. "
+             "Don't set if you want to train a model from scratch."
+         },
+     )
+     decoder_model_name_or_path: Optional[str] = field(
+         default=None,
+         metadata={
+             "help": "The decoder model checkpoint for weights initialization. "
+             "Don't set if you want to train a model from scratch."
+         },
+     )
+     model_type: Optional[str] = field(
+         default='vision-encoder-decoder',
+         metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)},
+     )
+     encoder_model_type: Optional[str] = field(
+         default=None,
+         metadata={"help": "If training from scratch, pass an encoder model type from the list: " + ", ".join(MODEL_TYPES)},
+     )
+     decoder_model_type: Optional[str] = field(
+         default=None,
+         metadata={"help": "If training from scratch, pass a decoder model type from the list: " + ", ".join(MODEL_TYPES)},
+     )
+     config_name: Optional[str] = field(
+         default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
+     )
+     encoder_config_name: Optional[str] = field(
+         default=None, metadata={"help": "Pretrained config name or path if not the same as encoder_model_name"}
+     )
+     decoder_config_name: Optional[str] = field(
+         default=None, metadata={"help": "Pretrained config name or path if not the same as decoder_model_name"}
+     )
+     feature_extractor_name: Optional[str] = field(
+         default=None, metadata={"help": "Pretrained feature extractor name or path if not the same as encoder_model_name"}
+     )
+     tokenizer_name: Optional[str] = field(
+         default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as decoder_model_name"}
+     )
+     cache_dir: Optional[str] = field(
+         default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
+     )
+     use_fast_tokenizer: bool = field(
+         default=True,
+         metadata={"help": "Whether to use one of the fast tokenizers (backed by the tokenizers library) or not."},
+     )
+     dtype: Optional[str] = field(
+         default="float32",
+         metadata={
+             "help": "Floating-point format in which the model weights should be initialized and trained. Choose one of `[float32, float16, bfloat16]`."
+         },
+     )
+
+
+ @dataclass
+ class DataTrainingArguments:
+     """
+     Arguments pertaining to what data we are going to input our model for training and eval.
+     """
+
+     dataset_name: Optional[str] = field(
+         default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
+     )
+     dataset_config_name: Optional[str] = field(
+         default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
+     )
+     data_dir: Optional[str] = field(
+         default=None, metadata={"help": "The data directory of the dataset to use (via the datasets library)."}
+     )
+     image_column: Optional[str] = field(
+         default=None,
+         metadata={"help": "The name of the column in the datasets containing the full image file paths (for image captioning)."},
+     )
+     caption_column: Optional[str] = field(
+         default=None,
+         metadata={"help": "The name of the column in the datasets containing the image captions (for image captioning)."},
+     )
+     train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."})
+     validation_file: Optional[str] = field(
+         default=None,
+         metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."},
+     )
+     test_file: Optional[str] = field(
+         default=None,
+         metadata={"help": "An optional input predict data file to do prediction on (a text file)."},
+     )
+     max_source_length: Optional[int] = field(
+         default=1024,
+         metadata={
+             "help": "The maximum total input sequence length after tokenization. Sequences longer "
+             "than this will be truncated, sequences shorter will be padded."
+         },
+     )
+     max_target_length: Optional[int] = field(
+         default=128,
+         metadata={
+             "help": "The maximum total sequence length for target text after tokenization. Sequences longer "
+             "than this will be truncated, sequences shorter will be padded."
+         },
+     )
+     val_max_target_length: Optional[int] = field(
+         default=None,
+         metadata={
+             "help": "The maximum total sequence length for validation target text after tokenization. Sequences longer "
+             "than this will be truncated, sequences shorter will be padded. Will default to `max_target_length`. "
+             "This argument is also used to override the `max_length` param of `model.generate`, which is used "
+             "during evaluation."
+         },
+     )
+     max_train_samples: Optional[int] = field(
+         default=None,
+         metadata={
+             "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
+             "value if set."
+         },
+     )
+     max_eval_samples: Optional[int] = field(
+         default=None,
+         metadata={
+             "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
+             "value if set."
+         },
+     )
+     max_predict_samples: Optional[int] = field(
+         default=None,
+         metadata={
+             "help": "For debugging purposes or quicker training, truncate the number of prediction examples to this "
+             "value if set."
+         },
+     )
+     preprocessing_num_workers: Optional[int] = field(
+         default=None,
+         metadata={"help": "The number of processes to use for the preprocessing."},
+     )
+     predict_with_generate: bool = field(
+         default=False, metadata={"help": "Whether to use generate to calculate generative metrics (ROUGE, BLEU)."}
+     )
+     num_beams: Optional[int] = field(
+         default=None,
+         metadata={
+             "help": "Number of beams to use for evaluation. This argument will be passed to `model.generate`, "
+             "which is used during evaluation."
+         },
+     )
+     overwrite_cache: bool = field(
+         default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
+     )
+
+     def __post_init__(self):
+         if self.dataset_name is None and self.train_file is None and self.validation_file is None:
+             raise ValueError("Need either a dataset name or a training/validation file.")
+         else:
+             if self.train_file is not None:
+                 extension = self.train_file.split(".")[-1]
+                 assert extension in ["csv", "json"], "`train_file` should be a csv or a json file."
+             if self.validation_file is not None:
+                 extension = self.validation_file.split(".")[-1]
+                 assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file."
+         if self.val_max_target_length is None:
+             self.val_max_target_length = self.max_target_length
+
+
+ image_captioning_name_mapping = {
+     "image_caption_dataset.py": ("image_file", "caption"),
+ }
+
+
+ class TrainState(train_state.TrainState):
+     dropout_rng: jnp.ndarray
+
+     def replicate(self):
+         return jax_utils.replicate(self).replace(dropout_rng=shard_prng_key(self.dropout_rng))
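+ # Note: `replicate` also shards the dropout PRNG key across local devices via
+ # `shard_prng_key`, so each device draws an independent dropout mask in `pmap`-ed steps.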
+
+
+ def data_loader(rng: jax.random.PRNGKey, dataset: Dataset, batch_size: int, shuffle: bool = False):
+     """
+     Returns batches of size `batch_size` from truncated `dataset`, sharded over all local devices.
+     Shuffle batches if `shuffle` is `True`.
+     """
+     steps_per_epoch = len(dataset) // batch_size
+
+     if shuffle:
+         batch_idx = jax.random.permutation(rng, len(dataset))
+     else:
+         batch_idx = jnp.arange(len(dataset))
+
+     batch_idx = batch_idx[: steps_per_epoch * batch_size]  # Skip incomplete batch.
+     batch_idx = batch_idx.reshape((steps_per_epoch, batch_size))
+
+     for idx in batch_idx:
+         batch = dataset[idx]
+         batch = {k: jnp.array(v) for k, v in batch.items()}
+
+         batch = shard(batch)
+
+         yield batch
+
+
+ def write_metric(summary_writer, mode, metrics, step, train_time=None):
+
+     if train_time:
+         summary_writer.scalar("train_time", train_time, step)
+
+     if mode == "train":
+         metrics = get_metrics(metrics)
+         for key, vals in metrics.items():
+             tag = f"{mode}_{key}"
+             for i, val in enumerate(vals):
+                 summary_writer.scalar(tag, val, step - len(vals) + i + 1)
+
+     elif mode in ["valid", "pred"]:
+         for metric_name, value in metrics.items():
+             summary_writer.scalar(f"{mode}_{metric_name}", value, step)
+
+
+ def create_learning_rate_fn(
+     train_ds_size: int, train_batch_size: int, num_train_epochs: int, num_warmup_steps: int, learning_rate: float
+ ) -> Callable[[int], jnp.array]:
+     """Returns a linear warmup, linear_decay learning rate function."""
+     steps_per_epoch = train_ds_size // train_batch_size
+     num_train_steps = steps_per_epoch * num_train_epochs
+     warmup_fn = optax.linear_schedule(init_value=0.0, end_value=learning_rate, transition_steps=num_warmup_steps)
+     decay_fn = optax.linear_schedule(
+         init_value=learning_rate, end_value=0, transition_steps=num_train_steps - num_warmup_steps
+     )
+     schedule_fn = optax.join_schedules(schedules=[warmup_fn, decay_fn], boundaries=[num_warmup_steps])
+     return schedule_fn
+
+
+ def main():
+     # See all possible arguments in src/transformers/training_args.py
+     # or by passing the --help flag to this script.
+     # We now keep distinct sets of args, for a cleaner separation of concerns.
+
+     parser = HfArgumentParser((ModelArguments, DataTrainingArguments, CustomTrainingArguments))
+     if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
+         # If we pass only one argument to the script and it's the path to a json file,
+         # let's parse it to get our arguments.
+         model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
+     else:
+         model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+
+     if (
+         os.path.exists(training_args.output_dir)
+         and os.listdir(training_args.output_dir)
+         and training_args.do_train
+         and not training_args.overwrite_output_dir
+     ):
+         raise ValueError(
+             f"Output directory ({training_args.output_dir}) already exists and is not empty. "
+             "Use --overwrite_output_dir to overcome."
+         )
+
+     # Make one log on every process with the configuration for debugging.
+     logging.basicConfig(
+         format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+         datefmt="%m/%d/%Y %H:%M:%S",
+         level=logging.INFO,
+     )
+     # Setup logging, we only want one process per machine to log things on the screen.
+     logger.setLevel(logging.INFO if jax.process_index() == 0 else logging.ERROR)
+     if jax.process_index() == 0:
+         datasets.utils.logging.set_verbosity_warning()
+         transformers.utils.logging.set_verbosity_info()
+     else:
+         datasets.utils.logging.set_verbosity_error()
+         transformers.utils.logging.set_verbosity_error()
+
+     # Set the verbosity to info of the Transformers logger (on main process only):
+     logger.info(f"Training/evaluation parameters {training_args}")
+
+     # Handle the repository creation
+     if training_args.push_to_hub:
+         if training_args.hub_model_id is None:
+             repo_name = get_full_repo_name(
+                 Path(training_args.output_dir).absolute().name, token=training_args.hub_token
+             )
+         else:
+             repo_name = training_args.hub_model_id
+         repo = Repository(training_args.output_dir, clone_from=repo_name)
+
+     # Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below)
+     # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
+     # (the dataset will be downloaded automatically from the datasets Hub).
+     #
+     # For CSV/JSON files this script will use the first column for the full texts and the second column for the
+     # summaries (unless you specify column names for this with the `text_column` and `summary_column` arguments).
+     #
+     if data_args.dataset_name is not None:
+         # Downloading and loading a dataset from the hub.
+         dataset = load_dataset(
+             data_args.dataset_name, data_args.dataset_config_name, keep_in_memory=False, data_dir=data_args.data_dir,
+             cache_dir="./dataset_cache/"
+         )
+     else:
+         data_files = {}
+         if data_args.train_file is not None:
+             data_files["train"] = data_args.train_file
+             extension = data_args.train_file.split(".")[-1]
+         if data_args.validation_file is not None:
+             data_files["validation"] = data_args.validation_file
+             extension = data_args.validation_file.split(".")[-1]
+         if data_args.test_file is not None:
+             data_files["test"] = data_args.test_file
+             extension = data_args.test_file.split(".")[-1]
+         # TODO: Check
+         dataset = load_dataset(extension, data_files=data_files, cache_dir="./dataset_cache/", data_dir=data_args.data_dir)
+     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
+     # https://huggingface.co/docs/datasets/loading_datasets.html.
+
+     # Load pretrained model and tokenizer
+
+     encoder_cache_dir, decoder_cache_dir = None, None
+     if model_args.cache_dir:
+         encoder_cache_dir = os.path.join(model_args.cache_dir, "encoder")
+         decoder_cache_dir = os.path.join(model_args.cache_dir, "decoder")
+
+     # Use explicitly specified config
+     if model_args.config_name:
+         config = AutoConfig.from_pretrained(model_args.config_name, cache_dir=model_args.cache_dir)
+     # Use pretrained model's config
+     elif model_args.model_name_or_path:
+         config = AutoConfig.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
+     # Use specified `model_type` (default to `vision-encoder-decoder`)
+     else:
+
+         if model_args.model_type not in MODEL_TYPES:
+             raise ValueError(
+                 f"Unrecognized model identifier: {model_args.model_type}. Should contain one of {', '.join(MODEL_TYPES)}."
+             )
+         config_class = CONFIG_MAPPING[model_args.model_type]
+
+         # Deal with encoder-decoder models that require specifying encoder/decoder
+         if hasattr(config_class, "from_encoder_decoder_configs"):
+
+             # Use explicitly specified encoder config
+             if model_args.encoder_config_name:
+                 encoder_config = AutoConfig.from_pretrained(model_args.encoder_config_name, cache_dir=encoder_cache_dir)
+             # Use pretrained encoder model's config
+             elif model_args.encoder_model_name_or_path:
+                 encoder_config = AutoConfig.from_pretrained(model_args.encoder_model_name_or_path, cache_dir=encoder_cache_dir)
+             # Use specified encoder model type
+             elif model_args.encoder_model_type:
+                 encoder_config = AutoConfig.for_model(model_args.encoder_model_type)
+                 logger.warning("You are instantiating a new config instance from scratch for the encoder.")
+             else:
+                 raise ValueError("Encoder Config: if a pretrained config or model location is not provided, `encoder_model_type` is required.")
+
+             # Use explicitly specified decoder config
+             if model_args.decoder_config_name:
+                 decoder_config = AutoConfig.from_pretrained(model_args.decoder_config_name, cache_dir=decoder_cache_dir)
+             # Use pretrained decoder model's config
+             elif model_args.decoder_model_name_or_path:
+                 decoder_config = AutoConfig.from_pretrained(model_args.decoder_model_name_or_path, cache_dir=decoder_cache_dir)
+             # Use specified decoder model type
+             elif model_args.decoder_model_type:
+                 decoder_config = AutoConfig.for_model(model_args.decoder_model_type)
+                 logger.warning("You are instantiating a new config instance from scratch for the decoder.")
+             else:
+                 raise ValueError("Decoder Config: if a pretrained config or model location is not provided, `decoder_model_type` is required.")
+
+             logger.info("Setting `config.is_decoder=True` and `config.add_cross_attention=True` for decoder_config")
+             decoder_config.is_decoder = True
+             decoder_config.add_cross_attention = True
+
+             config = config_class.from_encoder_decoder_configs(encoder_config, decoder_config)
+         # For self-contained model
+         else:
+             config = config_class()
+             logger.warning("You are instantiating a new config instance from scratch.")
+
+     decoder_start_token_id = getattr(config, "decoder_start_token_id", None)
+     if not decoder_start_token_id and getattr(config, "decoder", None):
+         decoder_start_token_id = getattr(config.decoder, "decoder_start_token_id", None)
+     bos_token_id = getattr(config, "bos_token_id", None)
+     if not bos_token_id and getattr(config, "decoder", None):
+         bos_token_id = getattr(config.decoder, "bos_token_id", None)
+     eos_token_id = getattr(config, "eos_token_id", None)
+     if not eos_token_id and getattr(config, "decoder", None):
+         eos_token_id = getattr(config.decoder, "eos_token_id", None)
+     pad_token_id = getattr(config, "pad_token_id", None)
+     if not pad_token_id and getattr(config, "decoder", None):
+         pad_token_id = getattr(config.decoder, "pad_token_id", None)
+
+     if decoder_start_token_id is None:
+         decoder_start_token_id = bos_token_id
+     if pad_token_id is None:
+         pad_token_id = eos_token_id
+
+     if getattr(config, "decoder", None):
+         config.decoder.decoder_start_token_id = decoder_start_token_id
+         config.decoder.bos_token_id = bos_token_id
+         config.decoder.eos_token_id = eos_token_id
+         config.decoder.pad_token_id = pad_token_id
+
+     # Set `encoder-decoder` (top-level) specific config
+     config.decoder_start_token_id = decoder_start_token_id
+     config.bos_token_id = bos_token_id
+     config.eos_token_id = eos_token_id
+     config.pad_token_id = pad_token_id
+
+     if model_args.model_name_or_path:
+         model = FlaxAutoModelForVision2Seq.from_pretrained(
+             model_args.model_name_or_path, config=config, seed=training_args.seed, dtype=getattr(jnp, model_args.dtype)
+         )
+     else:
+         # model_class = FLAX_MODEL_FOR_VISION_2_SEQ_MAPPING[config.__class__]
+         model = FlaxAutoModelForVision2Seq.from_config(config, seed=training_args.seed, dtype=getattr(jnp, model_args.dtype))
+     model_class = model.__class__
+
+     # encoder_class = FlaxAutoModel
+     # decoder_class = FlaxAutoModelForCausalLM
+     module = model.module.bind(model.params)
+     encoder_class_name = type(module.encoder).__name__.replace("Module", "Model")
+     decoder_class_name = type(module.decoder).__name__.replace("Module", "Model")
+     encoder_class = getattr(transformers, encoder_class_name, None)
+     decoder_class = getattr(transformers, decoder_class_name, None)
+
+     if hasattr(model_class, "from_encoder_decoder_pretrained"):
+
+         if model_args.encoder_model_name_or_path:
+             encoder = encoder_class.from_pretrained(
+                 model_args.encoder_model_name_or_path,
+                 config=config.encoder,
+                 seed=training_args.seed,
+                 dtype=getattr(jnp, model_args.dtype)
+             )
+         else:
+             encoder = encoder_class(config=config.encoder, seed=training_args.seed, dtype=getattr(jnp, model_args.dtype))
+             logger.warning("You are instantiating a new model instance from scratch for the encoder.")
+
+         if model_args.decoder_model_name_or_path:
+             decoder = decoder_class.from_pretrained(
+                 model_args.decoder_model_name_or_path,
+                 config=config.decoder,
+                 seed=training_args.seed,
+                 dtype=getattr(jnp, model_args.dtype)
+             )
+         else:
+             decoder = decoder_class(config=config.decoder, seed=training_args.seed, dtype=getattr(jnp, model_args.dtype))
+             logger.warning("You are instantiating a new model instance from scratch for the decoder.")
+
+         model = model_class.from_encoder_decoder_pretrained(
+             model_args.encoder_model_name_or_path,
+             model_args.decoder_model_name_or_path,
+             encoder_model=encoder,
+             decoder_model=decoder,
+             encoder_config=config.encoder,
+             decoder_config=config.decoder,
+             encoder_seed=training_args.seed,
+             decoder_seed=training_args.seed,
+             encoder_dtype=getattr(jnp, model_args.dtype),
+             decoder_dtype=getattr(jnp, model_args.dtype),
+         )
+
+         # Set `encoder-decoder` (top-level) specific config
+         model.config.decoder_start_token_id = decoder_start_token_id
+         model.config.bos_token_id = bos_token_id
+         model.config.eos_token_id = eos_token_id
+         model.config.pad_token_id = pad_token_id
+
+     else:
+         logger.warning("You are instantiating a new model instance from scratch.")
+
+     feature_extractor = None
+     if model_args.feature_extractor_name:
+         feature_extractor = AutoFeatureExtractor.from_pretrained(
+             model_args.feature_extractor_name, cache_dir=model_args.cache_dir,
+         )
+     elif model_args.model_name_or_path:
+         try:
+             feature_extractor = AutoFeatureExtractor.from_pretrained(
+                 model_args.model_name_or_path, cache_dir=model_args.cache_dir
+             )
+         except ValueError as e:
+             logger.warning(e)
+     # Check encoder
+     if not feature_extractor:
+         if model_args.encoder_model_name_or_path:
+             feature_extractor = AutoFeatureExtractor.from_pretrained(
+                 model_args.encoder_model_name_or_path, cache_dir=model_args.cache_dir
+             )
+         else:
+             raise ValueError(
+                 "You are instantiating a new feature extractor from scratch. This is not supported by this script. "
+                 "You can do it from another script, save it, and load it from here, using --feature_extractor_name."
+             )
+
+     def get_tokenizer():
+
+         tokenizer = None
+         if model_args.tokenizer_name:
+             tokenizer = AutoTokenizer.from_pretrained(
+                 model_args.tokenizer_name, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer
+             )
+         elif model_args.model_name_or_path:
+             try:
+                 tokenizer = AutoTokenizer.from_pretrained(
+                     model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer
+                 )
+             except ValueError as e:
+                 logger.warning(e)
+
+         # Check decoder
+         if not tokenizer:
+             if model_args.decoder_model_name_or_path:
+                 tokenizer = AutoTokenizer.from_pretrained(
+                     model_args.decoder_model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer
+                 )
+             else:
+                 raise ValueError(
+                     "You are instantiating a new tokenizer from scratch. This is not supported by this script. "
+                     "You can do it from another script, save it, and load it from here, using --tokenizer_name."
+                 )
+         tokenizer.pad_token = tokenizer.convert_ids_to_tokens(config.pad_token_id)
+
+         return tokenizer
+
+     tokenizer = get_tokenizer()
+
+     # Preprocessing the datasets.
+     # We need to tokenize inputs and targets.
+     if training_args.do_train:
+         column_names = dataset["train"].column_names
+     elif training_args.do_eval:
+         column_names = dataset["validation"].column_names
+     elif training_args.do_predict:
+         column_names = dataset["test"].column_names
+     else:
+         logger.info("There is nothing to do. Please pass `do_train`, `do_eval` and/or `do_predict`.")
+         return
+
+     # Get the column names for input/target.
+     dataset_columns = image_captioning_name_mapping.get(data_args.dataset_name, None)
+     if data_args.image_column is None:
+         assert dataset_columns is not None
+         image_column = dataset_columns[0]
+     else:
+         image_column = data_args.image_column
+         if image_column not in column_names:
+             raise ValueError(
+                 f"'--image_column' value '{data_args.image_column}' needs to be one of: {', '.join(column_names)}"
+             )
+     if data_args.caption_column is None:
+         assert dataset_columns is not None
+         caption_column = dataset_columns[1]
+     else:
+         caption_column = data_args.caption_column
+         if caption_column not in column_names:
+             raise ValueError(
+                 f"'--caption_column' value '{data_args.caption_column}' needs to be one of: {', '.join(column_names)}"
+             )
+
+     # In Flax, for seq2seq models we need to pass `decoder_input_ids`
+     # as the Flax models don't accept `labels`, we need to prepare the decoder_input_ids here
+     # for that dynamically import the `shift_tokens_right` function from the model file
+     model_module = __import__(model.__module__, fromlist=["shift_tokens_right"])
+     shift_tokens_right_fn = getattr(model_module, "shift_tokens_right", shift_tokens_right)
+
+     def filter_fn(examples):
+
+         bools = []
+         for image_file in examples[image_column]:
+             with Image.open(image_file) as image:
+                 try:
+                     feature_extractor(images=image, return_tensors="np")
+                     bools.append(True)
+                 except Exception:
+                     bools.append(False)
+
+         return bools
+
+     # Setting padding="max_length" as we need fixed length inputs for jitted functions
+     def tokenization_fn(examples, max_target_length):
+
+         captions = []
+         for caption in examples[caption_column]:
+             captions.append(caption.lower() + ' ' + tokenizer.eos_token)
+
+         targets = captions
+
+         model_inputs = {}
+
+         # Setup the tokenizer for targets
+         with tokenizer.as_target_tokenizer():
+             labels = tokenizer(
+                 targets, max_length=max_target_length, padding="max_length", truncation=True, return_tensors="np"
+             )
+
+         model_inputs["labels"] = labels["input_ids"]
+         decoder_input_ids = shift_tokens_right_fn(
+             labels["input_ids"], config.pad_token_id, config.decoder_start_token_id
+         )
+         model_inputs["decoder_input_ids"] = np.asarray(decoder_input_ids)
+
+         # We need decoder_attention_mask so we can ignore pad tokens from loss
+         model_inputs["decoder_attention_mask"] = labels["attention_mask"]
+
+         model_inputs[image_column] = examples[image_column]
+
+         return model_inputs
+
+     def feature_extraction_fn(examples):
+
+         pixel_values = []
+
+         for image_file in examples[image_column]:
+             with Image.open(image_file) as image:
+                 encoder_inputs = feature_extractor(images=image, return_tensors="np")
+                 pixel_values.append(encoder_inputs.pixel_values)
+
+         pixel_values = np.concatenate(pixel_values)
+
+         model_inputs = examples
+         model_inputs['pixel_values'] = pixel_values
+
+         return model_inputs
+
+     features = datasets.Features(
+         {
+             "pixel_values": datasets.Array3D(
+                 shape=(
+                     getattr(config.encoder, "num_channels", 3),
+                     config.encoder.image_size,
+                     config.encoder.image_size,
+                 ),
+                 dtype="float32",
+             ),
+             "labels": datasets.Sequence(feature=datasets.Value(dtype='int32', id=None), length=-1, id=None),
+             "decoder_input_ids": datasets.Sequence(feature=datasets.Value(dtype='int32', id=None), length=-1, id=None),
+             "decoder_attention_mask": datasets.Sequence(feature=datasets.Value(dtype='int32', id=None), length=-1, id=None),
+         }
+     )
+
+     if training_args.do_train:
+
+         if "train" not in dataset:
+             raise ValueError("--do_train requires a train dataset")
+         train_dataset = dataset["train"]
+         # duplicate the train split to get a dataset large enough for the indexing benchmark below
+         train_dataset = datasets.concatenate_datasets([train_dataset] * 205)
+
+         # remove problematic examples
+         s = time.time()
+         train_dataset = train_dataset.filter(filter_fn, batched=True, num_proc=data_args.preprocessing_num_workers)
+         e = time.time()
+         print(f'filter time: {e-s}')
+         print(len(train_dataset))
+
+         rng = jax.random.PRNGKey(training_args.seed)
+         rng, input_rng = jax.random.split(rng)
+
+         s = time.time()
+         indices_jax = jax.random.permutation(input_rng, len(train_dataset))
+         e = time.time()
+         print(f'get permutation indices for the whole dataset with jax - time: {e-s}')
+
+         s = time.time()
+         indices_np = np.random.permutation(len(train_dataset))
+         e = time.time()
+         print(f'get permutation indices for the whole dataset with np - time: {e-s}')
+
+         # indices = jnp.arange(len(ds))
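+         # The timings above compare building the epoch permutation with jax.random
+         # (which returns a DeviceArray and may involve device compute/transfer)
+         # against plain numpy (host-only) - presumably the first suspect in the
+         # data-loading slowdown this debug script investigates.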
+
+         block_size = 4096
+         for idx in range(4):
+
+             start_idx = block_size * idx
+             end_idx = block_size * (idx + 1)
+
+             s = time.time()
+             selected_indices_jax = indices_jax[start_idx:end_idx]
+             e = time.time()
+             print(f'get block indices with jax - time: {e-s}')
+             print(type(selected_indices_jax))
+
+             s = time.time()
+             selected_indices_np = indices_np[start_idx:end_idx]
+             e = time.time()
+             print(f'get block indices with np - time: {e-s}')
+             print(type(selected_indices_np))
+
+
+             s = time.time()
+             _ds = train_dataset.select(selected_indices_jax)
+             e = time.time()
+             print(f'select block with jax - time: {e-s}')
+
+             s = time.time()
+             _ds = train_dataset.select(selected_indices_np)
+             e = time.time()
+             print(f'select block with np - time: {e-s}')
+
+             s = time.time()
+             _selected_indices_np = np.array(selected_indices_jax)
+             e = time.time()
+             print(f'convert jax to np - time: {e-s}')
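+             # `Dataset.select` is timed with both index types: a jax DeviceArray
+             # likely has to be materialized on the host before datasets can use it,
+             # and the explicit np.array() conversion above measures that cost in
+             # isolation (an assumption this debug run is checking).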
+
+
+             batch_size = 256
+
+             steps_per_epoch = len(_ds) // batch_size
+
+             s = time.time()
+             batch_idx_jax = jax.random.permutation(rng, len(_ds))
+             e = time.time()
+             print(f'get permutation indices for the block with jax - time: {e-s}')
+             # batch_idx = jnp.arange(len(dataset))
+
+             s = time.time()
+             batch_idx_np = np.random.permutation(len(_ds))
+             e = time.time()
+             print(f'get permutation indices for the block with np - time: {e-s}')
+
+             s = time.time()
+             batch_idx_jax = batch_idx_jax[: steps_per_epoch * batch_size]  # Skip incomplete batch.
+             e = time.time()
+             print(f'skip incomplete batch with jax - time: {e-s}')
+
+             s = time.time()
+             batch_idx_np = batch_idx_np[: steps_per_epoch * batch_size]  # Skip incomplete batch.
+             e = time.time()
+             print(f'skip incomplete batch with np - time: {e-s}')
+
+             s = time.time()
+             batch_idx_jax = batch_idx_jax.reshape((steps_per_epoch, batch_size))
+             e = time.time()
+             print(f'reshape block indices with jax - time: {e-s}')
+
+             s = time.time()
+             batch_idx_np = batch_idx_np.reshape((steps_per_epoch, batch_size))
+             e = time.time()
+             print(f'reshape block indices with np - time: {e-s}')
+
+             for idx in batch_idx_jax:
+
+                 s = time.time()
+                 batch = _ds[idx]
+                 e = time.time()
+                 print(f'get one batch with jax - time: {e-s}')
+
+                 # s = time.time()
+                 # batch = {k: jnp.array(v) for k, v in batch.items()}
+                 # e = time.time()
+                 # print(f'convert one batch to jnp time: {e-s}')
+
+             for idx in batch_idx_np:
+
+                 s = time.time()
+                 batch = _ds[idx]
+                 e = time.time()
+                 print(f'get one batch with np - time: {e-s}')
+
+
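+         # End of the indexing benchmark: the script exits here on purpose, so the
+         # training/evaluation code below is never reached in this debug run.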
+         exit(0)
+
+
+     if training_args.do_predict:
+         if "test" not in dataset:
+             raise ValueError("--do_predict requires a test dataset")
+         predict_dataset = dataset["test"]
+         # remove problematic examples
+         predict_dataset = predict_dataset.filter(filter_fn, batched=True, num_proc=data_args.preprocessing_num_workers)
+         if data_args.max_predict_samples is not None:
+             predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
+         predict_dataset = predict_dataset.map(
+             tokenization_fn,
+             batched=True,
+             num_proc=data_args.preprocessing_num_workers,
+             # keep image paths
+             remove_columns=[x for x in column_names if x != image_column],
+             load_from_cache_file=not data_args.overwrite_cache,
+             desc="Running tokenizer on prediction dataset",
+             fn_kwargs={"max_target_length": data_args.val_max_target_length},
+         )
+
+     tokenizer = get_tokenizer()
+
+     # Split the dataset into several chunks - each chunk is processed (.map) without cache to create a
+     # data loader separately (in a sequential order).
+     block_size = training_args.block_size
+
+     # Store some constants
+
+     train_batch_size = int(training_args.per_device_train_batch_size) * jax.device_count()
+
+     if training_args.do_train:
+         steps_per_epoch = len(train_dataset) // train_batch_size
+         num_train_examples_per_epoch = steps_per_epoch * train_batch_size
+         num_epochs = int(training_args.num_train_epochs)
+         total_train_steps = steps_per_epoch * num_epochs
+     else:
+         num_train_examples_per_epoch = 0
+
+     eval_batch_size = int(training_args.per_device_eval_batch_size) * jax.device_count()
+
+     if training_args.do_eval:
+         num_eval_examples = len(eval_dataset)
+         eval_steps = num_eval_examples // eval_batch_size + int(num_eval_examples % eval_batch_size > 0)
+
+     if training_args.do_predict:
+         num_test_examples = len(predict_dataset)
+         test_steps = num_test_examples // eval_batch_size + int(num_test_examples % eval_batch_size > 0)
+
+     def get_batch_iter(rng, ds, block_size, batch_size, shuffle=False, drop_last_batch=False, keep_in_memory=False, split=""):
+
+         if not block_size:
+             block_size = len(ds)
+
+         steps_per_split = block_size // batch_size
+         num_examples = len(ds)
+         steps = num_examples // batch_size + int(num_examples % batch_size > 0 and not drop_last_batch)
+         num_splits = steps // steps_per_split + int(steps % steps_per_split > 0)
+
+         if shuffle:
+             indices = jax.random.permutation(rng, len(ds))
+         else:
+             indices = jnp.arange(len(ds))
+
+         for idx in range(num_splits):
+
+             start_idx = block_size * idx
+             end_idx = block_size * (idx + 1)
+
+             selected_indices = indices[start_idx:end_idx]
+
+             _ds = ds.select(selected_indices)
+
+             names = {
+                 "train": "train",
+                 "valid": "validation",
+                 "test": "prediction",
+             }
+
+             _ds = _ds.map(
+                 feature_extraction_fn,
+                 batched=True,
+                 num_proc=data_args.preprocessing_num_workers,
+                 remove_columns=[image_column],
+                 load_from_cache_file=not data_args.overwrite_cache,
+                 features=features,
+                 keep_in_memory=keep_in_memory,
+                 desc=f"Running feature extraction on {names[split]} dataset",
+             )
+             _ds = _ds.with_format("numpy")
+
+             # No need to shuffle here
+             loader = data_loader(rng, _ds, batch_size=batch_size, shuffle=False)
+
+             for batch in loader:
+                 yield batch
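+     # The idea seems to be that only one `block_size` chunk is feature-extracted
+     # (and optionally kept in memory) at a time, bounding peak memory while the
+     # iterator still streams full batches across all chunks in order.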
+
+     # Metric
+     metric = load_metric("rouge")
+
+     def postprocess_text(preds, labels):
+         preds = [pred.strip() for pred in preds]
+         labels = [label.strip() for label in labels]
+
+         # rougeLSum expects newline after each sentence
+         preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds]
+         labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels]
+
+         return preds, labels
+
+     def compute_metrics(preds, labels):
+         decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
+         decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
+
+         # Some simple post-processing
+         decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
+
+         result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
+         # Extract a few results from ROUGE
+         result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
+
+         prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
+         result["gen_len"] = np.mean(prediction_lens)
+         result = {k: round(v, 6) for k, v in result.items()}
+
+         return result, decoded_preds, decoded_labels
+
+     # Enable tensorboard only on the master node
+     has_tensorboard = is_tensorboard_available()
+     if has_tensorboard and jax.process_index() == 0:
+         try:
+             from flax.metrics.tensorboard import SummaryWriter
+
+             summary_writer = SummaryWriter(log_dir=Path(training_args.output_dir))
+         except ImportError as ie:
+             has_tensorboard = False
+             logger.warning(
+                 f"Unable to display metrics through TensorBoard because some packages are not installed: {ie}"
+             )
+     else:
+         logger.warning(
+             "Unable to display metrics through TensorBoard because the package is not installed: "
+             "Please run pip install tensorboard to enable."
+         )
+
+     # Initialize our training
+     rng = jax.random.PRNGKey(training_args.seed)
+     rng, dropout_rng = jax.random.split(rng)
+
+     # Create learning rate schedule
+     linear_decay_lr_schedule_fn = create_learning_rate_fn(
+         num_train_examples_per_epoch,
+         train_batch_size,
+         training_args.num_train_epochs,
+         training_args.warmup_steps,
+         training_args.learning_rate,
+     )
+
+     # We use Optax's "masking" functionality to not apply weight decay
+     # to bias and LayerNorm scale parameters. decay_mask_fn returns a
+     # mask boolean with the same structure as the parameters.
+     # The mask is True for parameters that should be decayed.
+     # Note that this mask is specifically adapted for FlaxBart.
+     # For FlaxT5, one should correct the layer norm parameter naming
+     # accordingly - see `run_t5_mlm_flax.py` e.g.
+     def decay_mask_fn(params):
+         flat_params = traverse_util.flatten_dict(params)
+         layer_norm_params = [
+             (name, "scale") for name in ["self_attn_layer_norm", "layernorm_embedding", "final_layer_norm"]
+         ]
+         flat_mask = {path: (path[-1] != "bias" and path[-2:] not in layer_norm_params) for path in flat_params}
+         return traverse_util.unflatten_dict(flat_mask)
+
+     # create adam optimizer
+     adamw = optax.adamw(
+         learning_rate=linear_decay_lr_schedule_fn,
+         b1=training_args.adam_beta1,
+         b2=training_args.adam_beta2,
+         eps=training_args.adam_epsilon,
+         weight_decay=training_args.weight_decay,
+         mask=decay_mask_fn,
+     )
+
+     # Setup train state
+     state = TrainState.create(apply_fn=model.__call__, params=model.params, tx=adamw, dropout_rng=dropout_rng)
+
+     # label smoothed cross entropy
+     def loss_fn(logits, labels, padding_mask, label_smoothing_factor=0.0):
+         """
+         The label smoothing implementation is adapted from Flax's official example:
+         https://github.com/google/flax/blob/87a211135c6a377c8f29048a1cac3840e38b9da4/examples/wmt/train.py#L104
+         """
+         vocab_size = logits.shape[-1]
+         confidence = 1.0 - label_smoothing_factor
+         low_confidence = (1.0 - confidence) / (vocab_size - 1)
+         normalizing_constant = -(
+             confidence * jnp.log(confidence) + (vocab_size - 1) * low_confidence * jnp.log(low_confidence + 1e-20)
+         )
+         soft_labels = onehot(labels, vocab_size, on_value=confidence, off_value=low_confidence)
+
+         loss = optax.softmax_cross_entropy(logits, soft_labels)
+         loss = loss - normalizing_constant
+
+         # ignore padded tokens from loss
+         loss = loss * padding_mask
+         loss = loss.sum() / padding_mask.sum()
+         return loss
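+     # Sanity check: with label_smoothing_factor=0.0 this reduces to standard
+     # cross entropy, since confidence=1.0, low_confidence=0.0 and the
+     # normalizing constant evaluates to 0.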
+
+     # Define gradient update step fn
+     def train_step(state, batch, label_smoothing_factor=0.0):
+         dropout_rng, new_dropout_rng = jax.random.split(state.dropout_rng)
+
+         def compute_loss(params):
+             labels = batch.pop("labels")
+             logits = state.apply_fn(**batch, params=params, dropout_rng=dropout_rng, train=True)[0]
+             loss = loss_fn(logits, labels, batch["decoder_attention_mask"], label_smoothing_factor)
+             return loss
+
+         grad_fn = jax.value_and_grad(compute_loss)
+         loss, grad = grad_fn(state.params)
+         grad = jax.lax.pmean(grad, "batch")
+
+         new_state = state.apply_gradients(grads=grad, dropout_rng=new_dropout_rng)
+
+         metrics = {"loss": loss, "learning_rate": linear_decay_lr_schedule_fn(state.step)}
+         metrics = jax.lax.pmean(metrics, axis_name="batch")
+
+         return new_state, metrics
+
+     # Define eval fn
+     def eval_step(params, batch, label_smoothing_factor=0.0):
+         labels = batch.pop("labels")
+         logits = model(**batch, params=params, train=False)[0]
+         loss = loss_fn(logits, labels, batch["decoder_attention_mask"], label_smoothing_factor)
+
+         # summarize metrics
+         metrics = {"loss": loss}
+         metrics = jax.lax.pmean(metrics, axis_name="batch")
+         return metrics
+
+     # Define generation function
+     max_length = (
+         data_args.val_max_target_length if data_args.val_max_target_length is not None else model.config.max_length
+     )
+     num_beams = data_args.num_beams if data_args.num_beams is not None else model.config.num_beams
+     gen_kwargs = {"max_length": max_length, "num_beams": num_beams}
+
+     def generate_step(params, batch):
+         model.params = params
+         output_ids = model.generate(batch['pixel_values'], **gen_kwargs)
+         return output_ids.sequences
+
+     # Create parallel version of the train and eval step
+     p_train_step = jax.pmap(
+         partial(train_step, label_smoothing_factor=training_args.label_smoothing_factor), "batch", donate_argnums=(0,)
+     )
+     p_eval_step = jax.pmap(partial(eval_step, label_smoothing_factor=training_args.label_smoothing_factor), "batch")
+     p_generate_step = jax.pmap(generate_step, "batch")
+
+     # Replicate the train state on each device
+     state = state.replicate()
+
+     if training_args.do_train:
+         logger.info("***** Running training *****")
+         logger.info(f" Num train examples = {len(train_dataset)}")
+         logger.info(f" Num train examples per epoch = {num_train_examples_per_epoch}")
+         logger.info(f" Num Epochs = {num_epochs}")
+         logger.info(f" Instantaneous train batch size per device = {training_args.per_device_train_batch_size}")
+         logger.info(f" Total train batch size (w. parallel & distributed) = {train_batch_size}")
+         logger.info(f" Optimization steps per epoch = {steps_per_epoch}")
+         logger.info(f" Total optimization steps = {total_train_steps}")
+     if training_args.do_eval:
+         logger.info(f" Num evaluation examples = {num_eval_examples}")
+         logger.info(f" Instantaneous evaluation batch size per device = {training_args.per_device_eval_batch_size}")
+         logger.info(f" Total evaluation batch size (w. parallel & distributed) = {eval_batch_size}")
+         logger.info(f" Evaluation steps = {eval_steps}")
+     if training_args.do_predict:
+         logger.info(f" Num test examples = {num_test_examples}")
+         logger.info(f" Instantaneous test batch size per device = {training_args.per_device_eval_batch_size}")
+         logger.info(f" Total test batch size (w. parallel & distributed) = {eval_batch_size}")
+         logger.info(f" Test steps = {test_steps}")
+
+     # create output directory
+     if not os.path.isdir(os.path.join(training_args.output_dir)):
+         os.makedirs(os.path.join(training_args.output_dir), exist_ok=True)
+
+     def save_results(epoch, step):
+
+         # save checkpoint after each epoch and push checkpoint to the hub
+         if jax.process_index() == 0:
+             params = jax.device_get(jax.tree_map(lambda x: x[0], state.params))
+             dir_name = f'ckpt_epoch_{epoch + 1}_step_{step}'
+             model.save_pretrained(os.path.join(training_args.output_dir, dir_name), params=params)
+             tokenizer.save_pretrained(os.path.join(training_args.output_dir, dir_name))
+             if training_args.push_to_hub:
+                 commit_msg = f"Saving weights and logs of epoch {epoch + 1} - step {step}"
+                 repo.push_to_hub(commit_message=commit_msg, blocking=False)
+
+     def run_eval_or_test(rng, dataset, name, is_inside_training=True):
+
+         if name not in ["valid", "test"]:
+             raise ValueError(f"`name` must be either \"valid\" or \"test\". Got {name} instead.")
+
+         logger.info(f"*** {'Predict' if name == 'test' else 'Evaluate'} ***")
+
+         metrics = []
+         preds = []
+         labels = []
+
+         batches = get_batch_iter(rng, dataset, block_size=block_size, batch_size=eval_batch_size, keep_in_memory=False, shuffle=False, split=name)
+         steps = len(dataset) // eval_batch_size + int(len(dataset) % eval_batch_size > 0)
+         for _ in tqdm(range(steps), desc=f"{'Predicting' if name == 'test' else 'Evaluating'}...", position=2, leave=False):
+             # Model forward
+             batch = next(batches)
+             _labels = batch.get("labels", None)
+             if name == "valid" and _labels is None:
+                 raise ValueError("Validation dataset requires `labels`")
+
+             if _labels is not None:
+                 _metrics = p_eval_step(state.params, batch)
+                 metrics.append(_metrics)
+
+             # generation
+             if data_args.predict_with_generate:
+                 generated_ids = p_generate_step(state.params, batch)
+                 preds.extend(jax.device_get(generated_ids.reshape(-1, gen_kwargs["max_length"])))
+                 if _labels is not None:
+                     labels.extend(jax.device_get(_labels.reshape(-1, _labels.shape[-1])))
+
+         if metrics:
+             # normalize metrics
+             metrics = get_metrics(metrics)
+             metrics = jax.tree_map(jnp.mean, metrics)
+
+         # compute ROUGE metrics
+         generations = []
+         rouge_desc = ""
+         if data_args.predict_with_generate:
+             if labels:
+                 rouge_metrics, decoded_preds, decoded_labels = compute_metrics(preds, labels)
+                 metrics.update(rouge_metrics)
+                 rouge_desc = " ".join([f"{'Predict' if name == 'test' else 'Eval'} {key}: {value} |" for key, value in rouge_metrics.items()])
+                 for pred, label in zip(decoded_preds, decoded_labels):
+                     pred = pred.replace("\n", " ")
+                     label = label.replace("\n", " ")
+                     generations.append({"label": label, "pred": pred})
+             else:
+                 decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
+                 # Some simple post-processing
+                 decoded_preds = [pred.strip() for pred in decoded_preds]
+                 # rougeLSum expects newline after each sentence
+                 decoded_preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in decoded_preds]
+                 for pred in decoded_preds:
+                     pred = pred.replace("\n", " ")
+                     generations.append({"pred": pred})
+
+         if metrics:
+             # Print metrics and update progress bar
+             desc = f"{'Predict' if name == 'test' else 'Eval'} Loss: {metrics['loss']} | {rouge_desc})"
+             if is_inside_training:
+                 desc = f"Epoch... ({epoch + 1}/{num_epochs} | Step: {cur_step} | " + desc
+                 epochs.write(desc)
+                 epochs.desc = desc
+             logger.info(desc)
+
+         if jax.process_index() == 0:
+
+             ckpt_dir = ""
+             if is_inside_training:
+                 ckpt_dir = f'ckpt_epoch_{epoch + 1}_step_{cur_step}'
+                 if not os.path.isdir(os.path.join(training_args.output_dir, ckpt_dir)):
+                     os.makedirs(os.path.join(training_args.output_dir, ckpt_dir), exist_ok=True)
+
+             if metrics:
+
+                 # save final metrics in json
+                 metrics = {f"{name}_{metric_name}": round(value.item(), 6) for metric_name, value in metrics.items()}
+                 path = os.path.join(training_args.output_dir, ckpt_dir, f"{name}_results.json")
+                 with open(path, "w") as f:
+                     json.dump(metrics, f, indent=4, sort_keys=True)
+
+                 # Update report
+                 with open(os.path.join(training_args.output_dir, 'report.txt'), 'a', encoding='UTF-8') as fp:
+                     fp.write(desc + '\n')
+
+                 # Save metrics
+                 if has_tensorboard and is_inside_training:
+                     write_metric(summary_writer, name, metrics, cur_step)
+
+             # Save generations
+             if generations:
+                 with open(os.path.join(training_args.output_dir, ckpt_dir, f'generation_{name}.json'), 'w', encoding='UTF-8') as fp:
+                     json.dump(generations, fp, ensure_ascii=False, indent=4)
+
+     input_rng = None
+
+     if training_args.do_train:
+
+         cur_step = 0
+         train_time = 0
+         epochs = tqdm(range(num_epochs), desc=f"Epoch ... (1/{num_epochs})", position=0)
+
+         for epoch in epochs:
+
+             # ======================== Training ================================
+
+             # Create sampling rng
+             rng, input_rng = jax.random.split(rng)
+
+             train_metrics = []
+
+             train_batches = get_batch_iter(input_rng, train_dataset, block_size=block_size, batch_size=train_batch_size, keep_in_memory=True, shuffle=True, drop_last_batch=training_args.dataloader_drop_last, split="train")
+
+             # train
+             for (batch_idx, _) in enumerate(tqdm(range(steps_per_epoch), desc="Training...", position=1, leave=False)):
+
+                 cur_step += 1
+                 batch = next(train_batches)
+                 batch_start = time.time()
+                 state, train_metric = p_train_step(state, batch)
+                 train_metrics.append(train_metric)
+                 train_time += time.time() - batch_start
+
+                 if cur_step % training_args.logging_steps == 0 or (training_args.eval_steps is not None and cur_step % training_args.eval_steps == 0) or cur_step % steps_per_epoch == 0:
+
+                     time_per_step = train_time / cur_step
+
+                     _train_metric = unreplicate(train_metric)
+                     desc = f"Epoch... ({epoch + 1}/{num_epochs} | Step: {cur_step} | Loss: {_train_metric['loss']} | Learning Rate: {_train_metric['learning_rate']} | Time per step: {time_per_step})"
+                     epochs.desc = desc
+                     epochs.write(desc)
+                     logger.info(desc)
+                     with open(os.path.join(training_args.output_dir, 'report.txt'), 'a', encoding='UTF-8') as fp:
+                         fp.write(desc + '\n')
+
+                     # Save metrics
+                     if has_tensorboard and jax.process_index() == 0:
+                         write_metric(summary_writer, "train", train_metrics, cur_step, train_time=train_time)
+
+                 # ======================== Evaluating ==============================
+
+                 if training_args.do_eval and ((training_args.eval_steps is not None and cur_step % training_args.eval_steps == 0) or cur_step % steps_per_epoch == 0):
+                     run_eval_or_test(input_rng, eval_dataset, name="valid", is_inside_training=True)
+
+                     # ======================== Prediction loop ==============================
+
+                     # run prediction after evaluation if specified, otherwise only after each epoch
+                     if training_args.do_predict and training_args.do_predict_during_training and training_args.do_predict_after_evaluation:
+                         run_eval_or_test(input_rng, predict_dataset, name='test', is_inside_training=True)
+
+                     # ======================== Save ==============================
+
+                     save_results(epoch, cur_step)
+
+             # run prediction after each epoch (if not done during training)
+             if training_args.do_predict and training_args.do_predict_during_training and not training_args.do_predict_after_evaluation:
+                 run_eval_or_test(input_rng, predict_dataset, name='test', is_inside_training=True)
+                 save_results(epoch, cur_step)
+
+     # Create sampling rng
+     if input_rng is None:
+         rng, input_rng = jax.random.split(rng)
+
+     # run prediction at the end (if not done during training)
+     if training_args.do_predict and not (training_args.do_train and training_args.do_predict_during_training):
+         run_eval_or_test(input_rng, predict_dataset, name='test', is_inside_training=False)
+
+
+ if __name__ == "__main__":
+
+     main()