ydshieh committed
Commit 3bffad7
1 Parent(s): a1a9885

Use new FlaxVisionEncoderDecoderModel class

Files changed (2):
  1. run_image_caption.py +255 -101
  2. run_summarization_flax.py +265 -100
run_image_caption.py CHANGED
@@ -18,11 +18,6 @@ Fine-tuning the library models for summarization.
 """
 # You can also adapt this script on your own sequence to sequence task. Pointers for this are left as comments.
 
-import sys, os
-
-current_path = os.path.dirname(os.path.abspath(__file__))
-sys.path.append(current_path)
-
 import logging
 import os
 import sys
@@ -48,20 +43,21 @@ from flax import jax_utils, traverse_util
 from flax.jax_utils import unreplicate
 from flax.training import train_state
 from flax.training.common_utils import get_metrics, onehot, shard, shard_prng_key
+from huggingface_hub import Repository
 from transformers import (
     CONFIG_MAPPING,
-    FLAX_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
+    FLAX_MODEL_FOR_VISION_2_SEQ_MAPPING,
     AutoConfig,
+    AutoFeatureExtractor,
     AutoTokenizer,
     FlaxAutoModelForSeq2SeqLM,
     HfArgumentParser,
     TrainingArguments,
     is_tensorboard_available,
+    FlaxAutoModelForVision2Seq,
 )
-from transformers.file_utils import is_offline_mode
+from transformers.file_utils import get_full_repo_name, is_offline_mode
 
-from transformers import ViTFeatureExtractor, GPT2Tokenizer, GPT2Config
-from vit_gpt2.modeling_flax_vit_gpt2_lm import FlaxViTGPT2LMForConditionalGeneration
 
 logger = logging.getLogger(__name__)
 
@@ -76,10 +72,23 @@ except (LookupError, OSError):
     nltk.download("punkt", quiet=True)
 
 
-MODEL_CONFIG_CLASSES = list(FLAX_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING.keys())
+MODEL_CONFIG_CLASSES = list(FLAX_MODEL_FOR_VISION_2_SEQ_MAPPING.keys())
 MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
 
 
+# Copied from transformers.models.bart.modeling_flax_bart.shift_tokens_right
+def shift_tokens_right(input_ids: jnp.ndarray, pad_token_id: int, decoder_start_token_id: int) -> jnp.ndarray:
+    """
+    Shift input ids one token to the right.
+    """
+    shifted_input_ids = np.zeros_like(input_ids)
+    shifted_input_ids[:, 1:] = input_ids[:, :-1]
+    shifted_input_ids[:, 0] = decoder_start_token_id
+
+    shifted_input_ids = np.where(shifted_input_ids == -100, pad_token_id, shifted_input_ids)
+    return shifted_input_ids
+
+
 @dataclass
 class ModelArguments:
     """
@@ -93,15 +102,46 @@ class ModelArguments:
             "Don't set if you want to train a model from scratch."
         },
     )
+    encoder_model_name_or_path: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "The encoder model checkpoint for weights initialization. "
+            "Don't set if you want to train a model from scratch."
+        },
+    )
+    decoder_model_name_or_path: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "The decoder model checkpoint for weights initialization. "
+            "Don't set if you want to train a model from scratch."
+        },
+    )
     model_type: Optional[str] = field(
         default=None,
         metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)},
     )
+    encoder_model_type: Optional[str] = field(
+        default=None,
+        metadata={"help": "If training from scratch, pass an encoder model type from the list: " + ", ".join(MODEL_TYPES)},
+    )
+    decoder_model_type: Optional[str] = field(
+        default=None,
+        metadata={"help": "If training from scratch, pass a decoder model type from the list: " + ", ".join(MODEL_TYPES)},
+    )
     config_name: Optional[str] = field(
         default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
     )
+    encoder_config_name: Optional[str] = field(
+        default=None, metadata={"help": "Pretrained config name or path if not the same as encoder_model_name"}
+    )
+    decoder_config_name: Optional[str] = field(
+        default=None, metadata={"help": "Pretrained config name or path if not the same as decoder_model_name"}
+    )
+    feature_extractor_name: Optional[str] = field(
+        default=None, metadata={"help": "Pretrained feature extractor name or path if not the same as encoder_model_name"}
+    )
     tokenizer_name: Optional[str] = field(
-        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
+        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as decoder_model_name"}
     )
     cache_dir: Optional[str] = field(
         default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
@@ -130,19 +170,26 @@ class DataTrainingArguments:
     dataset_config_name: Optional[str] = field(
         default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
     )
-    text_column: Optional[str] = field(
+    data_dir: Optional[str] = field(
+        default=None, metadata={"help": "The data directory of the dataset to use (via the datasets library)."}
+    )
+    image_column: Optional[str] = field(
         default=None,
-        metadata={"help": "The name of the column in the datasets containing the full texts (for summarization)."},
+        metadata={"help": "The name of the column in the datasets containing the full image file paths (for image captioning)."},
     )
-    summary_column: Optional[str] = field(
+    caption_column: Optional[str] = field(
         default=None,
-        metadata={"help": "The name of the column in the datasets containing the summaries (for summarization)."},
+        metadata={"help": "The name of the column in the datasets containing the image captions (for image captioning)."},
     )
     train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."})
     validation_file: Optional[str] = field(
         default=None,
         metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."},
     )
+    test_file: Optional[str] = field(
+        default=None,
+        metadata={"help": "An optional input predict data file to do prediction on (a text file)."},
+    )
     max_source_length: Optional[int] = field(
         default=1024,
         metadata={
@@ -191,9 +238,6 @@ class DataTrainingArguments:
         default=None,
         metadata={"help": "The number of processes to use for the preprocessing."},
     )
-    source_prefix: Optional[str] = field(
-        default=None, metadata={"help": "A prefix to add before every source text (useful for T5 models)."}
-    )
     predict_with_generate: bool = field(
         default=False, metadata={"help": "Whether to use generate to calculate generative metrics (ROUGE, BLEU)."}
     )
@@ -222,18 +266,8 @@ class DataTrainingArguments:
             self.val_max_target_length = self.max_target_length
 
 
-summarization_name_mapping = {
-    "amazon_reviews_multi": ("review_body", "review_title"),
-    "big_patent": ("description", "abstract"),
-    "cnn_dailymail": ("article", "highlights"),
-    "orange_sum": ("text", "summary"),
-    "pn_summary": ("article", "summary"),
-    "psc": ("extract_text", "summary_text"),
-    "samsum": ("dialogue", "summary"),
-    "thaisum": ("body", "summary"),
-    "xglue": ("news_body", "news_title"),
-    "xsum": ("document", "summary"),
-    "wiki_summary": ("article", "highlights"),
+image_captioning_name_mapping = {
+    "image_caption_dataset.py": ("image_file", "caption"),
 }
 
 
@@ -337,6 +371,16 @@ def main():
     # Set the verbosity to info of the Transformers logger (on main process only):
     logger.info(f"Training/evaluation parameters {training_args}")
 
+    # Handle the repository creation
+    if training_args.push_to_hub:
+        if training_args.hub_model_id is None:
+            repo_name = get_full_repo_name(
+                Path(training_args.output_dir).absolute().name, token=training_args.hub_token
+            )
+        else:
+            repo_name = training_args.hub_model_id
+        repo = Repository(training_args.output_dir, clone_from=repo_name)
+
     # Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below)
     # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
     # (the dataset will be downloaded automatically from the datasets Hub).
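Note: a minimal sketch of how the `Repository` wiring added above is used end to end; the repo id and output directory are illustrative, not from the commit:

```python
from huggingface_hub import Repository

# clone (or reuse) the target Hub repo into the training output directory
repo = Repository("output_dir", clone_from="username/image-captioning-demo")

# ... training writes checkpoints and report.txt into output_dir ...

# non-blocking push, the same call the epoch loop makes at the end of each epoch
repo.push_to_hub(commit_message="Saving weights and logs of epoch 1", blocking=False)
```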
@@ -347,7 +391,7 @@ def main():
     if data_args.dataset_name is not None:
         # Downloading and loading a dataset from the hub.
         dataset = load_dataset(
-            data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir, keep_in_memory=False, data_dir='/home/33611/caption/'
+            data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir, keep_in_memory=False, data_dir=data_args.data_dir
         )
     else:
         data_files = {}
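Note: with the hard-coded `data_dir='/home/33611/caption/'` replaced by the new `--data_dir` argument, the script expects a dataset whose rows pair an image path with a caption. A hypothetical JSON-lines layout and load, for illustration only (file name and paths are made up):

```python
from datasets import load_dataset

# each line of train_captions.jsonl:
# {"image_file": "/data/images/0001.jpg", "caption": "Un chat sur un canapé."}
dataset = load_dataset("json", data_files={"train": "train_captions.jsonl"})
example = dataset["train"][0]
print(example["image_file"], example["caption"])
```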
@@ -360,38 +404,152 @@ def main():
         if data_args.test_file is not None:
             data_files["test"] = data_args.test_file
             extension = data_args.test_file.split(".")[-1]
-        dataset = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
+        # TODO: Check
+        dataset = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir, data_dir=data_args.data_dir)
+    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
+    # https://huggingface.co/docs/datasets/loading_datasets.html.
 
-    vit_name_path = 'google/vit-base-patch16-224-in21k'
-    gpt2_name_path = 'asi/gpt-fr-cased-small'
-
-    gpt2_config = GPT2Config.from_pretrained(gpt2_name_path)
-    gpt2_config.add_cross_attention = True
-
-    vit_gpt2_name_path = ''
-
-    feature_extractor = ViTFeatureExtractor.from_pretrained(vit_name_path)
-
-    tokenizer = GPT2Tokenizer.from_pretrained(gpt2_name_path)
-
-    if not vit_gpt2_name_path:
-        assert vit_name_path
-        assert gpt2_name_path
-        vit_gpt2_model = FlaxViTGPT2LMForConditionalGeneration.from_vit_gpt2_pretrained(
-            vit_name_path, gpt2_name_path
-        )
-    else:
-        vit_gpt2_model = FlaxViTGPT2LMForConditionalGeneration.from_pretrained(
-            vit_gpt2_name_path
-        )
-
-    model = vit_gpt2_model
-    model.config.is_encoder_decoder = True
-    model.config.decoder_start_token_id = gpt2_config.bos_token_id
-    model.config.bos_token_id = gpt2_config.bos_token_id
-    model.config.eos_token_id = gpt2_config.eos_token_id
-    model.config.pad_token_id = gpt2_config.pad_token_id
+    # Load pretrained model and tokenizer
+
+    encoder_cache_dir, decoder_cache_dir = None, None
+    if model_args.cache_dir:
+        encoder_cache_dir = os.path.join(model_args.cache_dir, "encoder")
+        decoder_cache_dir = os.path.join(model_args.cache_dir, "decoder")
+
+    if model_args.config_name:
+        config = AutoConfig.from_pretrained(model_args.config_name, cache_dir=model_args.cache_dir)
+    elif model_args.model_name_or_path:
+        config = AutoConfig.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
+    elif getattr(CONFIG_MAPPING[model_args.model_type], "from_encoder_decoder_configs", None):
+        config_class = CONFIG_MAPPING[model_args.model_type]
+
+        if model_args.encoder_config_name:
+            encoder_config = AutoConfig.from_pretrained(model_args.encoder_config_name, cache_dir=encoder_cache_dir)
+        elif model_args.encoder_model_name_or_path:
+            encoder_config = AutoConfig.from_pretrained(model_args.encoder_model_name_or_path, cache_dir=encoder_cache_dir)
+        else:
+            encoder_config = CONFIG_MAPPING[model_args.encoder_model_type]()
+            logger.warning("You are instantiating a new config instance from scratch for the encoder.")
+
+        if model_args.decoder_config_name:
+            decoder_config = AutoConfig.from_pretrained(model_args.decoder_config_name, cache_dir=decoder_cache_dir)
+        elif model_args.decoder_model_name_or_path:
+            decoder_config = AutoConfig.from_pretrained(model_args.decoder_model_name_or_path, cache_dir=decoder_cache_dir)
+        else:
+            decoder_config = CONFIG_MAPPING[model_args.decoder_model_type]()
+            logger.warning("You are instantiating a new config instance from scratch for the decoder.")
+
+        logger.info("Setting `config.is_decoder=True` and `config.add_cross_attention=True` for decoder_config")
+        decoder_config.is_decoder = True
+        decoder_config.add_cross_attention = True
+
+        config = config_class.from_encoder_decoder_configs(encoder_config, decoder_config)
+    else:
+        config = CONFIG_MAPPING[model_args.model_type]()
+        logger.warning("You are instantiating a new config instance from scratch.")
+
+    decoder_start_token_id = getattr(config, "decoder_start_token_id", None)
+    if not decoder_start_token_id and getattr(config, "decoder", None):
+        decoder_start_token_id = getattr(config.decoder, "decoder_start_token_id", None)
+    bos_token_id = getattr(config, "bos_token_id", None)
+    if not bos_token_id and getattr(config, "decoder", None):
+        bos_token_id = getattr(config.decoder, "bos_token_id", None)
+    eos_token_id = getattr(config, "eos_token_id", None)
+    if not eos_token_id and getattr(config, "decoder", None):
+        eos_token_id = getattr(config.decoder, "eos_token_id", None)
+    pad_token_id = getattr(config, "pad_token_id", None)
+    if not pad_token_id and getattr(config, "decoder", None):
+        pad_token_id = getattr(config.decoder, "pad_token_id", None)
+
+    if decoder_start_token_id is None:
+        decoder_start_token_id = bos_token_id
+    if pad_token_id is None:
+        pad_token_id = eos_token_id
+
+    config.decoder_start_token_id = decoder_start_token_id
+    config.bos_token_id = bos_token_id
+    config.eos_token_id = eos_token_id
+    config.pad_token_id = pad_token_id
+
+    if getattr(config, "decoder", None):
+        config.decoder.decoder_start_token_id = decoder_start_token_id
+        config.decoder.bos_token_id = bos_token_id
+        config.decoder.eos_token_id = eos_token_id
+        config.decoder.pad_token_id = pad_token_id
+
+    feature_extractor = None
+    if model_args.feature_extractor_name:
+        feature_extractor = AutoFeatureExtractor.from_pretrained(
+            model_args.feature_extractor_name, cache_dir=model_args.cache_dir,
+        )
+    elif model_args.model_name_or_path:
+        try:
+            feature_extractor = AutoFeatureExtractor.from_pretrained(
+                model_args.model_name_or_path, cache_dir=model_args.cache_dir
+            )
+        except ValueError as e:
+            logger.warning(e)
+    if not feature_extractor:
+        if model_args.encoder_model_name_or_path:
+            feature_extractor = AutoFeatureExtractor.from_pretrained(
+                model_args.encoder_model_name_or_path, cache_dir=model_args.cache_dir
+            )
+        else:
+            raise ValueError(
+                "You are instantiating a new feature extractor from scratch. This is not supported by this script. "
+                "You can do it from another script, save it, and load it from here, using --feature_extractor_name."
+            )
+
+    tokenizer = None
+    if model_args.tokenizer_name:
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_args.tokenizer_name, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer
+        )
+    elif model_args.model_name_or_path:
+        try:
+            tokenizer = AutoTokenizer.from_pretrained(
+                model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer
+            )
+        except ValueError as e:
+            logger.warning(e)
+    if not tokenizer:
+        if model_args.decoder_model_name_or_path:
+            tokenizer = AutoTokenizer.from_pretrained(
+                model_args.decoder_model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer
+            )
+        else:
+            raise ValueError(
+                "You are instantiating a new tokenizer from scratch. This is not supported by this script. "
+                "You can do it from another script, save it, and load it from here, using --tokenizer_name."
+            )
+    tokenizer.pad_token = tokenizer.convert_ids_to_tokens(config.pad_token_id)
+
+    if model_args.model_name_or_path:
+        model = FlaxAutoModelForVision2Seq.from_pretrained(
+            model_args.model_name_or_path, config=config, seed=training_args.seed, dtype=getattr(jnp, model_args.dtype)
+        )
+    elif model_args.encoder_model_name_or_path and model_args.decoder_model_name_or_path:
+        model_class = FlaxAutoModelForVision2Seq.from_config(config).__class__
+        model = model_class.from_encoder_decoder_pretrained(
+            model_args.encoder_model_name_or_path,
+            model_args.decoder_model_name_or_path,
+            encoder_config=config.encoder,
+            decoder_config=config.decoder,
+            encoder_seed=training_args.seed,
+            decoder_seed=training_args.seed,
+            encoder_dtype=getattr(jnp, model_args.dtype),
+            decoder_dtype=getattr(jnp, model_args.dtype),
+        )
+        # Set `encoder-decoder` (top-level) specific config
+        model.config.decoder_start_token_id = decoder_start_token_id
+        model.config.bos_token_id = bos_token_id
+        model.config.eos_token_id = eos_token_id
+        model.config.pad_token_id = pad_token_id
+    else:
+        model = FlaxAutoModelForVision2Seq.from_config(
+            config, seed=training_args.seed, dtype=getattr(jnp, model_args.dtype)
+        )
 
     # Preprocessing the datasets.
     # We need to tokenize inputs and targets.
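Note: the hunk above replaces the hard-coded ViT + GPT-2 wiring with the generic `FlaxAutoModelForVision2Seq` path backed by the `FlaxVisionEncoderDecoderModel` class from the commit title. A minimal sketch of the same composition, assuming the two public checkpoints below are what you want to pair:

```python
from transformers import FlaxVisionEncoderDecoderModel, ViTFeatureExtractor, GPT2Tokenizer

model = FlaxVisionEncoderDecoderModel.from_encoder_decoder_pretrained(
    "google/vit-base-patch16-224-in21k", "gpt2"
)
feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# GPT-2 defines no decoder_start/pad tokens; fall back to bos/eos as the script does
model.config.decoder_start_token_id = model.config.decoder.bos_token_id
model.config.pad_token_id = model.config.decoder.eos_token_id
tokenizer.pad_token = tokenizer.eos_token
```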
@@ -405,8 +563,26 @@ def main():
         logger.info("There is nothing to do. Please pass `do_train`, `do_eval` and/or `do_predict`.")
         return
 
-    image_file_column = 'image_file'
-    caption_column = 'fr'
+    # Get the column names for input/target.
+    dataset_columns = image_captioning_name_mapping.get(data_args.dataset_name, None)
+    if data_args.image_column is None:
+        assert dataset_columns is not None
+        image_column = dataset_columns[0]
+    else:
+        image_column = data_args.image_column
+        if image_column not in column_names:
+            raise ValueError(
+                f"--image_column' value '{data_args.image_column}' needs to be one of: {', '.join(column_names)}"
+            )
+    if data_args.caption_column is None:
+        assert dataset_columns is not None
+        caption_column = dataset_columns[1]
+    else:
+        caption_column = data_args.caption_column
+        if caption_column not in column_names:
+            raise ValueError(
+                f"--caption_column' value '{data_args.caption_column}' needs to be one of: {', '.join(column_names)}"
+            )
 
     # Temporarily set max_target_length for training.
     max_target_length = data_args.max_target_length
@@ -414,29 +590,25 @@ def main():
     # In Flax, for seq2seq models we need to pass `decoder_input_ids`
     # as the Flax models don't accept `labels`, we need to prepare the decoder_input_ids here
     # for that dynamically import the `shift_tokens_right` function from the model file
-    model_module = __import__(vit_gpt2_model.__module__, fromlist=["shift_tokens_right"])
-    shift_tokens_right_fn = getattr(model_module, "shift_tokens_right")
+    model_module = __import__(model.__module__, fromlist=["shift_tokens_right"])
+    shift_tokens_right_fn = getattr(model_module, "shift_tokens_right", shift_tokens_right)
 
     # Setting padding="max_length" as we need fixed length inputs for jitted functions
     def preprocess_function(examples):
 
-        _pixel_values = []
-        _captions = []
-        for y, z in zip(examples[image_file_column], examples[caption_column]):
-            with Image.open(y) as image:
+        pixel_values = []
+        captions = []
+        for image_file, caption in zip(examples[image_column], examples[caption_column]):
+            with Image.open(image_file) as image:
                 try:
                     encoder_inputs = feature_extractor(images=image, return_tensors="np")
                 except:
                     continue
-                x = encoder_inputs.pixel_values
-                _pixel_values.append(x)
-                _captions.append(z + ' ' + tokenizer.eos_token)
-        pixel_values = np.concatenate(_pixel_values)
-
-        targets = _captions
-
-        # Add eos_token!!
-        #targets = [x + ' ' + tokenizer.eos_token for x in targets]
+                pixel_values.append(encoder_inputs.pixel_values)
+                captions.append(caption + ' ' + tokenizer.eos_token)
+
+        pixel_values = np.concatenate(pixel_values)
+        targets = captions
 
         model_inputs = {}
         model_inputs['pixel_values'] = pixel_values
@@ -448,18 +620,13 @@ def main():
         )
 
         model_inputs["labels"] = labels["input_ids"]
-
-        #print(labels["input_ids"])
-        #print(gpt2_config.pad_token_id)
-        #rint(gpt2_config.bos_token_id)
-
         decoder_input_ids = shift_tokens_right_fn(
-            jnp.array(labels["input_ids"]), gpt2_config.pad_token_id, gpt2_config.bos_token_id
+            jnp.array(labels["input_ids"]), config.pad_token_id, config.decoder_start_token_id
        )
-        model_inputs["input_ids"] = np.asarray(decoder_input_ids)
+        model_inputs["decoder_input_ids"] = np.asarray(decoder_input_ids)
 
         # We need decoder_attention_mask so we can ignore pad tokens from loss
-        model_inputs["attention_mask"] = labels["attention_mask"]
+        model_inputs["decoder_attention_mask"] = labels["attention_mask"]
 
         return model_inputs
@@ -469,7 +636,6 @@ def main():
         train_dataset = dataset["train"]
         if data_args.max_train_samples is not None:
             train_dataset = train_dataset.select(range(data_args.max_train_samples))
-
         train_dataset = train_dataset.map(
             preprocess_function,
             batched=True,
@@ -604,7 +770,7 @@ def main():
     )
 
     # Setup train state
-    state = TrainState.create(apply_fn=vit_gpt2_model.__call__, params=vit_gpt2_model.params, tx=adamw, dropout_rng=dropout_rng)
+    state = TrainState.create(apply_fn=model.__call__, params=model.params, tx=adamw, dropout_rng=dropout_rng)
 
     # label smoothed cross entropy
     def loss_fn(logits, labels, padding_mask, label_smoothing_factor=0.0):
@@ -635,7 +801,7 @@ def main():
         def compute_loss(params):
             labels = batch.pop("labels")
             logits = state.apply_fn(**batch, params=params, dropout_rng=dropout_rng, train=True)[0]
-            loss = loss_fn(logits, labels, batch["attention_mask"], label_smoothing_factor)
+            loss = loss_fn(logits, labels, batch["decoder_attention_mask"], label_smoothing_factor)
             return loss
 
         grad_fn = jax.value_and_grad(compute_loss)
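Note: the mask rename matters because `loss_fn` uses it to exclude padded caption positions from the objective. A simplified version of that masking, without label smoothing (a sketch, not the script's exact `loss_fn`):

```python
import optax
from flax.training.common_utils import onehot

def masked_cross_entropy(logits, labels, padding_mask):
    # per-token cross entropy against one-hot targets
    loss = optax.softmax_cross_entropy(logits, onehot(labels, logits.shape[-1]))
    # zero out pad positions, then average over real tokens only
    return (loss * padding_mask).sum() / padding_mask.sum()
```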
@@ -653,7 +819,7 @@ def main():
     def eval_step(params, batch, label_smoothing_factor=0.0):
         labels = batch.pop("labels")
         logits = model(**batch, params=params, train=False)[0]
-        loss = loss_fn(logits, labels, batch["attention_mask"], label_smoothing_factor)
+        loss = loss_fn(logits, labels, batch["decoder_attention_mask"], label_smoothing_factor)
 
         # summarize metrics
         metrics = {"loss": loss}
@@ -669,15 +835,7 @@ def main():
 
     def generate_step(params, batch):
         model.params = params
-        # output_ids = model.generate(batch["pixel_values"], **gen_kwargs)
-
-        #encoder_outputs = model.encode(pixel_values=batch['pixel_values'])
-        #output_ids = model.generate(batch["input_ids"], attention_mask=batch["attention_mask"], encoder_outputs=encoder_outputs, **gen_kwargs)
-
-        # encoder_outputs = model.encode(pixel_values=batch['pixel_values'], params=params, train=False)
         output_ids = model.generate(batch['pixel_values'], **gen_kwargs)
-
-
         return output_ids.sequences
 
     # Create parallel version of the train and eval step
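Note: `generate_step` now only needs `pixel_values`. An illustrative eager-mode decoding call, assuming the `model`, `feature_extractor` and `tokenizer` from the sketch earlier in this diff and a PIL image `img`; the `gen_kwargs` values are placeholders:

```python
import numpy as np

pixel_values = feature_extractor(images=img, return_tensors="np").pixel_values
gen_kwargs = {"max_length": 64, "num_beams": 4}
output_ids = model.generate(pixel_values, **gen_kwargs).sequences
print(tokenizer.batch_decode(np.asarray(output_ids), skip_special_tokens=True))
```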
@@ -727,7 +885,6 @@ def main():
             with open(os.path.join(training_args.output_dir, f'report.txt'), 'a', encoding='UTF-8') as fp:
                 fp.write(desc + '\n')
 
-
         # ======================== Evaluating ==============================
         eval_metrics = []
         eval_preds = []
@@ -768,7 +925,6 @@ def main():
             with open(os.path.join(training_args.output_dir, f'report.txt'), 'a', encoding='UTF-8') as fp:
                 fp.write(desc + '\n')
 
-
         # Save metrics
         if has_tensorboard and jax.process_index() == 0:
             cur_step = epoch * (len(train_dataset) // train_batch_size)
@@ -816,17 +972,15 @@ def main():
             logger.info(desc)
             with open(os.path.join(training_args.output_dir, f'report.txt'), 'a', encoding='UTF-8') as fp:
                 fp.write(desc + '\n')
-
 
         # save checkpoint after each epoch and push checkpoint to the hub
         if jax.process_index() == 0:
             params = jax.device_get(jax.tree_map(lambda x: x[0], state.params))
-            model.save_pretrained(
-                os.path.join(training_args.output_dir, f'ckpt_{epoch+1}'),
-                params=params,
-                push_to_hub=training_args.push_to_hub,
-                commit_message=f"Saving weights and logs of epoch {epoch+1}",
-            )
+            model.save_pretrained(os.path.join(training_args.output_dir, f'ckpt_{epoch+1}'), params=params)
+            tokenizer.save_pretrained(training_args.output_dir)
+            if training_args.push_to_hub:
+                repo.push_to_hub(commit_message=f"Saving weights and logs of epoch {epoch}", blocking=False)
+
 
 if __name__ == "__main__":
     main()
run_summarization_flax.py CHANGED
@@ -32,6 +32,7 @@ import nltk  # Here to have a nice missing dependency error message early on
 import numpy as np
 from datasets import Dataset, load_dataset, load_metric
 from tqdm import tqdm
+from PIL import Image
 
 import jax
 import jax.numpy as jnp
@@ -45,13 +46,15 @@ from flax.training.common_utils import get_metrics, onehot, shard, shard_prng_key
 from huggingface_hub import Repository
 from transformers import (
     CONFIG_MAPPING,
-    FLAX_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
+    FLAX_MODEL_FOR_VISION_2_SEQ_MAPPING,
     AutoConfig,
+    AutoFeatureExtractor,
     AutoTokenizer,
     FlaxAutoModelForSeq2SeqLM,
     HfArgumentParser,
     TrainingArguments,
     is_tensorboard_available,
+    FlaxAutoModelForVision2Seq,
 )
 from transformers.file_utils import get_full_repo_name, is_offline_mode
 
@@ -69,10 +72,23 @@ except (LookupError, OSError):
     nltk.download("punkt", quiet=True)
 
 
-MODEL_CONFIG_CLASSES = list(FLAX_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING.keys())
+MODEL_CONFIG_CLASSES = list(FLAX_MODEL_FOR_VISION_2_SEQ_MAPPING.keys())
 MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
 
 
+# Copied from transformers.models.bart.modeling_flax_bart.shift_tokens_right
+def shift_tokens_right(input_ids: jnp.ndarray, pad_token_id: int, decoder_start_token_id: int) -> jnp.ndarray:
+    """
+    Shift input ids one token to the right.
+    """
+    shifted_input_ids = np.zeros_like(input_ids)
+    shifted_input_ids[:, 1:] = input_ids[:, :-1]
+    shifted_input_ids[:, 0] = decoder_start_token_id
+
+    shifted_input_ids = np.where(shifted_input_ids == -100, pad_token_id, shifted_input_ids)
+    return shifted_input_ids
+
+
 @dataclass
 class ModelArguments:
     """
@@ -86,15 +102,46 @@ class ModelArguments:
             "Don't set if you want to train a model from scratch."
         },
     )
+    encoder_model_name_or_path: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "The encoder model checkpoint for weights initialization. "
+            "Don't set if you want to train a model from scratch."
+        },
+    )
+    decoder_model_name_or_path: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "The decoder model checkpoint for weights initialization. "
+            "Don't set if you want to train a model from scratch."
+        },
+    )
     model_type: Optional[str] = field(
         default=None,
         metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)},
     )
+    encoder_model_type: Optional[str] = field(
+        default=None,
+        metadata={"help": "If training from scratch, pass an encoder model type from the list: " + ", ".join(MODEL_TYPES)},
+    )
+    decoder_model_type: Optional[str] = field(
+        default=None,
+        metadata={"help": "If training from scratch, pass a decoder model type from the list: " + ", ".join(MODEL_TYPES)},
+    )
     config_name: Optional[str] = field(
         default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
     )
+    encoder_config_name: Optional[str] = field(
+        default=None, metadata={"help": "Pretrained config name or path if not the same as encoder_model_name"}
+    )
+    decoder_config_name: Optional[str] = field(
+        default=None, metadata={"help": "Pretrained config name or path if not the same as decoder_model_name"}
+    )
+    feature_extractor_name: Optional[str] = field(
+        default=None, metadata={"help": "Pretrained feature extractor name or path if not the same as encoder_model_name"}
+    )
     tokenizer_name: Optional[str] = field(
-        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
+        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as decoder_model_name"}
     )
     cache_dir: Optional[str] = field(
         default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
@@ -123,13 +170,16 @@ class DataTrainingArguments:
     dataset_config_name: Optional[str] = field(
         default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
     )
-    text_column: Optional[str] = field(
+    data_dir: Optional[str] = field(
+        default=None, metadata={"help": "The data directory of the dataset to use (via the datasets library)."}
+    )
+    image_column: Optional[str] = field(
         default=None,
-        metadata={"help": "The name of the column in the datasets containing the full texts (for summarization)."},
+        metadata={"help": "The name of the column in the datasets containing the full image file paths (for image captioning)."},
     )
-    summary_column: Optional[str] = field(
+    caption_column: Optional[str] = field(
         default=None,
-        metadata={"help": "The name of the column in the datasets containing the summaries (for summarization)."},
+        metadata={"help": "The name of the column in the datasets containing the image captions (for image captioning)."},
     )
     train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."})
     validation_file: Optional[str] = field(
@@ -188,9 +238,6 @@ class DataTrainingArguments:
         default=None,
         metadata={"help": "The number of processes to use for the preprocessing."},
     )
-    source_prefix: Optional[str] = field(
-        default=None, metadata={"help": "A prefix to add before every source text (useful for T5 models)."}
-    )
     predict_with_generate: bool = field(
         default=False, metadata={"help": "Whether to use generate to calculate generative metrics (ROUGE, BLEU)."}
     )
@@ -219,18 +266,8 @@ class DataTrainingArguments:
             self.val_max_target_length = self.max_target_length
 
 
-summarization_name_mapping = {
-    "amazon_reviews_multi": ("review_body", "review_title"),
-    "big_patent": ("description", "abstract"),
-    "cnn_dailymail": ("article", "highlights"),
-    "orange_sum": ("text", "summary"),
-    "pn_summary": ("article", "summary"),
-    "psc": ("extract_text", "summary_text"),
-    "samsum": ("dialogue", "summary"),
-    "thaisum": ("body", "summary"),
-    "xglue": ("news_body", "news_title"),
-    "xsum": ("document", "summary"),
-    "wiki_summary": ("article", "highlights"),
+image_captioning_name_mapping = {
+    "image_caption_dataset.py": ("image_file", "caption"),
 }
 
 
@@ -354,7 +391,7 @@ def main():
     if data_args.dataset_name is not None:
         # Downloading and loading a dataset from the hub.
         dataset = load_dataset(
-            data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir, keep_in_memory=False
+            data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir, keep_in_memory=False, data_dir=data_args.data_dir
         )
     else:
         data_files = {}
@@ -367,48 +404,153 @@ def main():
         if data_args.test_file is not None:
             data_files["test"] = data_args.test_file
             extension = data_args.test_file.split(".")[-1]
-        dataset = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
+        # TODO: Check
+        dataset = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir, data_dir=data_args.data_dir)
     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
     # https://huggingface.co/docs/datasets/loading_datasets.html.
 
     # Load pretrained model and tokenizer
 
+    encoder_cache_dir, decoder_cache_dir = None, None
+    if model_args.cache_dir:
+        encoder_cache_dir = os.path.join(model_args.cache_dir, "encoder")
+        decoder_cache_dir = os.path.join(model_args.cache_dir, "decoder")
+
     if model_args.config_name:
         config = AutoConfig.from_pretrained(model_args.config_name, cache_dir=model_args.cache_dir)
     elif model_args.model_name_or_path:
         config = AutoConfig.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
+    elif getattr(CONFIG_MAPPING[model_args.model_type], "from_encoder_decoder_configs", None):
+        config_class = CONFIG_MAPPING[model_args.model_type]
+
+        if model_args.encoder_config_name:
+            encoder_config = AutoConfig.from_pretrained(model_args.encoder_config_name, cache_dir=encoder_cache_dir)
+        elif model_args.encoder_model_name_or_path:
+            encoder_config = AutoConfig.from_pretrained(model_args.encoder_model_name_or_path, cache_dir=encoder_cache_dir)
+        else:
+            encoder_config = CONFIG_MAPPING[model_args.encoder_model_type]()
+            logger.warning("You are instantiating a new config instance from scratch for the encoder.")
+
+        if model_args.decoder_config_name:
+            decoder_config = AutoConfig.from_pretrained(model_args.decoder_config_name, cache_dir=decoder_cache_dir)
+        elif model_args.decoder_model_name_or_path:
+            decoder_config = AutoConfig.from_pretrained(model_args.decoder_model_name_or_path, cache_dir=decoder_cache_dir)
+        else:
+            decoder_config = CONFIG_MAPPING[model_args.decoder_model_type]()
+            logger.warning("You are instantiating a new config instance from scratch for the decoder.")
+
+        logger.info("Setting `config.is_decoder=True` and `config.add_cross_attention=True` for decoder_config")
+        decoder_config.is_decoder = True
+        decoder_config.add_cross_attention = True
+
+        config = config_class.from_encoder_decoder_configs(encoder_config, decoder_config)
     else:
         config = CONFIG_MAPPING[model_args.model_type]()
         logger.warning("You are instantiating a new config instance from scratch.")
 
+    decoder_start_token_id = getattr(config, "decoder_start_token_id", None)
+    if not decoder_start_token_id and getattr(config, "decoder", None):
+        decoder_start_token_id = getattr(config.decoder, "decoder_start_token_id", None)
+    bos_token_id = getattr(config, "bos_token_id", None)
+    if not bos_token_id and getattr(config, "decoder", None):
+        bos_token_id = getattr(config.decoder, "bos_token_id", None)
+    eos_token_id = getattr(config, "eos_token_id", None)
+    if not eos_token_id and getattr(config, "decoder", None):
+        eos_token_id = getattr(config.decoder, "eos_token_id", None)
+    pad_token_id = getattr(config, "pad_token_id", None)
+    if not pad_token_id and getattr(config, "decoder", None):
+        pad_token_id = getattr(config.decoder, "pad_token_id", None)
+
+    if decoder_start_token_id is None:
+        decoder_start_token_id = bos_token_id
+    if pad_token_id is None:
+        pad_token_id = eos_token_id
+
+    config.decoder_start_token_id = decoder_start_token_id
+    config.bos_token_id = bos_token_id
+    config.eos_token_id = eos_token_id
+    config.pad_token_id = pad_token_id
+
+    if getattr(config, "decoder", None):
+        config.decoder.decoder_start_token_id = decoder_start_token_id
+        config.decoder.bos_token_id = bos_token_id
+        config.decoder.eos_token_id = eos_token_id
+        config.decoder.pad_token_id = pad_token_id
+
+    feature_extractor = None
+    if model_args.feature_extractor_name:
+        feature_extractor = AutoFeatureExtractor.from_pretrained(
+            model_args.feature_extractor_name, cache_dir=model_args.cache_dir,
+        )
+    elif model_args.model_name_or_path:
+        try:
+            feature_extractor = AutoFeatureExtractor.from_pretrained(
+                model_args.model_name_or_path, cache_dir=model_args.cache_dir
+            )
+        except ValueError as e:
+            logger.warning(e)
+    if not feature_extractor:
+        if model_args.encoder_model_name_or_path:
+            feature_extractor = AutoFeatureExtractor.from_pretrained(
+                model_args.encoder_model_name_or_path, cache_dir=model_args.cache_dir
+            )
+        else:
+            raise ValueError(
+                "You are instantiating a new feature extractor from scratch. This is not supported by this script. "
+                "You can do it from another script, save it, and load it from here, using --feature_extractor_name."
+            )
+
+    tokenizer = None
     if model_args.tokenizer_name:
         tokenizer = AutoTokenizer.from_pretrained(
             model_args.tokenizer_name, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer
         )
     elif model_args.model_name_or_path:
-        tokenizer = AutoTokenizer.from_pretrained(
-            model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer
-        )
-    else:
-        raise ValueError(
-            "You are instantiating a new tokenizer from scratch. This is not supported by this script."
-            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
-        )
+        try:
+            tokenizer = AutoTokenizer.from_pretrained(
+                model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer
+            )
+        except ValueError as e:
+            logger.warning(e)
+    if not tokenizer:
+        if model_args.decoder_model_name_or_path:
+            tokenizer = AutoTokenizer.from_pretrained(
+                model_args.decoder_model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer
+            )
+        else:
+            raise ValueError(
+                "You are instantiating a new tokenizer from scratch. This is not supported by this script. "
+                "You can do it from another script, save it, and load it from here, using --tokenizer_name."
+            )
+    tokenizer.pad_token = tokenizer.convert_ids_to_tokens(config.pad_token_id)
 
     if model_args.model_name_or_path:
-        model = FlaxAutoModelForSeq2SeqLM.from_pretrained(
+        model = FlaxAutoModelForVision2Seq.from_pretrained(
             model_args.model_name_or_path, config=config, seed=training_args.seed, dtype=getattr(jnp, model_args.dtype)
         )
+    elif model_args.encoder_model_name_or_path and model_args.decoder_model_name_or_path:
+        model_class = FlaxAutoModelForVision2Seq.from_config(config).__class__
+        model = model_class.from_encoder_decoder_pretrained(
+            model_args.encoder_model_name_or_path,
+            model_args.decoder_model_name_or_path,
+            encoder_config=config.encoder,
+            decoder_config=config.decoder,
+            encoder_seed=training_args.seed,
+            decoder_seed=training_args.seed,
+            encoder_dtype=getattr(jnp, model_args.dtype),
+            decoder_dtype=getattr(jnp, model_args.dtype),
+        )
+        # Set `encoder-decoder` (top-level) specific config
+        model.config.decoder_start_token_id = decoder_start_token_id
+        model.config.bos_token_id = bos_token_id
+        model.config.eos_token_id = eos_token_id
+        model.config.pad_token_id = pad_token_id
     else:
-        model = FlaxAutoModelForSeq2SeqLM.from_config(
+        model = FlaxAutoModelForVision2Seq.from_config(
             config, seed=training_args.seed, dtype=getattr(jnp, model_args.dtype)
         )
 
-    if model.config.decoder_start_token_id is None:
-        raise ValueError("Make sure that `config.decoder_start_token_id` is correctly defined")
-
-    prefix = data_args.source_prefix if data_args.source_prefix is not None else ""
-
     # Preprocessing the datasets.
     # We need to tokenize inputs and targets.
     if training_args.do_train:
@@ -422,22 +564,24 @@ def main():
         return
 
     # Get the column names for input/target.
-    dataset_columns = summarization_name_mapping.get(data_args.dataset_name, None)
-    if data_args.text_column is None:
-        text_column = dataset_columns[0] if dataset_columns is not None else column_names[0]
+    dataset_columns = image_captioning_name_mapping.get(data_args.dataset_name, None)
+    if data_args.image_column is None:
+        assert dataset_columns is not None
+        image_column = dataset_columns[0]
     else:
-        text_column = data_args.text_column
-        if text_column not in column_names:
+        image_column = data_args.image_column
+        if image_column not in column_names:
             raise ValueError(
-                f"--text_column' value '{data_args.text_column}' needs to be one of: {', '.join(column_names)}"
+                f"--image_column' value '{data_args.image_column}' needs to be one of: {', '.join(column_names)}"
             )
-    if data_args.summary_column is None:
-        summary_column = dataset_columns[1] if dataset_columns is not None else column_names[1]
+    if data_args.caption_column is None:
+        assert dataset_columns is not None
+        caption_column = dataset_columns[1]
     else:
-        summary_column = data_args.summary_column
-        if summary_column not in column_names:
+        caption_column = data_args.caption_column
+        if caption_column not in column_names:
             raise ValueError(
-                f"--summary_column' value '{data_args.summary_column}' needs to be one of: {', '.join(column_names)}"
+                f"--caption_column' value '{data_args.caption_column}' needs to be one of: {', '.join(column_names)}"
             )
 
     # Temporarily set max_target_length for training.
@@ -446,17 +590,28 @@ def main():
     # In Flax, for seq2seq models we need to pass `decoder_input_ids`
     # as the Flax models don't accept `labels`, we need to prepare the decoder_input_ids here
     # for that dynamically import the `shift_tokens_right` function from the model file
-    model_module = __import__(model.__module__, fromlist=["shift_tokens_tight"])
-    shift_tokens_right_fn = getattr(model_module, "shift_tokens_right")
+    model_module = __import__(model.__module__, fromlist=["shift_tokens_right"])
+    shift_tokens_right_fn = getattr(model_module, "shift_tokens_right", shift_tokens_right)
 
     # Setting padding="max_length" as we need fixed length inputs for jitted functions
     def preprocess_function(examples):
-        inputs = examples[text_column]
-        targets = examples[summary_column]
-        inputs = [prefix + inp for inp in inputs]
-        model_inputs = tokenizer(
-            inputs, max_length=data_args.max_source_length, padding="max_length", truncation=True, return_tensors="np"
-        )
+
+        pixel_values = []
+        captions = []
+        for image_file, caption in zip(examples[image_column], examples[caption_column]):
+            with Image.open(image_file) as image:
+                try:
+                    encoder_inputs = feature_extractor(images=image, return_tensors="np")
+                except:
+                    continue
+                pixel_values.append(encoder_inputs.pixel_values)
+                captions.append(caption + ' ' + tokenizer.eos_token)
+
+        pixel_values = np.concatenate(pixel_values)
+        targets = captions
+
+        model_inputs = {}
+        model_inputs['pixel_values'] = pixel_values
 
         # Setup the tokenizer for targets
         with tokenizer.as_target_tokenizer():
@@ -680,7 +835,7 @@ def main():
 
     def generate_step(params, batch):
         model.params = params
-        output_ids = model.generate(batch["input_ids"], attention_mask=batch["attention_mask"], **gen_kwargs)
+        output_ids = model.generate(batch['pixel_values'], **gen_kwargs)
         return output_ids.sequences
 
     # Create parallel version of the train and eval step
@@ -723,9 +878,12 @@ def main():
 
         train_metric = unreplicate(train_metric)
 
-        epochs.write(
-            f"Epoch... ({epoch + 1}/{num_epochs} | Loss: {train_metric['loss']}, Learning Rate: {train_metric['learning_rate']})"
-        )
+        desc = f"Epoch... ({epoch + 1}/{num_epochs} | Loss: {train_metric['loss']}, Learning Rate: {train_metric['learning_rate']})"
+        epochs.write(desc)
+        epochs.desc = desc
+        logger.info(desc)
+        with open(os.path.join(training_args.output_dir, f'report.txt'), 'a', encoding='UTF-8') as fp:
+            fp.write(desc + '\n')
 
         # ======================== Evaluating ==============================
         eval_metrics = []
@@ -763,55 +921,62 @@ def main():
         desc = f"Epoch... ({epoch + 1}/{num_epochs} | Eval Loss: {eval_metrics['loss']} | {rouge_desc})"
         epochs.write(desc)
         epochs.desc = desc
+        logger.info(desc)
+        with open(os.path.join(training_args.output_dir, f'report.txt'), 'a', encoding='UTF-8') as fp:
+            fp.write(desc + '\n')
 
         # Save metrics
         if has_tensorboard and jax.process_index() == 0:
             cur_step = epoch * (len(train_dataset) // train_batch_size)
             write_metric(summary_writer, train_metrics, eval_metrics, train_time, cur_step)
 
-    # ======================== Prediction loop ==============================
-    if training_args.do_predict:
-        logger.info("*** Predict ***")
-
-        pred_metrics = []
-        pred_generations = []
-        pred_labels = []
-
-        pred_loader = data_loader(input_rng, predict_dataset, eval_batch_size)
-        pred_steps = len(predict_dataset) // eval_batch_size
-        for _ in tqdm(range(pred_steps), desc="Predicting...", position=2, leave=False):
-            # Model forward
-            batch = next(pred_loader)
-            labels = batch["labels"]
-
-            metrics = p_eval_step(state.params, batch)
-            pred_metrics.append(metrics)
-
-            # generation
-            if data_args.predict_with_generate:
-                generated_ids = p_generate_step(state.params, batch)
-                pred_generations.extend(jax.device_get(generated_ids.reshape(-1, gen_kwargs["max_length"])))
-                pred_labels.extend(jax.device_get(labels.reshape(-1, labels.shape[-1])))
-
-        # normalize prediction metrics
-        pred_metrics = get_metrics(pred_metrics)
-        pred_metrics = jax.tree_map(jnp.mean, pred_metrics)
-
-        # compute ROUGE metrics
-        rouge_desc = ""
-        if data_args.predict_with_generate:
-            rouge_metrics = compute_metrics(pred_generations, pred_labels)
-            pred_metrics.update(rouge_metrics)
-            rouge_desc = " ".join([f"Predict {key}: {value} |" for key, value in rouge_metrics.items()])
-
-        # Print metrics
-        desc = f"Predict Loss: {pred_metrics['loss']} | {rouge_desc})"
-        logger.info(desc)
+        # ======================== Prediction loop ==============================
+        if training_args.do_predict:
+            logger.info("*** Predict ***")
+
+            pred_metrics = []
+            pred_generations = []
+            pred_labels = []
+
+            pred_loader = data_loader(input_rng, predict_dataset, eval_batch_size)
+            pred_steps = len(predict_dataset) // eval_batch_size
+            for _ in tqdm(range(pred_steps), desc="Predicting...", position=2, leave=False):
+                # Model forward
+                batch = next(pred_loader)
+                labels = batch["labels"]
+
+                metrics = p_eval_step(state.params, batch)
+                pred_metrics.append(metrics)
+
+                # generation
+                if data_args.predict_with_generate:
+                    generated_ids = p_generate_step(state.params, batch)
+                    pred_generations.extend(jax.device_get(generated_ids.reshape(-1, gen_kwargs["max_length"])))
+                    pred_labels.extend(jax.device_get(labels.reshape(-1, labels.shape[-1])))
+
+            # normalize prediction metrics
+            pred_metrics = get_metrics(pred_metrics)
+            pred_metrics = jax.tree_map(jnp.mean, pred_metrics)
+
+            # compute ROUGE metrics
+            rouge_desc = ""
+            if data_args.predict_with_generate:
+                rouge_metrics = compute_metrics(pred_generations, pred_labels)
+                pred_metrics.update(rouge_metrics)
+                rouge_desc = " ".join([f"Predict {key}: {value} |" for key, value in rouge_metrics.items()])
+
+            # Print metrics
+            desc = f"Predict Loss: {pred_metrics['loss']} | {rouge_desc})"
+            epochs.write(desc)
+            epochs.desc = desc
+            logger.info(desc)
+            with open(os.path.join(training_args.output_dir, f'report.txt'), 'a', encoding='UTF-8') as fp:
+                fp.write(desc + '\n')
 
         # save checkpoint after each epoch and push checkpoint to the hub
         if jax.process_index() == 0:
             params = jax.device_get(jax.tree_map(lambda x: x[0], state.params))
-            model.save_pretrained(training_args.output_dir, params=params)
+            model.save_pretrained(os.path.join(training_args.output_dir, f'ckpt_{epoch+1}'), params=params)
             tokenizer.save_pretrained(training_args.output_dir)
             if training_args.push_to_hub:
                 repo.push_to_hub(commit_message=f"Saving weights and logs of epoch {epoch}", blocking=False)
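Note: both scripts now write one checkpoint directory per epoch (`ckpt_{epoch+1}`) and push via the cloned `Repository` instead of passing `push_to_hub` to `save_pretrained`. Reloading one of those checkpoints later is a one-liner; the path below is illustrative:

```python
from transformers import FlaxVisionEncoderDecoderModel

model = FlaxVisionEncoderDecoderModel.from_pretrained("output_dir/ckpt_3")
```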