Commit 2816f98 (merge), committed by boris
Parents: 6aa30f5, c55ecf8

Merge pull request #107 from borisdayma/feat-seq2seq

dalle_mini/model.py CHANGED
@@ -1,4 +1,3 @@
-
 import jax
 import flax.linen as nn
 
@@ -7,60 +6,56 @@ from transformers.models.bart.modeling_flax_bart import (
     FlaxBartForConditionalGenerationModule,
     FlaxBartForConditionalGeneration,
     FlaxBartEncoder,
-    FlaxBartDecoder
+    FlaxBartDecoder,
 )
 
 from transformers import BartConfig
 
 
-# Model hyperparameters, for convenience
-OUTPUT_VOCAB_SIZE = 16384 + 1  # encoded image token space + 1 for bos
-OUTPUT_LENGTH = 256 + 1  # number of encoded tokens + 1 for bos
-BOS_TOKEN_ID = 16384
-BASE_MODEL = 'facebook/bart-large-cnn'  # we currently have issues with bart-large
-
-
 class CustomFlaxBartModule(FlaxBartModule):
     def setup(self):
-        # check config is valid, otherwise set default values
-        self.config.vocab_size_output = getattr(self.config, 'vocab_size_output', OUTPUT_VOCAB_SIZE)
-        self.config.max_position_embeddings_decoder = getattr(self.config, 'max_position_embeddings_decoder', OUTPUT_LENGTH)
-
         # we keep shared to easily load pre-trained weights
         self.shared = nn.Embed(
             self.config.vocab_size,
             self.config.d_model,
-            embedding_init=jax.nn.initializers.normal(self.config.init_std, self.dtype),
-            dtype=self.dtype,
+            embedding_init=jax.nn.initializers.normal(self.config.init_std),
         )
         # a separate embedding is used for the decoder
         self.decoder_embed = nn.Embed(
-            self.config.vocab_size_output,
+            self.config.image_vocab_size + 1,
             self.config.d_model,
-            embedding_init=jax.nn.initializers.normal(self.config.init_std, self.dtype),
-            dtype=self.dtype,
+            embedding_init=jax.nn.initializers.normal(self.config.init_std),
+        )
+        self.encoder = FlaxBartEncoder(
+            self.config, dtype=self.dtype, embed_tokens=self.shared
         )
-        self.encoder = FlaxBartEncoder(self.config, dtype=self.dtype, embed_tokens=self.shared)
 
         # the decoder has a different config
+        # TODO: should not be needed once we have custom config/module
         decoder_config = BartConfig(self.config.to_dict())
-        decoder_config.max_position_embeddings = self.config.max_position_embeddings_decoder
-        decoder_config.vocab_size = self.config.vocab_size_output
-        self.decoder = FlaxBartDecoder(decoder_config, dtype=self.dtype, embed_tokens=self.decoder_embed)
+        decoder_config.max_position_embeddings = (
+            self.config.image_length + 1  # image tokens + BOS
+        )
+        decoder_config.vocab_size = self.config.image_vocab_size + 1
+        self.decoder = FlaxBartDecoder(
+            decoder_config, dtype=self.dtype, embed_tokens=self.decoder_embed
+        )
 
-class CustomFlaxBartForConditionalGenerationModule(FlaxBartForConditionalGenerationModule):
-    def setup(self):
-        # check config is valid, otherwise set default values
-        self.config.vocab_size_output = getattr(self.config, 'vocab_size_output', OUTPUT_VOCAB_SIZE)
 
+class CustomFlaxBartForConditionalGenerationModule(
+    FlaxBartForConditionalGenerationModule
+):
+    def setup(self):
         self.model = CustomFlaxBartModule(config=self.config, dtype=self.dtype)
         self.lm_head = nn.Dense(
-            self.config.vocab_size_output,
+            self.config.image_vocab_size + 1,  # encoded image token space + 1 for bos
             use_bias=False,
-            dtype=self.dtype,
-            kernel_init=jax.nn.initializers.normal(self.config.init_std, self.dtype),
+            kernel_init=jax.nn.initializers.normal(self.config.init_std),
         )
-        self.final_logits_bias = self.param("final_logits_bias", self.bias_init, (1, self.config.vocab_size_output))
+        self.final_logits_bias = self.param(
+            "final_logits_bias", self.bias_init, (1, self.config.image_vocab_size + 1)
+        )
+
 
 class CustomFlaxBartForConditionalGeneration(FlaxBartForConditionalGeneration):
     module_class = CustomFlaxBartForConditionalGenerationModule
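
For context: after this change the decoder vocabulary and decoder position embeddings are derived from two new config attributes, image_vocab_size and image_length, instead of module-level constants. A minimal usage sketch, with illustrative values mirroring the constants removed above (16384 image tokens plus BOS, 256 tokens per image); this is a sketch, not code from the commit:

# Sketch: instantiating the custom BART model with the new config attributes.
import jax.numpy as jnp
from transformers import BartConfig
from dalle_mini.model import CustomFlaxBartForConditionalGeneration

config = BartConfig.from_pretrained("facebook/bart-large-cnn")
config.image_vocab_size = 16384  # size of the encoded image token space
config.image_length = 256        # image tokens per sample
config.tie_word_embeddings = False
config.decoder_start_token_id = config.image_vocab_size  # BOS appended to image vocab

# randomly initialized; weights come from training (seed/dtype are illustrative)
model = CustomFlaxBartForConditionalGeneration(config, seed=42, dtype=jnp.float32)
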
dev/inference/samples.txt CHANGED
@@ -24,7 +24,6 @@ underwater cathedral
 a photo of a fantasy version of New York City
 a picture of fantasy kingdoms
 a volcano erupting next to San Francisco golden gate bridge
-big wave destroying a city
 Paris in a far future, futuristic Paris
 real painting of an alien from Monet
 the communist statue of liberty
@@ -54,16 +53,16 @@ a long line of green blocks on a beach at subset
 a long line of peaches on a beach at sunset
 a picture of a castle from minecraft
 a cute pikachu teapot
-an illustration of pikachu sitting on a bench
-mario is jumping over a zebra during the sunset
+an illustration of pikachu sitting on a bench eating an ice cream
+mario is jumping over a zebra
 famous anime hero
 star wars concept art
 a cartoon of a superhero bear
 an illustration of a cute skeleton wearing a blue hoodie
 illustration of a baby shark swimming around corals
+an illustration of an avocado in a beanie riding a motorcycle
 Cartoon of a carrot with big eyes
 logo of a robot wearing glasses and reading a book
-a cactus lifting weights
 illustration of a cactus lifting weigths
 logo of a cactus lifting weights
 a photo of a camera from the future
@@ -72,7 +71,6 @@ a collection of glasses is sitting on a table
 a painting of a capybara sitting on a mountain during fall in surrealist style
 a pentagonal green clock
 a pixel art illustration of an eagle sitting in a field in the afternoon
-a professional high-quality emoji of a lovestruck cup of boba
 a small red block sitting on a large green block
 a storefront that has the word 'openai' written on it
 a tatoo of a black broccoli
@@ -88,10 +86,7 @@ urinals are lined up in a jungle
 a muscular banana sitting upright on a bench smoking watching a banana on television, high definition photography
 a human face
 a person is holding a phone and a waterbottle, running a marathon
-a photograph of Ellen G. White
 Young woman riding her bike through the forest
-a portrait of a nightmare creature watching at you
-a white room full of a black substance
 the best soccer team of the world
 the best basketball team of the world
 the best football team of the world
@@ -100,6 +95,7 @@ sad, sadness
 the representation of infinity
 the end of the world
 the last sunrise on earth
+a portrait of a nightmare creature watching at you
 an avocado armchair
 an armchair in the shape of an avocado
 illustration of an avocado armchair
@@ -109,4 +105,3 @@ an avocado armchair flying into space
 a cute avocado armchair singing karaoke on stage in front of a crowd of strawberry shaped lamps
 an illustration of an avocado in a christmas sweater staring at its reflection in a mirror
 illustration of an avocado armchair getting married to a pineapple
-an illustration of an avocado in a beanie riding a motorcycle
dev/seq2seq/run_seq2seq_flax.py CHANGED
@@ -17,11 +17,11 @@
17
  Fine-tuning the library models for seq2seq, text to image.
18
  Script adapted from run_summarization_flax.py
19
  """
20
- # You can also adapt this script on your own sequence to sequence task. Pointers for this are left as comments.
21
 
22
  import os
23
- import logging as pylogging # To avoid collision with transformers.utils.logging
24
  import sys
 
25
  from dataclasses import dataclass, field
26
  from pathlib import Path
27
  from typing import Callable, Optional
@@ -38,31 +38,21 @@ import optax
38
  import transformers
39
  from flax import jax_utils, traverse_util
40
  from flax.serialization import from_bytes, to_bytes
41
- import flax.linen as nn
42
  from flax.jax_utils import unreplicate
43
  from flax.training import train_state
44
  from flax.training.common_utils import get_metrics, onehot, shard, shard_prng_key
45
  from transformers import (
46
  AutoTokenizer,
47
- FlaxBartForConditionalGeneration,
48
  HfArgumentParser,
49
- TrainingArguments,
50
  )
51
- from transformers.models.bart.modeling_flax_bart import *
52
 
53
  import wandb
54
 
55
  from dalle_mini.text import TextNormalizer
 
56
 
57
- logger = pylogging.getLogger(__name__)
58
-
59
-
60
- # Model hyperparameters, for convenience
61
- # TODO: the model has now it's own definition file and should be imported
62
- OUTPUT_VOCAB_SIZE = 16384 + 1 # encoded image token space + 1 for bos
63
- OUTPUT_LENGTH = 256 + 1 # number of encoded tokens + 1 for bos
64
- BOS_TOKEN_ID = 16384
65
- BASE_MODEL = "facebook/bart-large-cnn" # we currently have issues with bart-large
66
 
67
 
68
  @dataclass
@@ -72,36 +62,36 @@ class ModelArguments:
72
  """
73
 
74
  model_name_or_path: Optional[str] = field(
75
- default=BASE_MODEL,
76
  metadata={
77
  "help": "The model checkpoint for weights initialization."
78
  "Don't set if you want to train a model from scratch."
79
  },
80
  )
81
- config_name: Optional[str] = field(
82
  default=None,
83
- metadata={
84
- "help": "Pretrained config name or path if not the same as model_name"
85
- },
86
  )
87
- use_fast_tokenizer: bool = field(
88
- default=True,
 
 
 
 
89
  metadata={
90
- "help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."
91
  },
92
  )
 
 
 
 
93
  dtype: Optional[str] = field(
94
  default="float32",
95
  metadata={
96
  "help": "Floating-point format in which the model weights should be initialized and trained. Choose one of `[float32, float16, bfloat16]`."
97
  },
98
  )
99
- from_checkpoint: Optional[str] = field(
100
- default=None,
101
- metadata={
102
- "help": "Loads a pretrained wandb checkpoint. Use artifact reference."
103
- },
104
- )
105
 
106
 
107
  @dataclass
@@ -139,13 +129,11 @@ class DataTrainingArguments:
139
  default=False,
140
  metadata={"help": "Whether to stream the dataset."},
141
  )
142
- len_train: Optional[int] = field(
143
- default=None,
144
- metadata={"help": "Length of training dataset, required for streaming"},
145
- )
146
- len_eval: Optional[int] = field(
147
- default=None,
148
- metadata={"help": "Length of validation dataset, required for streaming"},
149
  )
150
  max_source_length: Optional[int] = field(
151
  default=128,
@@ -154,26 +142,6 @@ class DataTrainingArguments:
154
  "than this will be truncated, sequences shorter will be padded."
155
  },
156
  )
157
- no_decay: bool = field(
158
- default=False,
159
- metadata={"help": "Whether to use decay in the learning rate scheduler."},
160
- )
161
- max_target_length: Optional[int] = field(
162
- default=OUTPUT_LENGTH,
163
- metadata={
164
- "help": "The maximum total sequence length for target text after tokenization. Sequences longer "
165
- "than this will be truncated, sequences shorter will be padded."
166
- },
167
- )
168
- val_max_target_length: Optional[int] = field(
169
- default=OUTPUT_LENGTH,
170
- metadata={
171
- "help": "The maximum total sequence length for validation target text after tokenization. Sequences longer "
172
- "than this will be truncated, sequences shorter will be padded. Will default to `max_target_length`."
173
- "This argument is also used to override the `max_length` param of `model.generate`, which is used "
174
- "during evaluation."
175
- },
176
- )
177
  max_train_samples: Optional[int] = field(
178
  default=None,
179
  metadata={
@@ -188,71 +156,144 @@ class DataTrainingArguments:
188
  "value if set."
189
  },
190
  )
191
- normalize_text: bool = field(
192
- default=False,
193
- metadata={"help": "Normalize/Simplify text"},
194
- )
195
  preprocessing_num_workers: Optional[int] = field(
196
- default=80, # ensure we have the same datasets cached data and avoid using too much space
197
- metadata={"help": "The number of processes to use for the preprocessing."},
198
- )
199
- source_prefix: Optional[str] = field(
200
  default=None,
201
  metadata={
202
- "help": "A prefix to add before every source text (useful for T5 models)."
203
  },
204
  )
205
  overwrite_cache: bool = field(
206
  default=False,
207
- metadata={"help": "Overwrite the cached training and evaluation sets"},
208
  )
209
- log_interval: Optional[int] = field(
210
- default=40,
211
- metadata={"help": "Log frequency for metrics"},
212
  )
213
  log_model: bool = field(
214
  default=False,
215
- metadata={"help": "Overwrite the cached training and evaluation sets"},
216
  )
217
- save_model_steps: Optional[int] = field(
218
- default=5000, # about once every 1.5h in our experiments
 
 
 
 
 
 
 
 
219
  metadata={
220
- "help": "For logging the model more frequently. Used only when `log_model` is set."
221
  },
222
  )
223
 
224
- def __post_init__(self):
225
- if self.dataset_repo_or_path is None:
226
- raise ValueError("Need a dataset repository or path.")
227
- if self.train_file is None or self.validation_file is None:
228
- raise ValueError("Need training/validation file.")
229
- else:
230
- if self.train_file is not None:
231
- extension = self.train_file.split(".")[-1]
232
- assert extension in [
233
- "tsv",
234
- "csv",
235
- "json",
236
- "jsonl",
237
- ], "`train_file` should be a tsv, csv or json file."
238
- if self.validation_file is not None:
239
- extension = self.validation_file.split(".")[-1]
240
- assert extension in [
241
- "tsv",
242
- "csv",
243
- "json",
244
- "jsonl",
245
- ], "`validation_file` should be a tsv, csv or json file."
246
- if self.val_max_target_length is None:
247
- self.val_max_target_length = self.max_target_length
248
- if self.streaming and (self.len_train is None or self.len_eval is None):
249
- raise ValueError(
250
- "Streaming requires providing length of training and validation datasets"
251
- )
252
 
253
 
254
  class TrainState(train_state.TrainState):
255
  dropout_rng: jnp.ndarray = None
 
 
 
256
 
257
  def replicate(self):
258
  return jax_utils.replicate(self).replace(
@@ -264,81 +305,23 @@ class TrainState(train_state.TrainState):
264
  with (Path(artifact_dir) / "opt_state.msgpack").open("rb") as f:
265
  new_opt_state = from_bytes(self.opt_state, f.read())
266
 
267
- # restore steps
268
  with (Path(artifact_dir) / "training_state.json").open("r") as f:
269
  training_state = json.load(f)
270
- new_step = training_state["step"]
271
 
272
  # replace state
273
- return self.replace(step=new_step, opt_state=new_opt_state)
274
-
275
-
276
- class CustomFlaxBartModule(FlaxBartModule):
277
- def setup(self):
278
- # check config is valid, otherwise set default values
279
- self.config.vocab_size_output = getattr(
280
- self.config, "vocab_size_output", OUTPUT_VOCAB_SIZE
281
- )
282
- self.config.max_position_embeddings_decoder = getattr(
283
- self.config, "max_position_embeddings_decoder", OUTPUT_LENGTH
284
  )
285
 
286
- # we keep shared to easily load pre-trained weights
287
- self.shared = nn.Embed(
288
- self.config.vocab_size,
289
- self.config.d_model,
290
- embedding_init=jax.nn.initializers.normal(self.config.init_std, self.dtype),
291
- dtype=self.dtype,
292
- )
293
- # a separate embedding is used for the decoder
294
- self.decoder_embed = nn.Embed(
295
- self.config.vocab_size_output,
296
- self.config.d_model,
297
- embedding_init=jax.nn.initializers.normal(self.config.init_std, self.dtype),
298
- dtype=self.dtype,
299
- )
300
- self.encoder = FlaxBartEncoder(
301
- self.config, dtype=self.dtype, embed_tokens=self.shared
302
- )
303
-
304
- # the decoder has a different config
305
- decoder_config = BartConfig(self.config.to_dict())
306
- decoder_config.max_position_embeddings = (
307
- self.config.max_position_embeddings_decoder
308
- )
309
- decoder_config.vocab_size = self.config.vocab_size_output
310
- self.decoder = FlaxBartDecoder(
311
- decoder_config, dtype=self.dtype, embed_tokens=self.decoder_embed
312
- )
313
-
314
-
315
- class CustomFlaxBartForConditionalGenerationModule(
316
- FlaxBartForConditionalGenerationModule
317
- ):
318
- def setup(self):
319
- # check config is valid, otherwise set default values
320
- self.config.vocab_size_output = getattr(
321
- self.config, "vocab_size_output", OUTPUT_VOCAB_SIZE
322
- )
323
-
324
- self.model = CustomFlaxBartModule(config=self.config, dtype=self.dtype)
325
- self.lm_head = nn.Dense(
326
- self.config.vocab_size_output,
327
- use_bias=False,
328
- dtype=self.dtype,
329
- kernel_init=jax.nn.initializers.normal(self.config.init_std, self.dtype),
330
- )
331
- self.final_logits_bias = self.param(
332
- "final_logits_bias", self.bias_init, (1, self.config.vocab_size_output)
333
- )
334
-
335
-
336
- class CustomFlaxBartForConditionalGeneration(FlaxBartForConditionalGeneration):
337
- module_class = CustomFlaxBartForConditionalGenerationModule
338
-
339
 
340
  def data_loader(
341
- rng: jax.random.PRNGKey, dataset: Dataset, batch_size: int, shuffle: bool = False
 
 
342
  ):
343
  """
344
  Returns batches of size `batch_size` from truncated `dataset`, sharded over all local devices.
@@ -346,7 +329,7 @@ def data_loader(
346
  """
347
  steps_per_epoch = len(dataset) // batch_size
348
 
349
- if shuffle:
350
  batch_idx = jax.random.permutation(rng, len(dataset))
351
  else:
352
  batch_idx = jnp.arange(len(dataset))
@@ -375,20 +358,20 @@ def data_loader_streaming(dataset: Dataset, batch_size: int):
375
 
376
 
377
  def create_learning_rate_fn(
378
- train_ds_size: int,
379
- train_batch_size: int,
380
- num_train_epochs: int,
381
  num_warmup_steps: int,
382
  learning_rate: float,
383
- no_decay: bool,
 
384
  ) -> Callable[[int], jnp.array]:
385
  """Returns a linear warmup, linear_decay learning rate function."""
386
- steps_per_epoch = train_ds_size // train_batch_size
387
- num_train_steps = steps_per_epoch * num_train_epochs
 
 
388
  warmup_fn = optax.linear_schedule(
389
  init_value=0.0, end_value=learning_rate, transition_steps=num_warmup_steps
390
  )
391
- if no_decay:
392
  return warmup_fn
393
  decay_fn = optax.linear_schedule(
394
  init_value=learning_rate,
@@ -412,10 +395,7 @@ def wandb_log(metrics, step=None, prefix=None):
412
 
413
 
414
  def main():
415
- # See all possible arguments in src/transformers/training_args.py
416
- # or by passing the --help flag to this script.
417
- # We now keep distinct sets of args, for a cleaner separation of concerns.
418
-
419
  parser = HfArgumentParser(
420
  (ModelArguments, DataTrainingArguments, TrainingArguments)
421
  )
@@ -440,13 +420,13 @@ def main():
440
  )
441
 
442
  # Make one log on every process with the configuration for debugging.
443
- pylogging.basicConfig(
444
  format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
445
  datefmt="%m/%d/%Y %H:%M:%S",
446
- level=pylogging.INFO,
447
  )
448
  # Setup logging, we only want one process per machine to log things on the screen.
449
- logger.setLevel(pylogging.INFO if jax.process_index() == 0 else pylogging.ERROR)
450
  if jax.process_index() == 0:
451
  datasets.utils.logging.set_verbosity_warning()
452
  transformers.utils.logging.set_verbosity_info()
@@ -457,18 +437,19 @@ def main():
457
  # Set the verbosity to info of the Transformers logger (on main process only):
458
  logger.info(f"Training/evaluation parameters {training_args}")
459
 
460
- # Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below)
461
- # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
462
- # (the dataset will be downloaded automatically from the datasets Hub).
463
- #
464
- data_files = {
465
- "train": data_args.train_file,
466
- "validation": data_args.validation_file,
467
- }
468
  dataset = load_dataset(
469
  data_args.dataset_repo_or_path,
470
  data_files=data_files,
471
  streaming=data_args.streaming,
 
472
  )
473
 
474
  # Set up wandb run
@@ -477,56 +458,66 @@ def main():
477
  project="dalle-mini",
478
  job_type="Seq2Seq",
479
  config=parser.parse_args(),
480
- save_code=True,
481
  )
482
 
483
- if model_args.from_checkpoint is not None:
484
- artifact = wandb.run.use_artifact(model_args.from_checkpoint)
485
  artifact_dir = artifact.download()
 
 
486
  model = CustomFlaxBartForConditionalGeneration.from_pretrained(artifact_dir)
487
 
488
  # load tokenizer
489
  tokenizer = AutoTokenizer.from_pretrained(
490
  artifact_dir,
491
- use_fast=model_args.use_fast_tokenizer,
492
  )
493
 
494
  else:
495
  # Set up our new model config
 
496
  config = BartConfig.from_pretrained(model_args.model_name_or_path)
497
- config.tie_word_embeddings = False
498
- config.decoder_start_token_id = BOS_TOKEN_ID # for first token
499
- config.bos_token_id = (
500
- BOS_TOKEN_ID # should not be used (due to forced_bos_token_id)
501
- )
502
- config.pos_token_id = (
503
- BOS_TOKEN_ID # should not be needed (as we generate until max_length)
504
- )
505
- config.eos_token_id = BOS_TOKEN_ID + 1 # unreachable
506
  config.forced_bos_token_id = None # we don't need this token
507
  config.forced_eos_token_id = None # we don't need this token
508
- config.force_bos_token_to_be_generated = (
509
- False # otherwise it sets bos_token_id at loading
510
- )
511
- config.min_length = data_args.max_target_length
512
- config.max_length = data_args.max_target_length
 
 
 
 
 
 
 
 
513
 
514
  # Create a custom model and initialize it randomly
515
  model = CustomFlaxBartForConditionalGeneration(
516
- config, seed=training_args.seed, dtype=getattr(jnp, model_args.dtype)
517
  )
518
 
519
  # Load tokenizer
520
- tokenizer = AutoTokenizer.from_pretrained(
521
- model_args.model_name_or_path,
522
- use_fast=model_args.use_fast_tokenizer,
523
- )
 
 
 
 
 
524
 
525
  print(f"TPUs: {jax.device_count()}")
526
  assert jax.device_count() == 8, "TPUs in use, please check running processes"
527
 
528
- prefix = data_args.source_prefix if data_args.source_prefix is not None else ""
529
-
530
  # Preprocessing the datasets.
531
  # We need to tokenize inputs and targets.
532
 
@@ -543,7 +534,7 @@ def main():
543
  shifted_input_ids[:, 0] = decoder_start_token_id
544
  return shifted_input_ids
545
 
546
- text_normalizer = TextNormalizer() if data_args.normalize_text else None
547
 
548
  def normalize_text(example):
549
  example[text_column] = text_normalizer(example[text_column])
@@ -551,7 +542,6 @@ def main():
551
 
552
  def preprocess_function(examples):
553
  inputs = examples[text_column]
554
- inputs = [prefix + inp for inp in inputs] if prefix else inputs
555
  # Setting padding="max_length" as we need fixed length inputs for jitted functions
556
  model_inputs = tokenizer(
557
  inputs,
@@ -589,8 +579,15 @@ def main():
589
  else train_dataset.select(range(data_args.max_train_samples))
590
  )
591
  if data_args.streaming:
592
- train_dataset = train_dataset.shuffle(1000, training_args.seed)
593
- if data_args.normalize_text:
 
 
 
 
 
 
 
594
  train_dataset = (
595
  train_dataset.map(normalize_text)
596
  if data_args.streaming
@@ -627,7 +624,7 @@ def main():
627
  if data_args.streaming
628
  else eval_dataset.select(range(data_args.max_train_samples))
629
  )
630
- if data_args.normalize_text:
631
  eval_dataset = (
632
  eval_dataset.map(normalize_text)
633
  if data_args.streaming
@@ -655,7 +652,7 @@ def main():
655
  )
656
 
657
  # Initialize our training
658
- rng = jax.random.PRNGKey(training_args.seed)
659
  rng, dropout_rng = jax.random.split(rng)
660
 
661
  # Store some constant
@@ -665,35 +662,29 @@ def main():
665
  )
666
  batch_size_per_update = train_batch_size * training_args.gradient_accumulation_steps
667
  eval_batch_size = int(training_args.per_device_eval_batch_size) * jax.device_count()
 
668
  if data_args.streaming:
669
- len_train_dataset = data_args.len_train
670
- if (
671
- data_args.max_train_samples is not None
672
- and data_args.max_train_samples < len_train_dataset
673
- ):
674
  len_train_dataset = data_args.max_train_samples
675
-
676
- len_eval_dataset = data_args.len_eval
677
- if (
678
- data_args.max_eval_samples is not None
679
- and data_args.max_eval_samples < len_eval_dataset
680
- ):
681
  len_eval_dataset = data_args.max_eval_samples
682
  else:
683
  len_train_dataset = len(train_dataset)
684
  len_eval_dataset = len(eval_dataset)
685
- steps_per_epoch = len_train_dataset // train_batch_size
686
- total_steps = steps_per_epoch * num_epochs
687
- total_optimization_steps = (len_train_dataset // batch_size_per_update) * num_epochs
 
 
 
688
 
689
  # Create learning rate schedule
690
  learning_rate_fn = create_learning_rate_fn(
691
- len_train_dataset,
692
- train_batch_size,
693
- training_args.num_train_epochs,
694
  training_args.warmup_steps,
695
  training_args.learning_rate,
696
- data_args.no_decay,
 
697
  )
698
 
699
  # We use Optax's "masking" functionality to not apply weight decay
@@ -701,8 +692,6 @@ def main():
701
  # mask boolean with the same structure as the parameters.
702
  # The mask is True for parameters that should be decayed.
703
  # Note that this mask is specifically adapted for FlaxBart.
704
- # For FlaxT5, one should correct the layer norm parameter naming
705
- # accordingly - see `run_t5_mlm_flax.py` e.g.
706
  def decay_mask_fn(params):
707
  flat_params = traverse_util.flatten_dict(params)
708
  layer_norm_params = [
@@ -725,6 +714,9 @@ def main():
725
  # For more details about the parameters please check https://github.com/deepmind/optax/blob/ed02befef9bf81cbbf236be3d2b0e032e9ed4a40/optax/_src/alias.py#L74
726
  optimizer = optax.adafactor(
727
  learning_rate=learning_rate_fn,
 
 
 
728
  )
729
  else:
730
  optimizer = optax.adamw(
@@ -749,11 +741,10 @@ def main():
749
  tx=optimizer,
750
  dropout_rng=dropout_rng,
751
  )
752
- if model_args.from_checkpoint is not None:
753
- # restore optimizer state and step
 
754
  state = state.restore_state(artifact_dir)
755
- # TODO: number of remaining training epochs/steps and dataloader state need to be adjusted
756
- # TODO: optimizer may use a different step for learning rate, we should serialize/restore entire state
757
 
758
  # label smoothed cross entropy
759
  def loss_fn(logits, labels):
@@ -762,7 +753,7 @@ def main():
762
  return loss
763
 
764
  # Define gradient update step fn
765
- def train_step(state, batch):
766
  dropout_rng, new_dropout_rng = jax.random.split(state.dropout_rng)
767
 
768
  def compute_loss(params, batch):
@@ -776,14 +767,20 @@ def main():
776
  grad_fn = jax.value_and_grad(compute_loss)
777
  loss, grads = grad_fn(state.params, batch)
778
  grads = jax.lax.pmean(grads, "batch")
779
- state = state.apply_gradients(grads=grads)
 
 
 
 
 
780
 
781
  metrics = {
782
  "loss": loss,
783
  "learning_rate": learning_rate_fn(state.step),
784
  }
785
  metrics = jax.lax.pmean(metrics, axis_name="batch")
786
- return state.replace(dropout_rng=new_dropout_rng), metrics
 
787
 
788
  # Define eval fn
789
  def eval_step(params, batch):
@@ -800,10 +797,6 @@ def main():
800
  p_train_step = jax.pmap(train_step, "batch", donate_argnums=(0,))
801
  p_eval_step = jax.pmap(eval_step, "batch")
802
 
803
- # Replicate the train state on each device
804
- del model._params
805
- state = state.replicate()
806
-
807
  logger.info("***** Running training *****")
808
  logger.info(f" Num examples = {len_train_dataset}")
809
  logger.info(f" Num Epochs = {num_epochs}")
@@ -813,13 +806,12 @@ def main():
813
  logger.info(
814
  f" Total train batch size (w. parallel, distributed & gradient accumulation) = {batch_size_per_update}"
815
  )
816
- logger.info(f" Total global steps = {total_steps}")
817
- logger.info(f" Total optimization steps = {total_optimization_steps}")
818
-
819
- epochs = tqdm(range(num_epochs), desc=f"Epoch ... (1/{num_epochs})", position=0)
820
 
821
  # set default x-axis as 'train/step'
822
- wandb_log({}, step=unreplicate(state.step))
823
  wandb.define_metric("*", step_metric="train/step")
824
 
825
  # add interesting config parameters
@@ -828,11 +820,12 @@ def main():
828
  "len_train": len_train_dataset,
829
  "len_eval": len_eval_dataset,
830
  "batch_size_per_update": batch_size_per_update,
831
- "total_steps": total_steps,
832
- "total_optimization_steps": total_optimization_steps,
833
  }
834
  )
835
 
 
 
 
836
  def run_evaluation():
837
  # ======================== Evaluating ==============================
838
  eval_metrics = []
@@ -840,8 +833,12 @@ def main():
840
  if data_args.streaming:
841
  eval_loader = data_loader_streaming(eval_dataset, eval_batch_size)
842
  else:
843
- eval_loader = data_loader(input_rng, eval_dataset, eval_batch_size)
844
- eval_steps = len_eval_dataset // eval_batch_size
 
 
 
 
845
  for batch in tqdm(
846
  eval_loader,
847
  desc="Evaluating...",
@@ -867,10 +864,9 @@ def main():
867
 
868
  return eval_metrics
869
 
870
- def run_save_model(state, step, epoch, eval_metrics=None):
871
  if jax.process_index() == 0:
872
- params = jax.device_get(jax.tree_map(lambda x: x[0], state.params))
873
-
874
  # save model locally
875
  model.save_pretrained(
876
  training_args.output_dir,
@@ -881,24 +877,30 @@ def main():
881
  tokenizer.save_pretrained(training_args.output_dir)
882
 
883
  # save state
884
- # TODO: maybe we should just save the full state object without params
885
- state = unreplicate(state)
886
  with (Path(training_args.output_dir) / "opt_state.msgpack").open("wb") as f:
887
- f.write(to_bytes(state.opt_state))
 
 
 
 
888
  with (Path(training_args.output_dir) / "training_state.json").open(
889
  "w"
890
  ) as f:
891
- json.dump({"step": state.step.item()}, f)
 
 
 
892
 
893
  # save to W&B
894
- if data_args.log_model:
895
  # save some space
896
  c = wandb.wandb_sdk.wandb_artifacts.get_artifacts_cache()
897
- c.cleanup(wandb.util.from_human_size("5GB"))
898
 
899
- metadata = {"step": step, "epoch": epoch}
900
  if eval_metrics is not None:
901
- metadata["eval/loss"] = eval_metrics["loss"]
902
  artifact = wandb.Artifact(
903
  name=f"model-{wandb.run.id}", type="bart_model", metadata=metadata
904
  )
@@ -932,24 +934,26 @@ def main():
932
  training_args.output_dir,
933
  params=params,
934
  push_to_hub=training_args.push_to_hub,
935
- commit_message=f"Saving weights and logs of epoch {epoch+1}",
936
  temp_dir=True, # avoid issues with being in a repository
937
  )
938
 
 
 
 
 
939
  for epoch in epochs:
 
940
  # ======================== Training ================================
941
- step = unreplicate(state.step)
942
- wandb_log({"train/epoch": epoch}, step=step)
943
 
944
  # Generate an epoch by shuffling sampling indices from the train dataset
945
  if data_args.streaming:
946
- train_dataset.set_epoch(epoch)
947
  train_loader = data_loader_streaming(train_dataset, train_batch_size)
948
  else:
949
- rng, input_rng = jax.random.split(rng)
950
- train_loader = data_loader(
951
- input_rng, train_dataset, train_batch_size, shuffle=True
952
- )
953
  # train
954
  for batch in tqdm(
955
  train_loader,
@@ -958,32 +962,49 @@ def main():
958
  leave=False,
959
  total=steps_per_epoch,
960
  ):
961
- state, train_metric = p_train_step(state, batch)
 
 
 
 
 
 
 
 
 
962
  step = unreplicate(state.step)
963
 
964
- if step % data_args.log_interval == 0 and jax.process_index() == 0:
965
  # log metrics
966
  wandb_log(unreplicate(train_metric), step=step, prefix="train")
967
-
 
 
 
 
 
 
 
968
  if training_args.eval_steps and step % training_args.eval_steps == 0:
969
- run_evaluation()
970
 
971
- if step % data_args.save_model_steps == 0:
972
- run_save_model(state, step, epoch)
973
 
974
  # log final train metrics
975
- wandb_log(unreplicate(train_metric), step=step, prefix="train")
 
 
976
 
977
- train_metric = unreplicate(train_metric)
978
- epochs.write(
979
- f"Epoch... ({epoch + 1}/{num_epochs} | Loss: {train_metric['loss']}, Learning Rate: {train_metric['learning_rate']})"
980
- )
981
 
982
  # Final evaluation
983
  eval_metrics = run_evaluation()
984
 
985
  # save checkpoint after each epoch
986
- run_save_model(state, state.step, epoch, eval_metrics)
987
 
988
 
989
  if __name__ == "__main__":
17
  Fine-tuning the library models for seq2seq, text to image.
18
  Script adapted from run_summarization_flax.py
19
  """
 
20
 
21
  import os
22
+ import logging
23
  import sys
24
+ import time
25
  from dataclasses import dataclass, field
26
  from pathlib import Path
27
  from typing import Callable, Optional
38
  import transformers
39
  from flax import jax_utils, traverse_util
40
  from flax.serialization import from_bytes, to_bytes
 
41
  from flax.jax_utils import unreplicate
42
  from flax.training import train_state
43
  from flax.training.common_utils import get_metrics, onehot, shard, shard_prng_key
44
  from transformers import (
45
  AutoTokenizer,
 
46
  HfArgumentParser,
 
47
  )
48
+ from transformers.models.bart.modeling_flax_bart import BartConfig
49
 
50
  import wandb
51
 
52
  from dalle_mini.text import TextNormalizer
53
+ from dalle_mini.model import CustomFlaxBartForConditionalGeneration
54
 
55
+ logger = logging.getLogger(__name__)
 
 
 
 
 
 
 
 
56
 
57
 
58
  @dataclass
62
  """
63
 
64
  model_name_or_path: Optional[str] = field(
65
+ default=None,
66
  metadata={
67
  "help": "The model checkpoint for weights initialization."
68
  "Don't set if you want to train a model from scratch."
69
  },
70
  )
71
+ image_vocab_size: Optional[int] = field(
72
  default=None,
73
+ metadata={"help": "Vocab size of image encoder"},
 
 
74
  )
75
+ image_length: Optional[int] = field(
76
+ default=None,
77
+ metadata={"help": "Number of tokens per image"},
78
+ )
79
+ tokenizer_name: Optional[str] = field(
80
+ default=None,
81
  metadata={
82
+ "help": "Pretrained tokenizer name or path if not the same as model_name_or_path"
83
  },
84
  )
85
+ normalize_text: bool = field(
86
+ default=False,
87
+ metadata={"help": "Whether to normalize text or not."},
88
+ )
89
  dtype: Optional[str] = field(
90
  default="float32",
91
  metadata={
92
  "help": "Floating-point format in which the model weights should be initialized and trained. Choose one of `[float32, float16, bfloat16]`."
93
  },
94
  )
 
 
 
 
 
 
95
 
96
 
97
  @dataclass
129
  default=False,
130
  metadata={"help": "Whether to stream the dataset."},
131
  )
132
+ use_auth_token: bool = field(
133
+ default=False,
134
+ metadata={
135
+ "help": "Whether to use the authentication token for private datasets."
136
+ },
 
 
137
  )
138
  max_source_length: Optional[int] = field(
139
  default=128,
142
  "than this will be truncated, sequences shorter will be padded."
143
  },
144
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
145
  max_train_samples: Optional[int] = field(
146
  default=None,
147
  metadata={
156
  "value if set."
157
  },
158
  )
 
 
 
 
159
  preprocessing_num_workers: Optional[int] = field(
 
 
 
 
160
  default=None,
161
  metadata={
162
+ "help": "The number of processes to use for the preprocessing. Not used in streaming mode."
163
  },
164
  )
165
  overwrite_cache: bool = field(
166
  default=False,
167
+ metadata={
168
+ "help": "Overwrite the cached training and evaluation sets. Not used in streaming mode."
169
+ },
170
+ )
171
+
172
+ def __post_init__(self):
173
+ if self.dataset_repo_or_path is None:
174
+ raise ValueError("Need a dataset repository or path.")
175
+
176
+
177
+ @dataclass
178
+ class TrainingArguments:
179
+ """
180
+ Arguments pertaining to training parameters.
181
+ """
182
+
183
+ output_dir: str = field(
184
+ metadata={
185
+ "help": "The output directory where the model predictions and checkpoints will be written."
186
+ },
187
+ )
188
+ overwrite_output_dir: bool = field(
189
+ default=False,
190
+ metadata={
191
+ "help": (
192
+ "Overwrite the content of the output directory. "
193
+ "Use this to continue training if output_dir points to a checkpoint directory."
194
+ )
195
+ },
196
+ )
197
+
198
+ do_train: bool = field(default=False, metadata={"help": "Whether to run training."})
199
+ do_eval: bool = field(
200
+ default=False, metadata={"help": "Whether to run eval on the dev set."}
201
+ )
202
+
203
+ per_device_train_batch_size: int = field(
204
+ default=8, metadata={"help": "Batch size per GPU/TPU core/CPU for training."}
205
+ )
206
+ per_device_eval_batch_size: int = field(
207
+ default=8, metadata={"help": "Batch size per GPU/TPU core/CPU for evaluation."}
208
+ )
209
+
210
+ gradient_accumulation_steps: int = field(
211
+ default=1,
212
+ metadata={
213
+ "help": "Number of updates steps to accumulate before performing a backward/update pass."
214
+ },
215
+ )
216
+
217
+ learning_rate: float = field(
218
+ default=5e-5, metadata={"help": "The initial learning rate."}
219
+ )
220
+ adafactor: bool = field(
221
+ default=False,
222
+ metadata={"help": "Whether or not to replace AdamW by Adafactor."},
223
+ )
224
+ weight_decay: float = field(
225
+ default=None, metadata={"help": "Weight decay if we apply some."}
226
+ )
227
+ adam_beta1: float = field(
228
+ default=0.9, metadata={"help": "Beta1 for AdamW optimizer"}
229
+ )
230
+ adam_beta2: float = field(
231
+ default=0.999, metadata={"help": "Beta2 for AdamW optimizer"}
232
+ )
233
+ adam_epsilon: float = field(
234
+ default=1e-8, metadata={"help": "Epsilon for AdamW optimizer."}
235
+ )
236
+ max_grad_norm: float = field(
237
+ default=1.0, metadata={"help": "Max gradient norm for Adafactor."}
238
+ )
239
+ use_decay: bool = field(
240
+ default=False,
241
+ metadata={"help": "Whether to use decay in the learning rate scheduler."},
242
+ )
243
+
244
+ num_train_epochs: float = field(
245
+ default=3.0, metadata={"help": "Total number of training epochs to perform."}
246
+ )
247
+ warmup_steps: int = field(
248
+ default=0, metadata={"help": "Linear warmup over warmup_steps."}
249
+ )
250
+
251
+ logging_steps: int = field(
252
+ default=40, metadata={"help": "Log every X updates steps."}
253
+ )
254
+ eval_steps: int = field(
255
+ default=400, metadata={"help": "Run an evaluation every X steps."}
256
  )
257
+ save_steps: int = field(
258
+ default=4000, metadata={"help": "Save checkpoint every X updates steps."}
 
259
  )
260
  log_model: bool = field(
261
  default=False,
262
+ metadata={"help": "Log model to wandb at `save_steps` frequency."},
263
  )
264
+
265
+ seed_model: int = field(
266
+ default=42,
267
+ metadata={
268
+ "help": "Random seed for the model that will be set at the beginning of training."
269
+ },
270
+ )
271
+ # default seed of None ensures we don't repeat the same items if script was interrupted during an epoch
272
+ seed_dataset: int = field(
273
+ default=None,
274
  metadata={
275
+ "help": "Random seed for the dataset that will be set at the beginning of training."
276
  },
277
  )
278
 
279
+ push_to_hub: bool = field(
280
+ default=False,
281
+ metadata={
282
+ "help": "Whether or not to upload the trained model to the model hub after training."
283
+ },
284
+ )
285
+
286
+ resume_from_wandb_checkpoint: Optional[str] = field(
287
+ default=None,
288
+ metadata={"help": "The reference to a wandb artifact for resuming training."},
289
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
290
 
291
 
292
  class TrainState(train_state.TrainState):
293
  dropout_rng: jnp.ndarray = None
294
+ epoch: int = 0
295
+ train_time: float = 0.0 # total time the model trained
296
+ train_samples: int = 0 # number of samples seen
297
 
298
  def replicate(self):
299
  return jax_utils.replicate(self).replace(
305
  with (Path(artifact_dir) / "opt_state.msgpack").open("rb") as f:
306
  new_opt_state = from_bytes(self.opt_state, f.read())
307
 
308
+ # restore other parameters
309
  with (Path(artifact_dir) / "training_state.json").open("r") as f:
310
  training_state = json.load(f)
 
311
 
312
  # replace state
313
+ return self.replace(
314
+ opt_state=new_opt_state,
315
+ step=training_state["step"],
316
+ train_time=training_state["train_time"],
317
+ train_samples=training_state["train_samples"],
 
 
 
 
 
 
318
  )
319
 
320
 
321
  def data_loader(
322
+ dataset: Dataset,
323
+ batch_size: int,
324
+ rng: jax.random.PRNGKey = None,
325
  ):
326
  """
327
  Returns batches of size `batch_size` from truncated `dataset`, sharded over all local devices.
329
  """
330
  steps_per_epoch = len(dataset) // batch_size
331
 
332
+ if rng is not None:
333
  batch_idx = jax.random.permutation(rng, len(dataset))
334
  else:
335
  batch_idx = jnp.arange(len(dataset))
358
 
359
 
360
  def create_learning_rate_fn(
 
 
 
361
  num_warmup_steps: int,
362
  learning_rate: float,
363
+ use_decay: bool,
364
+ num_train_steps: int = None, # used only with `use_decay`, typically train_size // batch_size * num_epochs
365
  ) -> Callable[[int], jnp.array]:
366
  """Returns a linear warmup, linear_decay learning rate function."""
367
+ if use_decay:
368
+ assert (
369
+ num_train_steps is not None
370
+ ), "Learning rate with decay requires number of training steps"
371
  warmup_fn = optax.linear_schedule(
372
  init_value=0.0, end_value=learning_rate, transition_steps=num_warmup_steps
373
  )
374
+ if not use_decay:
375
  return warmup_fn
376
  decay_fn = optax.linear_schedule(
377
  init_value=learning_rate,
395
 
396
 
397
  def main():
398
+ # See all possible arguments by passing the --help flag to this script.
 
 
 
399
  parser = HfArgumentParser(
400
  (ModelArguments, DataTrainingArguments, TrainingArguments)
401
  )
420
  )
421
 
422
  # Make one log on every process with the configuration for debugging.
423
+ logging.basicConfig(
424
  format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
425
  datefmt="%m/%d/%Y %H:%M:%S",
426
+ level=logging.INFO,
427
  )
428
  # Setup logging, we only want one process per machine to log things on the screen.
429
+ logger.setLevel(logging.INFO if jax.process_index() == 0 else logging.ERROR)
430
  if jax.process_index() == 0:
431
  datasets.utils.logging.set_verbosity_warning()
432
  transformers.utils.logging.set_verbosity_info()
437
  # Set the verbosity to info of the Transformers logger (on main process only):
438
  logger.info(f"Training/evaluation parameters {training_args}")
439
 
440
+ # Load dataset
441
+ if data_args.train_file is not None or data_args.validation_file is not None:
442
+ data_files = {
443
+ "train": data_args.train_file,
444
+ "validation": data_args.validation_file,
445
+ }
446
+ else:
447
+ data_files = None
448
  dataset = load_dataset(
449
  data_args.dataset_repo_or_path,
450
  data_files=data_files,
451
  streaming=data_args.streaming,
452
+ use_auth_token=data_args.use_auth_token,
453
  )
454
 
455
  # Set up wandb run
458
  project="dalle-mini",
459
  job_type="Seq2Seq",
460
  config=parser.parse_args(),
 
461
  )
462
 
463
+ if training_args.resume_from_wandb_checkpoint is not None:
464
+ artifact = wandb.run.use_artifact(training_args.resume_from_wandb_checkpoint)
465
  artifact_dir = artifact.download()
466
+
467
+ # load model
468
  model = CustomFlaxBartForConditionalGeneration.from_pretrained(artifact_dir)
469
 
470
  # load tokenizer
471
  tokenizer = AutoTokenizer.from_pretrained(
472
  artifact_dir,
473
+ use_fast=True,
474
  )
475
 
476
  else:
477
  # Set up our new model config
478
+ # TODO: simplify with custom config class
479
  config = BartConfig.from_pretrained(model_args.model_name_or_path)
480
+ config.image_vocab_size = model_args.image_vocab_size
481
+ config.image_length = model_args.image_length
482
+ # we append decoder bos to image vocab
483
+ config.decoder_start_token_id = config.image_vocab_size
484
+ # ensure we don't generate bos (in addition to decoder start token)
485
+ config.force_bos_token_to_be_generated = False
 
 
 
486
  config.forced_bos_token_id = None # we don't need this token
487
  config.forced_eos_token_id = None # we don't need this token
488
+
489
+ config.tie_word_embeddings = False
490
+ config.min_length = model_args.image_length + 1
491
+ config.max_length = model_args.image_length + 1
492
+
493
+ # below tokens need to be set to avoid error during generation (converted to jnp.array)
494
+ # they are not expected to be used and are set to unreachable token id
495
+ config.bos_token_id = config.image_vocab_size + 1
496
+ config.pos_token_id = config.image_vocab_size + 1
497
+ config.eos_token_id = config.image_vocab_size + 1
498
+
499
+ # save whether we normalize the text
500
+ config.normalize_text = model_args.normalize_text
501
 
502
  # Create a custom model and initialize it randomly
503
  model = CustomFlaxBartForConditionalGeneration(
504
+ config, seed=training_args.seed_model, dtype=getattr(jnp, model_args.dtype)
505
  )
506
 
507
  # Load tokenizer
508
+ if model_args.tokenizer_name is not None:
509
+ tokenizer = AutoTokenizer.from_pretrained(
510
+ model_args.tokenizer_name, use_fast=True
511
+ )
512
+ else:
513
+ tokenizer = AutoTokenizer.from_pretrained(
514
+ model_args.model_name_or_path,
515
+ use_fast=True,
516
+ )
517
 
518
  print(f"TPUs: {jax.device_count()}")
519
  assert jax.device_count() == 8, "TPUs in use, please check running processes"
520
 
 
 
521
  # Preprocessing the datasets.
522
  # We need to tokenize inputs and targets.
523
 
534
  shifted_input_ids[:, 0] = decoder_start_token_id
535
  return shifted_input_ids
536
 
537
+ text_normalizer = TextNormalizer() if model.config.normalize_text else None
538
 
539
  def normalize_text(example):
540
  example[text_column] = text_normalizer(example[text_column])
542
 
543
  def preprocess_function(examples):
544
  inputs = examples[text_column]
 
545
  # Setting padding="max_length" as we need fixed length inputs for jitted functions
546
  model_inputs = tokenizer(
547
  inputs,
579
  else train_dataset.select(range(data_args.max_train_samples))
580
  )
581
  if data_args.streaming:
582
+ train_dataset = train_dataset.shuffle(1000, training_args.seed_dataset)
583
+ else:
584
+ seed_dataset = (
585
+ training_args.seed_dataset
586
+ if training_args.seed_dataset is not None
587
+ else np.random.get_state()[1][0]
588
+ )
589
+ rng_dataset = jax.random.PRNGKey(seed_dataset)
590
+ if model.config.normalize_text:
591
  train_dataset = (
592
  train_dataset.map(normalize_text)
593
  if data_args.streaming
624
  if data_args.streaming
625
  else eval_dataset.select(range(data_args.max_train_samples))
626
  )
627
+ if model.config.normalize_text:
628
  eval_dataset = (
629
  eval_dataset.map(normalize_text)
630
  if data_args.streaming
652
  )
653
 
654
  # Initialize our training
655
+ rng = jax.random.PRNGKey(training_args.seed_model)
656
  rng, dropout_rng = jax.random.split(rng)
657
 
658
  # Store some constant
662
  )
663
  batch_size_per_update = train_batch_size * training_args.gradient_accumulation_steps
664
  eval_batch_size = int(training_args.per_device_eval_batch_size) * jax.device_count()
665
+ len_train_dataset, len_eval_dataset = None, None
666
  if data_args.streaming:
667
+ # we don't know the length, let's just assume max_samples if defined
668
+ if data_args.max_train_samples is not None:
 
 
 
669
  len_train_dataset = data_args.max_train_samples
670
+ if data_args.max_eval_samples is not None:
 
 
 
 
 
671
  len_eval_dataset = data_args.max_eval_samples
672
  else:
673
  len_train_dataset = len(train_dataset)
674
  len_eval_dataset = len(eval_dataset)
675
+ steps_per_epoch = (
676
+ len_train_dataset // train_batch_size if len_train_dataset is not None else None
677
+ )
678
+ num_train_steps = (
679
+ steps_per_epoch * num_epochs if steps_per_epoch is not None else None
680
+ )
681
 
682
  # Create learning rate schedule
683
  learning_rate_fn = create_learning_rate_fn(
 
 
 
684
  training_args.warmup_steps,
685
  training_args.learning_rate,
686
+ training_args.use_decay,
687
+ num_train_steps,
688
  )
689
 
690
  # We use Optax's "masking" functionality to not apply weight decay
692
  # mask boolean with the same structure as the parameters.
693
  # The mask is True for parameters that should be decayed.
694
  # Note that this mask is specifically adapted for FlaxBart.
 
 
695
  def decay_mask_fn(params):
696
  flat_params = traverse_util.flatten_dict(params)
697
  layer_norm_params = [
714
  # For more details about the parameters please check https://github.com/deepmind/optax/blob/ed02befef9bf81cbbf236be3d2b0e032e9ed4a40/optax/_src/alias.py#L74
715
  optimizer = optax.adafactor(
716
  learning_rate=learning_rate_fn,
717
+ weight_decay_rate=training_args.weight_decay,
718
+ weight_decay_mask=decay_mask_fn,
719
+ clipping_threshold=training_args.max_grad_norm,
720
  )
721
  else:
722
  optimizer = optax.adamw(
741
  tx=optimizer,
742
  dropout_rng=dropout_rng,
743
  )
744
+ if training_args.resume_from_wandb_checkpoint is not None:
745
+ # restore optimizer state and other parameters
746
+ # we currently ignore partial epoch training: see https://github.com/borisdayma/dalle-mini/issues/105
747
  state = state.restore_state(artifact_dir)
 
 
748
 
749
  # label smoothed cross entropy
750
  def loss_fn(logits, labels):
753
  return loss
754
 
755
  # Define gradient update step fn
756
+ def train_step(state, batch, delta_time):
757
  dropout_rng, new_dropout_rng = jax.random.split(state.dropout_rng)
758
 
759
  def compute_loss(params, batch):
767
  grad_fn = jax.value_and_grad(compute_loss)
768
  loss, grads = grad_fn(state.params, batch)
769
  grads = jax.lax.pmean(grads, "batch")
770
+ state = state.apply_gradients(
771
+ grads=grads,
772
+ dropout_rng=new_dropout_rng,
773
+ train_time=state.train_time + delta_time,
774
+ train_samples=state.train_samples + train_batch_size,
775
+ )
776
 
777
  metrics = {
778
  "loss": loss,
779
  "learning_rate": learning_rate_fn(state.step),
780
  }
781
  metrics = jax.lax.pmean(metrics, axis_name="batch")
782
+
783
+ return state, metrics
784
 
785
  # Define eval fn
786
  def eval_step(params, batch):
797
  p_train_step = jax.pmap(train_step, "batch", donate_argnums=(0,))
798
  p_eval_step = jax.pmap(eval_step, "batch")
799
 
 
 
 
 
800
  logger.info("***** Running training *****")
801
  logger.info(f" Num examples = {len_train_dataset}")
802
  logger.info(f" Num Epochs = {num_epochs}")
806
  logger.info(
807
  f" Total train batch size (w. parallel, distributed & gradient accumulation) = {batch_size_per_update}"
808
  )
809
+ epochs = tqdm(
810
+ range(state.epoch, num_epochs), desc=f"Epoch ... (1/{num_epochs})", position=0
811
+ )
 
812
 
813
  # set default x-axis as 'train/step'
814
+ wandb_log({}, step=state.step)
815
  wandb.define_metric("*", step_metric="train/step")
816
 
817
  # add interesting config parameters
820
  "len_train": len_train_dataset,
821
  "len_eval": len_eval_dataset,
822
  "batch_size_per_update": batch_size_per_update,
 
 
823
  }
824
  )
825
 
826
+ # replicate state on each device
827
+ state = state.replicate()
828
+
829
  def run_evaluation():
830
  # ======================== Evaluating ==============================
831
  eval_metrics = []
833
  if data_args.streaming:
834
  eval_loader = data_loader_streaming(eval_dataset, eval_batch_size)
835
  else:
836
+ eval_loader = data_loader(eval_dataset, eval_batch_size)
837
+ eval_steps = (
838
+ len_eval_dataset // eval_batch_size
839
+ if len_eval_dataset is not None
840
+ else None
841
+ )
842
  for batch in tqdm(
843
  eval_loader,
844
  desc="Evaluating...",
864
 
865
  return eval_metrics
866
 
867
+ def run_save_model(state, eval_metrics=None):
868
  if jax.process_index() == 0:
869
+ params = jax.device_get(unreplicate(state.params))
 
870
  # save model locally
871
  model.save_pretrained(
872
  training_args.output_dir,
877
  tokenizer.save_pretrained(training_args.output_dir)
878
 
879
  # save state
880
+ opt_state = unreplicate(state.opt_state)
 
881
  with (Path(training_args.output_dir) / "opt_state.msgpack").open("wb") as f:
882
+ f.write(to_bytes(opt_state))
883
+ state_dict = {
884
+ k: jax.device_get(unreplicate(getattr(state, k))).item()
885
+ for k in ["step", "epoch", "train_time", "train_samples"]
886
+ }
887
  with (Path(training_args.output_dir) / "training_state.json").open(
888
  "w"
889
  ) as f:
890
+ json.dump(
891
+ state_dict,
892
+ f,
893
+ )
894
 
895
  # save to W&B
896
+ if training_args.log_model:
897
  # save some space
898
  c = wandb.wandb_sdk.wandb_artifacts.get_artifacts_cache()
899
+ c.cleanup(wandb.util.from_human_size("10GB"))
900
 
901
+ metadata = dict(state_dict)
902
  if eval_metrics is not None:
903
+ metadata["eval"] = eval_metrics
904
  artifact = wandb.Artifact(
905
  name=f"model-{wandb.run.id}", type="bart_model", metadata=metadata
906
  )
934
  training_args.output_dir,
935
  params=params,
936
  push_to_hub=training_args.push_to_hub,
937
+ commit_message=f"Saving weights and logs at step {unreplicate(state.step)+1}",
938
  temp_dir=True, # avoid issues with being in a repository
939
  )
940
 
941
+ # init variables
942
+ last_time = time.perf_counter()
943
+ train_metric = None
944
+
945
  for epoch in epochs:
946
+ state.replace(epoch=jax_utils.replicate(epoch))
947
  # ======================== Training ================================
948
+ wandb_log({"train/epoch": epoch}, step=unreplicate(state.step))
 
949
 
950
  # Generate an epoch by shuffling sampling indices from the train dataset
951
  if data_args.streaming:
952
+ train_dataset.set_epoch(epoch) # shuffle dataset
953
  train_loader = data_loader_streaming(train_dataset, train_batch_size)
954
  else:
955
+ rng_dataset, input_rng = jax.random.split(rng_dataset)
956
+ train_loader = data_loader(train_dataset, train_batch_size, rng=input_rng)
 
 
957
  # train
958
  for batch in tqdm(
959
  train_loader,
962
  leave=False,
963
  total=steps_per_epoch,
964
  ):
965
+
966
+ # calculate delta time (we have a lag of one step but it's ok)
967
+ new_time = time.perf_counter()
968
+ delta_time = new_time - last_time
969
+ last_time = new_time
970
+
971
+ # train step
972
+ state, train_metric = p_train_step(
973
+ state, batch, jax_utils.replicate(delta_time)
974
+ )
975
  step = unreplicate(state.step)
976
 
977
+ if step % training_args.logging_steps == 0 and jax.process_index() == 0:
978
  # log metrics
979
  wandb_log(unreplicate(train_metric), step=step, prefix="train")
980
+ # log state parameters
981
+ state_dict = {
982
+ k.split("_")[-1]: unreplicate(getattr(state, k))
983
+ for k in ["epoch", "train_time", "train_samples"]
984
+ }
985
+ wandb_log(state_dict, step=step, prefix="train")
986
+
987
+ eval_metrics = None
988
  if training_args.eval_steps and step % training_args.eval_steps == 0:
989
+ eval_metrics = run_evaluation()
990
 
991
+ if step % training_args.save_steps == 0:
992
+ run_save_model(state, eval_metrics)
993
 
994
  # log final train metrics
995
+ if train_metric is not None:
996
+ train_metric = unreplicate(train_metric)
997
+ wandb_log(train_metric, step=step, prefix="train")
998
 
999
+ epochs.write(
1000
+ f"Epoch... ({epoch + 1}/{num_epochs} | Loss: {train_metric['loss']}, Learning Rate: {train_metric['learning_rate']})"
1001
+ )
 
1002
 
1003
  # Final evaluation
1004
  eval_metrics = run_evaluation()
1005
 
1006
  # save checkpoint after each epoch
1007
+ run_save_model(state, eval_metrics)
1008
 
1009
 
1010
  if __name__ == "__main__":
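
For context on the seq2seq script changes above: create_learning_rate_fn now takes use_decay and an optional num_train_steps instead of deriving the step count from dataset length and epochs. A self-contained sketch of that warmup-then-optional-linear-decay shape with optax; the exact decay tail (end value and boundary) is an assumption, not a verbatim copy of the script:

# Sketch of a linear warmup + optional linear decay schedule (optax).
import optax

def make_schedule(learning_rate, num_warmup_steps, use_decay, num_train_steps=None):
    warmup_fn = optax.linear_schedule(
        init_value=0.0, end_value=learning_rate, transition_steps=num_warmup_steps
    )
    if not use_decay:
        return warmup_fn
    assert num_train_steps is not None, "decay requires the total number of training steps"
    decay_fn = optax.linear_schedule(
        init_value=learning_rate,
        end_value=0.0,
        transition_steps=num_train_steps - num_warmup_steps,
    )
    return optax.join_schedules(
        schedules=[warmup_fn, decay_fn], boundaries=[num_warmup_steps]
    )

# usage: lr_fn = make_schedule(5e-5, 1000, use_decay=True, num_train_steps=50_000); lr_fn(step)
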
setup.cfg CHANGED
@@ -12,5 +12,7 @@ project_urls =
 packages = find:
 install_requires =
     transformers
+    unidecode
+    ftfy
     jax
     flax
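
The two new dependencies back the text-normalization path (dalle_mini.text.TextNormalizer, enabled via --normalize_text in the training script). A hypothetical sketch of that style of caption normalization, not the actual TextNormalizer implementation:

# Hypothetical caption normalization using the newly added dependencies.
import ftfy
from unidecode import unidecode

def normalize_caption(text: str) -> str:
    text = ftfy.fix_text(text)              # repair mojibake and bad unicode
    text = unidecode(text)                  # transliterate to plain ASCII
    return " ".join(text.lower().split())   # lowercase and collapse whitespace

print(normalize_caption("Ángel  rides a bike"))  # -> "angel rides a bike"
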