aapot committed
Commit 9da3171
1 Parent(s): b152c5e

Saving weights and logs of step 10000

config.json CHANGED
@@ -19,7 +19,7 @@
   "num_hidden_layers": 24,
   "pad_token_id": 1,
   "position_embedding_type": "absolute",
-  "transformers_version": "4.13.0.dev0",
+  "transformers_version": "4.11.0",
   "type_vocab_size": 1,
   "use_cache": true,
   "vocab_size": 50265
events.out.tfevents.1637651508.t1v-n-8eba1090-w-0.74811.0.v2 → events.out.tfevents.1637788246.t1v-n-8eba1090-w-0.278309.0.v2 RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a698de76a6eef179b50ae1f446a42233b23305d204aea21414c9c719c958894a
-size 8912195
+oid sha256:cedc456912f39cab6a93851a70164212aeff38666e4e7d06802401d8ff4983c9
+size 1470757
flax_model.msgpack CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3da37fdf3d6ea94d5fcc73090e445b143f824569cf20c9d2a5779a5566dd3c7d
-size 1421662309
+oid sha256:f89a2f1cb697fef6bf98fda870fd214efe0fb3874f01fdc75e5beaed3bef05d0
+size 711588089
run_mlm_flax.py CHANGED
@@ -16,7 +16,6 @@
 """
 Fine-tuning the library models for masked language modeling (BERT, ALBERT, RoBERTa...) with whole word masking on a
 text file or a dataset.
-
 Here is the full list of checkpoints on the hub that can be fine-tuned by this script:
 https://huggingface.co/models?filter=masked-lm
 """
@@ -25,15 +24,12 @@ import os
 import sys
 import time
 from dataclasses import dataclass, field
-
 # You can also adapt this script on your own masked language modeling task. Pointers for this are left as comments.
 from pathlib import Path
 from typing import Dict, List, Optional, Tuple
-
 import numpy as np
 from datasets import load_dataset, load_from_disk
 from tqdm import tqdm
-
 import flax
 import jax
 import jax.numpy as jnp
@@ -41,7 +37,6 @@ import optax
 from flax import jax_utils, traverse_util
 from flax.training import train_state
 from flax.training.common_utils import get_metrics, onehot, shard
-from huggingface_hub import Repository
 from transformers import (
     CONFIG_MAPPING,
     FLAX_MODEL_FOR_MASKED_LM_MAPPING,
@@ -55,19 +50,13 @@ from transformers import (
     is_tensorboard_available,
     set_seed,
 )
-from transformers.file_utils import get_full_repo_name
-
-
 MODEL_CONFIG_CLASSES = list(FLAX_MODEL_FOR_MASKED_LM_MAPPING.keys())
 MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
-
-
 @dataclass
 class ModelArguments:
     """
     Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch.
     """
-
     model_name_or_path: Optional[str] = field(
         default=None,
         metadata={
@@ -98,14 +87,11 @@ class ModelArguments:
             "help": "Floating-point format in which the model weights should be initialized and trained. Choose one of `[float32, float16, bfloat16]`."
         },
     )
-
-
 @dataclass
 class DataTrainingArguments:
     """
     Arguments pertaining to what data we are going to input our model for training and eval.
     """
-
     dataset_name: Optional[str] = field(
         default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
     )
@@ -168,7 +154,6 @@ class DataTrainingArguments:
         default=False,
         metadata={"help": "Whether distinct lines of text in the dataset are to be handled as distinct sequences."},
     )
-
     def __post_init__(self):
         if self.dataset_name is None and self.train_file is None and self.dataset_filepath is None and self.validation_file is None:
             raise ValueError("Need either a dataset name or a training/validation file.")
@@ -179,50 +164,39 @@ class DataTrainingArguments:
         if self.validation_file is not None:
             extension = self.validation_file.split(".")[-1]
             assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file."
-
-
 @flax.struct.dataclass
 class FlaxDataCollatorForLanguageModeling:
     """
     Data collator used for language modeling. Inputs are dynamically padded to the maximum length of a batch if they
     are not all of the same length.
-
     Args:
         tokenizer (:class:`~transformers.PreTrainedTokenizer` or :class:`~transformers.PreTrainedTokenizerFast`):
            The tokenizer used for encoding the data.
        mlm_probability (:obj:`float`, `optional`, defaults to 0.15):
            The probability with which to (randomly) mask tokens in the input.
-
    .. note::
-
        For best performance, this data collator should be used with a dataset having items that are dictionaries or
        BatchEncoding, with the :obj:`"special_tokens_mask"` key, as returned by a
        :class:`~transformers.PreTrainedTokenizer` or a :class:`~transformers.PreTrainedTokenizerFast` with the
        argument :obj:`return_special_tokens_mask=True`.
    """
-
    tokenizer: PreTrainedTokenizerBase
    mlm_probability: float = 0.15
-
    def __post_init__(self):
        if self.tokenizer.mask_token is None:
            raise ValueError(
                "This tokenizer does not have a mask token which is necessary for masked language modeling. "
                "You should pass `mlm=False` to train on causal language modeling instead."
            )
-
    def __call__(self, examples: List[Dict[str, np.ndarray]], pad_to_multiple_of: int) -> Dict[str, np.ndarray]:
        # Handle dict or lists with proper padding and conversion to tensor.
        batch = self.tokenizer.pad(examples, pad_to_multiple_of=pad_to_multiple_of, return_tensors=TensorType.NUMPY)
-
        # If special token mask has been preprocessed, pop it from the dict.
        special_tokens_mask = batch.pop("special_tokens_mask", None)
-
        batch["input_ids"], batch["labels"] = self.mask_tokens(
            batch["input_ids"], special_tokens_mask=special_tokens_mask
        )
        return batch
-
    def mask_tokens(
        self, inputs: np.ndarray, special_tokens_mask: Optional[np.ndarray]
    ) -> Tuple[np.ndarray, np.ndarray]:
@@ -233,57 +207,41 @@ class FlaxDataCollatorForLanguageModeling:
        # We sample a few tokens in each sequence for MLM training (with probability `self.mlm_probability`)
        probability_matrix = np.full(labels.shape, self.mlm_probability)
        special_tokens_mask = special_tokens_mask.astype("bool")
-
        probability_matrix[special_tokens_mask] = 0.0
        masked_indices = np.random.binomial(1, probability_matrix).astype("bool")
        labels[~masked_indices] = -100  # We only compute loss on masked tokens
-
        # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
        indices_replaced = np.random.binomial(1, np.full(labels.shape, 0.8)).astype("bool") & masked_indices
        inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token)
-
        # 10% of the time, we replace masked input tokens with random word
        indices_random = np.random.binomial(1, np.full(labels.shape, 0.5)).astype("bool")
        indices_random &= masked_indices & ~indices_replaced
-
        random_words = np.random.randint(self.tokenizer.vocab_size, size=labels.shape, dtype="i4")
        inputs[indices_random] = random_words[indices_random]
-
        # The rest of the time (10% of the time) we keep the masked input tokens unchanged
        return inputs, labels
-
-
 def generate_batch_splits(samples_idx: jnp.ndarray, batch_size: int) -> jnp.ndarray:
     num_samples = len(samples_idx)
     samples_to_remove = num_samples % batch_size
-
     if samples_to_remove != 0:
         samples_idx = samples_idx[:-samples_to_remove]
     sections_split = num_samples // batch_size
     batch_idx = np.split(samples_idx, sections_split)
     return batch_idx
-
-
 def write_train_metric(summary_writer, train_metrics, train_time, step):
     summary_writer.scalar("train_time", train_time, step)
-
     train_metrics = get_metrics(train_metrics)
     for key, vals in train_metrics.items():
         tag = f"train_{key}"
         for i, val in enumerate(vals):
             summary_writer.scalar(tag, val, step - len(vals) + i + 1)
-
-
 def write_eval_metric(summary_writer, eval_metrics, step):
     for metric_name, value in eval_metrics.items():
         summary_writer.scalar(f"eval_{metric_name}", value, step)
-
-
 if __name__ == "__main__":
     # See all possible arguments in src/transformers/training_args.py
     # or by passing the --help flag to this script.
     # We now keep distinct sets of args, for a cleaner separation of concerns.
-
     parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
     if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
         # If we pass only one argument to the script and it's the path to a json file,
@@ -291,7 +249,6 @@ if __name__ == "__main__":
         model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
     else:
         model_args, data_args, training_args = parser.parse_args_into_dataclasses()
-
     if (
         os.path.exists(training_args.output_dir)
         and os.listdir(training_args.output_dir)
@@ -302,33 +259,18 @@ if __name__ == "__main__":
             f"Output directory ({training_args.output_dir}) already exists and is not empty."
             "Use --overwrite_output_dir to overcome."
         )
-
     # Setup logging
     logging.basicConfig(
         format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
         level="NOTSET",
         datefmt="[%X]",
     )
-
     # Log on each process the small summary:
     logger = logging.getLogger(__name__)
-
     # Set the verbosity to info of the Transformers logger (on main process only):
     logger.info(f"Training/evaluation parameters {training_args}")
-
     # Set seed before initializing model.
     set_seed(training_args.seed)
-
-    # Handle the repository creation
-    if training_args.push_to_hub:
-        if training_args.hub_model_id is None:
-            repo_name = get_full_repo_name(
-                Path(training_args.output_dir).absolute().name, token=training_args.hub_token
-            )
-        else:
-            repo_name = training_args.hub_model_id
-        repo = Repository(training_args.output_dir, clone_from=repo_name)
-
     # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
     # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
     # (the dataset will be downloaded automatically from the datasets Hub).
@@ -341,7 +283,6 @@ if __name__ == "__main__":
     if data_args.dataset_name is not None:
         # Downloading and loading a dataset from the hub.
         datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir)
-
         if "validation" not in datasets.keys():
             datasets["validation"] = load_dataset(
                 data_args.dataset_name,
@@ -355,7 +296,6 @@ if __name__ == "__main__":
                 split=f"train[{data_args.validation_split_percentage}%:]",
                 cache_dir=model_args.cache_dir,
             )
-
     elif data_args.dataset_filepath is not None:
         # Loading a dataset from local file.
         datasets = load_from_disk(data_args.dataset_filepath)
@@ -363,7 +303,6 @@ if __name__ == "__main__":
         datasets = datasets.train_test_split(test_size=data_args.validation_split_percentage/100)
         datasets["validation"] = datasets["test"]
         del datasets["test"]
-
     else:
         data_files = {}
         if data_args.train_file is not None:
@@ -374,7 +313,6 @@ if __name__ == "__main__":
         if extension == "txt":
             extension = "text"
         datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
-
         if "validation" not in datasets.keys():
             datasets["validation"] = load_dataset(
                 extension,
@@ -390,9 +328,7 @@ if __name__ == "__main__":
             )
     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
     # https://huggingface.co/docs/datasets/loading_datasets.html.
-
     # Load pretrained model and tokenizer
-
     # Distributed training:
     # The .from_pretrained methods guarantee that only one local process can concurrently
     # download model & vocab.
@@ -403,7 +339,6 @@ if __name__ == "__main__":
     else:
         config = CONFIG_MAPPING[model_args.model_type]()
         logger.warning("You are instantiating a new config instance from scratch.")
-
     if model_args.tokenizer_name:
         tokenizer = AutoTokenizer.from_pretrained(
             model_args.tokenizer_name, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer
@@ -417,7 +352,6 @@ if __name__ == "__main__":
             "You are instantiating a new tokenizer from scratch. This is not supported by this script."
             "You can do it from another script, save it, and load it from here, using --tokenizer_name."
         )
-
     # Preprocessing the datasets.
     # First we tokenize all the texts.
     if training_args.do_train:
@@ -425,13 +359,10 @@ if __name__ == "__main__":
     else:
         column_names = datasets["validation"].column_names
     text_column_name = "text" if "text" in column_names else column_names[0]
-
     max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
-
     if data_args.line_by_line:
         # When using line_by_line, we just tokenize each nonempty line.
         padding = "max_length" if data_args.pad_to_max_length else False
-
         def tokenize_function(examples):
             # Remove empty lines
             examples = [line for line in examples if len(line) > 0 and not line.isspace()]
@@ -442,7 +373,6 @@ if __name__ == "__main__":
                 truncation=True,
                 max_length=max_seq_length,
             )
-
         if data_args.tokenized_dataset_filepath is not None:
             # Loading a tokenized dataset from local file.
             tokenized_datasets = load_from_disk(data_args.tokenized_dataset_filepath)
@@ -455,19 +385,16 @@ if __name__ == "__main__":
                 remove_columns=column_names,
                 load_from_cache_file=not data_args.overwrite_cache,
             )
-
     else:
         # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
         # We use `return_special_tokens_mask=True` because DataCollatorForLanguageModeling (see below) is more
         # efficient when it receives the `special_tokens_mask`.
         def tokenize_function(examples):
             return tokenizer(examples[text_column_name], return_special_tokens_mask=True)
-
         if data_args.tokenized_dataset_filepath is not None:
             # Loading a tokenized dataset from local file.
             tokenized_datasets = load_from_disk(data_args.tokenized_dataset_filepath)
         else:
-
             tokenized_datasets = datasets.map(
                 tokenize_function,
                 batched=True,
@@ -475,7 +402,6 @@ if __name__ == "__main__":
                 remove_columns=column_names,
                 load_from_cache_file=not data_args.overwrite_cache,
             )
-
     # Main data processing function that will concatenate all texts from our dataset and generate chunks of
     # max_seq_length.
     def group_texts(examples):
@@ -492,7 +418,6 @@ if __name__ == "__main__":
             for k, t in concatenated_examples.items()
         }
         return result
-
     # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a
     # remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value
     # might be slower to preprocess.
@@ -505,18 +430,23 @@ if __name__ == "__main__":
         num_proc=data_args.preprocessing_num_workers,
         load_from_cache_file=not data_args.overwrite_cache,
     )
-
+
     # save the tokenized dataset for future runs
     if data_args.save_tokenized_dataset_filepath is not None:
+        if data_args.dataset_filepath is not None:
+            try:
+                os.system(f"sudo rm {data_args.dataset_filepath}/train/cache*")
+                os.system(f"sudo rm {data_args.dataset_filepath}/validation/cache*")
+                os.system(f"sudo rm {data_args.dataset_filepath}/train/tmp*")
+                os.system(f"sudo rm {data_args.dataset_filepath}/validation/tmp*")
+            except:
+                pass
        tokenized_datasets.save_to_disk(data_args.save_tokenized_dataset_filepath)
-
-
    # Enable tensorboard only on the master node
    has_tensorboard = is_tensorboard_available()
    if has_tensorboard and jax.process_index() == 0:
        try:
            from flax.metrics.tensorboard import SummaryWriter
-
            summary_writer = SummaryWriter(log_dir=Path(training_args.output_dir))
        except ImportError as ie:
            has_tensorboard = False
@@ -528,15 +458,12 @@ if __name__ == "__main__":
            "Unable to display metrics through TensorBoard because the package is not installed: "
            "Please run pip install tensorboard to enable."
        )
-
    # Data collator
    # This one will take care of randomly masking the tokens.
    data_collator = FlaxDataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=data_args.mlm_probability)
-
    # Initialize our training
    rng = jax.random.PRNGKey(training_args.seed)
    dropout_rngs = jax.random.split(rng, jax.local_device_count())
-
    if model_args.model_name_or_path:
        model = FlaxAutoModelForMaskedLM.from_pretrained(
            model_args.model_name_or_path, config=config, seed=training_args.seed, dtype=getattr(jnp, model_args.dtype)
@@ -545,14 +472,11 @@ if __name__ == "__main__":
        model = FlaxAutoModelForMaskedLM.from_config(
            config, seed=training_args.seed, dtype=getattr(jnp, model_args.dtype)
        )
-
    # Store some constant
    num_epochs = int(training_args.num_train_epochs)
    train_batch_size = int(training_args.per_device_train_batch_size) * jax.device_count()
    eval_batch_size = int(training_args.per_device_eval_batch_size) * jax.device_count()
-
    num_train_steps = len(tokenized_datasets["train"]) // train_batch_size * num_epochs
-
    # Create learning rate schedule
    warmup_fn = optax.linear_schedule(
        init_value=0.0, end_value=training_args.learning_rate, transition_steps=training_args.warmup_steps
@@ -565,7 +489,6 @@ if __name__ == "__main__":
    linear_decay_lr_schedule_fn = optax.join_schedules(
        schedules=[warmup_fn, decay_fn], boundaries=[training_args.warmup_steps]
    )
-
    # We use Optax's "masking" functionality to not apply weight decay
    # to bias and LayerNorm scale parameters. decay_mask_fn returns a
    # mask boolean with the same structure as the parameters.
@@ -577,7 +500,6 @@ if __name__ == "__main__":
        flat_params = traverse_util.flatten_dict(params)
        flat_mask = {path: (path[-1] != "bias" and path[-2:] != ("LayerNorm", "scale")) for path in flat_params}
        return traverse_util.unflatten_dict(flat_mask)
-
    # create adam optimizer
    if training_args.adafactor:
        # We use the default parameters here to initialize adafactor,
@@ -594,153 +516,121 @@ if __name__ == "__main__":
        weight_decay=training_args.weight_decay,
        mask=decay_mask_fn,
    )
-
    # Setup train state
    state = train_state.TrainState.create(apply_fn=model.__call__, params=model.params, tx=optimizer)
-
    # Define gradient update step fn
    def train_step(state, batch, dropout_rng):
        dropout_rng, new_dropout_rng = jax.random.split(dropout_rng)
-
        def loss_fn(params):
            labels = batch.pop("labels")
-
            logits = state.apply_fn(**batch, params=params, dropout_rng=dropout_rng, train=True)[0]
-
            # compute loss, ignore padded input tokens
            label_mask = jnp.where(labels > 0, 1.0, 0.0)
            loss = optax.softmax_cross_entropy(logits, onehot(labels, logits.shape[-1])) * label_mask
-
            # take average
            loss = loss.sum() / label_mask.sum()
-
            return loss
-
        grad_fn = jax.value_and_grad(loss_fn)
        loss, grad = grad_fn(state.params)
        grad = jax.lax.pmean(grad, "batch")
        new_state = state.apply_gradients(grads=grad)
-
        metrics = jax.lax.pmean(
            {"loss": loss, "learning_rate": linear_decay_lr_schedule_fn(state.step)}, axis_name="batch"
        )
-
        return new_state, metrics, new_dropout_rng
-
    # Create parallel version of the train step
    p_train_step = jax.pmap(train_step, "batch", donate_argnums=(0,))
-
    # Define eval fn
    def eval_step(params, batch):
        labels = batch.pop("labels")
-
        logits = model(**batch, params=params, train=False)[0]
-
        # compute loss, ignore padded input tokens
        label_mask = jnp.where(labels > 0, 1.0, 0.0)
        loss = optax.softmax_cross_entropy(logits, onehot(labels, logits.shape[-1])) * label_mask
-
        # compute accuracy
        accuracy = jnp.equal(jnp.argmax(logits, axis=-1), labels) * label_mask
-
        # summarize metrics
        metrics = {"loss": loss.sum(), "accuracy": accuracy.sum(), "normalizer": label_mask.sum()}
        metrics = jax.lax.psum(metrics, axis_name="batch")
-
        return metrics
-
    p_eval_step = jax.pmap(eval_step, "batch", donate_argnums=(0,))
-
    # Replicate the train state on each device
    state = jax_utils.replicate(state)
-
    train_time = 0
    epochs = tqdm(range(num_epochs), desc=f"Epoch ... (1/{num_epochs})", position=0)
    for epoch in epochs:
        # ======================== Training ================================
        train_start = time.time()
        train_metrics = []
-
        # Create sampling rng
        rng, input_rng = jax.random.split(rng)
-
        # Generate an epoch by shuffling sampling indices from the train dataset
        num_train_samples = len(tokenized_datasets["train"])
        train_samples_idx = jax.random.permutation(input_rng, jnp.arange(num_train_samples))
        train_batch_idx = generate_batch_splits(train_samples_idx, train_batch_size)
-
        # Gather the indexes for creating the batch and do a training step
        for step, batch_idx in enumerate(tqdm(train_batch_idx, desc="Training...", position=1)):
            samples = [tokenized_datasets["train"][int(idx)] for idx in batch_idx]
            model_inputs = data_collator(samples, pad_to_multiple_of=16)
-
            # Model forward
            model_inputs = shard(model_inputs.data)
            state, train_metric, dropout_rngs = p_train_step(state, model_inputs, dropout_rngs)
            train_metrics.append(train_metric)
-
            cur_step = epoch * (num_train_samples // train_batch_size) + step
-
            if cur_step % training_args.logging_steps == 0 and cur_step > 0:
                # Save metrics
                train_metric = jax_utils.unreplicate(train_metric)
                train_time += time.time() - train_start
                if has_tensorboard and jax.process_index() == 0:
                    write_train_metric(summary_writer, train_metrics, train_time, cur_step)
-
                epochs.write(
                    f"Step... ({cur_step} | Loss: {train_metric['loss']}, Learning Rate: {train_metric['learning_rate']})"
                )
-
                train_metrics = []
-
            if cur_step % training_args.eval_steps == 0 and cur_step > 0:
                # ======================== Evaluating ==============================
                num_eval_samples = len(tokenized_datasets["validation"])
                eval_samples_idx = jnp.arange(num_eval_samples)
                eval_batch_idx = generate_batch_splits(eval_samples_idx, eval_batch_size)
-
                eval_metrics = []
                for i, batch_idx in enumerate(tqdm(eval_batch_idx, desc="Evaluating ...", position=2)):
                    samples = [tokenized_datasets["validation"][int(idx)] for idx in batch_idx]
                    model_inputs = data_collator(samples, pad_to_multiple_of=16)
-
                    # Model forward
                    model_inputs = shard(model_inputs.data)
                    metrics = p_eval_step(state.params, model_inputs)
                    eval_metrics.append(metrics)
-
                # normalize eval metrics
                eval_metrics = get_metrics(eval_metrics)
                eval_metrics = jax.tree_map(jnp.sum, eval_metrics)
                eval_normalizer = eval_metrics.pop("normalizer")
                eval_metrics = jax.tree_map(lambda x: x / eval_normalizer, eval_metrics)
-
                # Update progress bar
                epochs.desc = f"Step... ({cur_step} | Loss: {eval_metrics['loss']}, Acc: {eval_metrics['accuracy']})"
-
                # Save metrics
                if has_tensorboard and jax.process_index() == 0:
                    write_eval_metric(summary_writer, eval_metrics, cur_step)
-
            if cur_step % training_args.save_steps == 0 and cur_step > 0:
                # save checkpoint after each epoch and push checkpoint to the hub
                if jax.process_index() == 0:
                    params = jax.device_get(jax.tree_map(lambda x: x[0], state.params))
-                    model.save_pretrained(training_args.output_dir, params=params)
-                    tokenizer.save_pretrained(training_args.output_dir)
-                    if training_args.push_to_hub:
-                        repo.push_to_hub(commit_message=f"Saving weights and logs of step {cur_step}", blocking=False)
-
+                    model.save_pretrained(
+                        training_args.output_dir,
+                        params=params,
+                        push_to_hub=training_args.push_to_hub,
+                        commit_message=f"Saving weights and logs of step {cur_step}",
+                    )
+
        # save also at the end of epoch
        try:
            if jax.process_index() == 0:
                params = jax.device_get(jax.tree_map(lambda x: x[0], state.params))
-                model.save_pretrained(training_args.output_dir, params=params)
-                tokenizer.save_pretrained(training_args.output_dir)
-                if training_args.push_to_hub:
-                    repo.push_to_hub(commit_message=f"Saving weights and logs of epoch {epoch}", blocking=False)
+                model.save_pretrained(
+                    training_args.output_dir,
+                    params=params,
+                    push_to_hub=training_args.push_to_hub,
+                    commit_message=f"Saving weights and logs of epoch {epoch}",
+                )
        except:
            # push to hub fails the whole script if nothing new to commit
-            pass
-
+            pass
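For reference, here is a minimal, hedged sketch (not part of this commit) of the checkpoint-saving call the updated run_mlm_flax.py now relies on instead of the removed `huggingface_hub.Repository` workflow. The model id, output directory, and the `push` toggle below are illustrative placeholders.

```python
# Sketch of the save/push pattern used above: save_pretrained() writes
# flax_model.msgpack + config.json locally and can create the Hub commit itself.
# "roberta-base" and "./checkpoint-10000" are placeholder names, not from this repo.
from transformers import FlaxAutoModelForMaskedLM

model = FlaxAutoModelForMaskedLM.from_pretrained("roberta-base")

push = False  # set True after `huggingface-cli login` to also commit and push to the Hub
model.save_pretrained(
    "./checkpoint-10000",
    params=model.params,                 # host copy of the parameters to serialize
    push_to_hub=push,                    # replaces the explicit repo.push_to_hub(...) call
    commit_message="Saving weights and logs of step 10000",
)
```

With `push_to_hub=True`, `save_pretrained` handles the target repo and commit internally, which is why the explicit `Repository` setup and the `get_full_repo_name` import could be dropped from the script.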
start_train.sh CHANGED
@@ -17,7 +17,7 @@ python3 run_mlm_flax.py \
     --adam_beta2="0.98" \
     --adam_epsilon="1e-6" \
     --learning_rate="2e-4" \
-    --warmup_steps="25000" \
+    --warmup_steps="1500" \
     --overwrite_output_dir \
     --num_train_epochs="2" \
     --save_strategy="steps" \
@@ -27,5 +27,4 @@ python3 run_mlm_flax.py \
     --logging_steps="1000" \
     --dtype="bfloat16" \
     --push_to_hub \
-    --hub_model_id="Finnish-NLP/roberta-large-finnish-v2" \
-    --adafactor
+    --hub_model_id="Finnish-NLP/roberta-large-finnish-v2"
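For context on the `--warmup_steps` change, a small self-contained sketch of the linear warmup plus decay schedule that run_mlm_flax.py builds with Optax. The peak rate matches `--learning_rate="2e-4"` above; the total step count is a placeholder, and the decay leg is reconstructed from the upstream Flax MLM example since its definition falls outside the hunks shown.

```python
# Illustrative sketch of the script's learning-rate schedule (numbers are examples).
import optax

learning_rate = 2e-4        # --learning_rate from start_train.sh
warmup_steps = 1500         # --warmup_steps after this commit (was 25000)
num_train_steps = 100_000   # placeholder; the real value depends on the dataset size

warmup_fn = optax.linear_schedule(
    init_value=0.0, end_value=learning_rate, transition_steps=warmup_steps
)
# Assumed decay leg (as in the upstream example): linear decay to 0 over the remaining steps.
decay_fn = optax.linear_schedule(
    init_value=learning_rate, end_value=0.0, transition_steps=num_train_steps - warmup_steps
)
linear_decay_lr_schedule_fn = optax.join_schedules(
    schedules=[warmup_fn, decay_fn], boundaries=[warmup_steps]
)

# Inspect the schedule at a few steps.
for step in (0, 750, 1500, 50_000, 100_000):
    print(step, float(linear_decay_lr_schedule_fn(step)))
```

Shortening warmup from 25000 to 1500 steps only moves the boundary between the two segments; the peak learning rate itself is unchanged.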