dat committed
Commit f291f93
1 parent: f6e0bf7

Saving weights and logs at step 1252

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full set.
Files changed (50)
  1. Load data & train tokenizer.ipynb +0 -0
  2. checkpoint_60000 +3 -0
  3. events.out.tfevents.1626173264.t1v-n-f5c06ea1-w-0.340852.3.v2 +3 -0
  4. events.out.tfevents.1626174131.t1v-n-f5c06ea1-w-0.343920.3.v2 +3 -0
  5. events.out.tfevents.1626174670.t1v-n-f5c06ea1-w-0.346512.3.v2 +3 -0
  6. events.out.tfevents.1626175237.t1v-n-f5c06ea1-w-0.349243.3.v2 +3 -0
  7. events.out.tfevents.1626176074.t1v-n-f5c06ea1-w-0.351681.3.v2 +3 -0
  8. events.out.tfevents.1626180467.t1v-n-f5c06ea1-w-0.354027.3.v2 +3 -0
  9. events.out.tfevents.1626180750.t1v-n-f5c06ea1-w-0.355855.3.v2 +3 -0
  10. events.out.tfevents.1626181600.t1v-n-f5c06ea1-w-0.357816.3.v2 +3 -0
  11. events.out.tfevents.1626181889.t1v-n-f5c06ea1-w-0.360037.3.v2 +3 -0
  12. events.out.tfevents.1626182175.t1v-n-f5c06ea1-w-0.362298.3.v2 +3 -0
  13. events.out.tfevents.1626182874.t1v-n-f5c06ea1-w-0.365284.3.v2 +3 -0
  14. events.out.tfevents.1626184460.t1v-n-f5c06ea1-w-0.369028.3.v2 +3 -0
  15. events.out.tfevents.1626242600.t1v-n-f5c06ea1-w-0.491835.3.v2 +3 -0
  16. events.out.tfevents.1626285315.t1v-n-f5c06ea1-w-0.533662.3.v2 +3 -0
  17. events.out.tfevents.1626286793.t1v-n-f5c06ea1-w-0.547087.3.v2 +3 -0
  18. events.out.tfevents.1626287584.t1v-n-f5c06ea1-w-0.550207.3.v2 +3 -0
  19. events.out.tfevents.1626288936.t1v-n-f5c06ea1-w-0.553832.3.v2 +3 -0
  20. events.out.tfevents.1626290714.t1v-n-f5c06ea1-w-0.557554.3.v2 +3 -0
  21. events.out.tfevents.1626292080.t1v-n-f5c06ea1-w-0.560928.3.v2 +3 -0
  22. events.out.tfevents.1626292866.t1v-n-f5c06ea1-w-0.563390.3.v2 +3 -0
  23. events.out.tfevents.1626293250.t1v-n-f5c06ea1-w-0.565261.3.v2 +3 -0
  24. events.out.tfevents.1626294676.t1v-n-f5c06ea1-w-0.568447.3.v2 +3 -0
  25. events.out.tfevents.1626295212.t1v-n-f5c06ea1-w-0.570637.3.v2 +3 -0
  26. events.out.tfevents.1626296457.t1v-n-f5c06ea1-w-0.573688.3.v2 +3 -0
  27. events.out.tfevents.1626296630.t1v-n-f5c06ea1-w-0.575437.3.v2 +3 -0
  28. flax_model.msgpack +2 -2
  29. run.sh +11 -9
  30. run_mlm_flax.py +270 -218
  31. run_mlm_flax_no_accum.py +776 -0
  32. save_tokenized_data.py +484 -0
  33. train_tokenizer.py +43 -0
  34. wandb/debug-internal.log +1 -1
  35. wandb/debug.log +1 -1
  36. wandb/latest-run +1 -1
  37. wandb/run-20210713_010630-14xhiyhf/files/output.log +9 -0
  38. wandb/run-20210713_010630-14xhiyhf/logs/debug-internal.log +24 -0
  39. wandb/run-20210713_010630-14xhiyhf/logs/debug.log +2 -0
  40. wandb/run-20210713_010630-14xhiyhf/run-14xhiyhf.wandb +0 -0
  41. wandb/run-20210713_104745-1rl2j7or/files/config.yaml +304 -0
  42. wandb/run-20210713_104745-1rl2j7or/files/output.log +57 -0
  43. wandb/run-20210713_104745-1rl2j7or/files/requirements.txt +92 -0
  44. wandb/run-20210713_104745-1rl2j7or/files/wandb-metadata.json +44 -0
  45. wandb/run-20210713_104745-1rl2j7or/files/wandb-summary.json +1 -0
  46. wandb/run-20210713_104745-1rl2j7or/logs/debug-internal.log +181 -0
  47. wandb/run-20210713_104745-1rl2j7or/logs/debug.log +27 -0
  48. wandb/run-20210713_104745-1rl2j7or/run-1rl2j7or.wandb +0 -0
  49. wandb/run-20210713_110212-594z6oo0/files/config.yaml +307 -0
  50. wandb/run-20210713_110212-594z6oo0/files/output.log +39 -0
Load data & train tokenizer.ipynb CHANGED
The diff for this file is too large to render. See raw diff
checkpoint_60000 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:73e6d7222b2cee297be0891db385dcce6e0cbff6ec3697c08118513955f8aaf7
+ size 769729450
events.out.tfevents.1626173264.t1v-n-f5c06ea1-w-0.340852.3.v2 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:73fdfc3eb9d8111b1e3460227717a3942adfe9263bca08b7fd2bfab9af98d9a1
+ size 38186
events.out.tfevents.1626174131.t1v-n-f5c06ea1-w-0.343920.3.v2 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dfc6f0b5b354bd4d8d13834613ece71ac9d948186313bc3fde5e2e132a1c9cab
+ size 40
events.out.tfevents.1626174670.t1v-n-f5c06ea1-w-0.346512.3.v2 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f74cf77c0a672ad1201614ba6642a4f3a27b9cf021d0e88eb362c7f38ee86304
+ size 40
events.out.tfevents.1626175237.t1v-n-f5c06ea1-w-0.349243.3.v2 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:be5c2acf821fd2ce776ff5e434706cb933a0fa323f0bb1a82dadd832f1f589d4
+ size 40
events.out.tfevents.1626176074.t1v-n-f5c06ea1-w-0.351681.3.v2 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b085d5029d052defe00b26c54b6357e9d05cbc5ad38cdd2f12537ed0b90008d2
+ size 441341
events.out.tfevents.1626180467.t1v-n-f5c06ea1-w-0.354027.3.v2 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:973eec9b2b17e54f3ee35dc0c4b85a4a3ecf5488cb59f5619d7c635641bfe7b6
+ size 40
events.out.tfevents.1626180750.t1v-n-f5c06ea1-w-0.355855.3.v2 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:013fc500b7fdd46262ee2b2ed5a3624249adef426d0b134944080ccf90d363ed
+ size 40
events.out.tfevents.1626181600.t1v-n-f5c06ea1-w-0.357816.3.v2 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a3d4a519b8f1c293258e292768822980b487ef0e02bbfe9d6a3132b8c2fdd791
+ size 40
events.out.tfevents.1626181889.t1v-n-f5c06ea1-w-0.360037.3.v2 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7c1ed9142ba98f2f7197e2a44361331a8c112af5dba98d7fc9f0bcab6228ae8c
+ size 40
events.out.tfevents.1626182175.t1v-n-f5c06ea1-w-0.362298.3.v2 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:29cc2c143c306c4619802094513459dbb71c4730d3cdfb879e7224923ddfe7ea
+ size 40
events.out.tfevents.1626182874.t1v-n-f5c06ea1-w-0.365284.3.v2 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:24aa4302db5d02121389fc7f8944025588034aedd21f772c2b71224e3a0b0d13
+ size 220634
events.out.tfevents.1626184460.t1v-n-f5c06ea1-w-0.369028.3.v2 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6e5631bf443386a4e37d77053e55ba4517153d5f6d7f77b616258d9c78e6901f
+ size 367772
events.out.tfevents.1626242600.t1v-n-f5c06ea1-w-0.491835.3.v2 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1f94f6c2d80b0e0d6247997634649101caefa3ad8ab4f408b529ad38f86c8770
+ size 40
events.out.tfevents.1626285315.t1v-n-f5c06ea1-w-0.533662.3.v2 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:29b681f16c441caf85381c9def58d19f4479a2460146d2cfb68991f8327f01fe
+ size 40
events.out.tfevents.1626286793.t1v-n-f5c06ea1-w-0.547087.3.v2 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:53d63b11450875138751afac48c611f4da76fadc0affb0ec98896b35dbad9728
+ size 40
events.out.tfevents.1626287584.t1v-n-f5c06ea1-w-0.550207.3.v2 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:62cc6dc4bf215d99f8685629bf632f82d65fc7f1127d876ded332b31b5432064
+ size 40
events.out.tfevents.1626288936.t1v-n-f5c06ea1-w-0.553832.3.v2 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1fccf6070edac76c190b8bb8de4e37b889dd1b18835777203f9d16ac658aaf71
+ size 40
events.out.tfevents.1626290714.t1v-n-f5c06ea1-w-0.557554.3.v2 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d46028802a38f383ce27081e90ff848e3da863ac08c341f101eed1b20a39556c
+ size 40
events.out.tfevents.1626292080.t1v-n-f5c06ea1-w-0.560928.3.v2 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b2e89d0090ae1228c609a140c2a20fbdfb208480a0dd16aced968756947a93f0
+ size 147065
events.out.tfevents.1626292866.t1v-n-f5c06ea1-w-0.563390.3.v2 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2b5607707732c41fb3bac9b56702cf2a006ba526d98638e0352ba54e809c6eff
+ size 40
events.out.tfevents.1626293250.t1v-n-f5c06ea1-w-0.565261.3.v2 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:83bed69057844c7af14e165d87c9678d28135297ab5bd374d1e0d80ebd31966f
+ size 221057
events.out.tfevents.1626294676.t1v-n-f5c06ea1-w-0.568447.3.v2 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:050b6dc69ea5a9946fc01c76d67ea00913117399f1a37e0f24db39f39c52e76f
+ size 73565
events.out.tfevents.1626295212.t1v-n-f5c06ea1-w-0.570637.3.v2 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2818b40b384ff7f5a57fe1c4994ebbd02140f7221904f527cfc0a9a115334a79
+ size 184532
events.out.tfevents.1626296457.t1v-n-f5c06ea1-w-0.573688.3.v2 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:df3d8a6aa5b0177a3c337963bad77cc5cea9ed722032941dbac474d03b5a3261
+ size 40
events.out.tfevents.1626296630.t1v-n-f5c06ea1-w-0.575437.3.v2 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:932b70a150d991f6939f853c7b54516d5309f2d6c19761fa96a50999bf2199e7
+ size 147993
flax_model.msgpack CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:19dddbba6ad2a0aa9c5c22f1b9750b90fcd0b7c8f3007cbd6af9a17d447fa417
- size 256576390
+ oid sha256:422812fccdda54c02543ac5e994b33b54e510e0474439fbe9360d5190787d38e
+ size 510090043
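The added files above are Git LFS pointers, so only the oid and size are stored in the commit itself; the updated flax_model.msgpack is roughly twice as large as before (about 510 MB versus 257 MB). A minimal sketch of inspecting the serialized Flax weights locally, assuming the repository has been cloned and `git lfs pull` has materialized the pointer files (the local path "./" is a placeholder, not something stated in this commit):

```python
# Minimal sketch: load the LFS-backed Flax weights from a local clone and count parameters.
# Assumes `git clone` + `git lfs pull` have already been run; "./" is a placeholder repo path.
import jax
from transformers import FlaxAutoModelForMaskedLM

model = FlaxAutoModelForMaskedLM.from_pretrained("./")  # reads config.json + flax_model.msgpack
num_params = sum(p.size for p in jax.tree_util.tree_leaves(model.params))
print(f"{num_params / 1e6:.1f}M parameters")
```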
run.sh CHANGED
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash

-export TOKENIZERS_PARALLELISM=0
+#export TOKENIZERS_PARALLELISM=0

 python ./run_mlm_flax.py \
     --push_to_hub \
@@ -14,18 +14,20 @@ python ./run_mlm_flax.py \
     --overwrite_output_dir \
     --adam_beta1="0.9" \
     --adam_beta2="0.98" \
-    --logging_steps="500" \
-    --eval_steps="92768" \
-    --num_train_epochs="5" \
-    --preprocessing_num_workers="64" \
-    --save_steps="20000" \
-    --learning_rate="5e-5" \
+    --logging_steps="250" \
+    --eval_steps="500" \
+    --num_train_epochs="3" \
+    --preprocessing_num_workers="96" \
+    --save_steps="1250" \
+    --learning_rate="1e-4" \
     --per_device_train_batch_size="2" \
     --per_device_eval_batch_size="2" \
     --save_total_limit="5"\
-    --gradient_accumulation_steps="2" \
+    --max_eval_samples="500"\
+    --overwrite_cache False \
+    --gradient_accumulation_steps="4" \
+    #--resume_from_checkpoint="./"\
     #--adafactor \
     #--dtype="bfloat16" \
-    #--resume_from_checkpoint="./"\

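The new flags interact: the effective (global) batch size is the per-device batch size times the device count times the gradient accumulation steps, which is exactly how the updated run_mlm_flax.py below computes train_batch_size. A rough sketch of that arithmetic, assuming a single TPU v3-8 host with 8 local devices (the device count is an assumption, not stated in the diff):

```python
# Rough arithmetic for the new run.sh settings.
per_device_train_batch_size = 2   # from run.sh
gradient_accumulation_steps = 4   # from run.sh
device_count = 8                  # assumed, e.g. jax.device_count() on a TPU v3-8

# Mirrors the train_batch_size computation added to run_mlm_flax.py below.
global_batch_size = per_device_train_batch_size * device_count * gradient_accumulation_steps
print(global_batch_size)  # 64 sequences per optimizer step under these assumptions
```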
run_mlm_flax.py CHANGED
@@ -20,20 +20,18 @@ text file or a dataset.
 Here is the full list of checkpoints on the hub that can be fine-tuned by this script:
 https://huggingface.co/models?filter=masked-lm
 """
-import shutil
 import logging
 import os
 import sys
 import time
 from dataclasses import dataclass, field
-from ast import Str

 # You can also adapt this script on your own masked language modeling task. Pointers for this are left as comments.
 from pathlib import Path
 from typing import Dict, List, Optional, Tuple

 import numpy as np
-from datasets import load_dataset
+from datasets import load_dataset, DatasetDict
 from tqdm import tqdm

 import flax
@@ -56,13 +54,12 @@ from transformers import (
     is_tensorboard_available,
     set_seed,
 )
-from transformers.testing_utils import CaptureLogger
-from flax.serialization import to_bytes, from_bytes
-from importlib.util import find_spec
+import json
 from flax.training import checkpoints
 from flax.jax_utils import unreplicate
 from flax.training.checkpoints import save_checkpoint, restore_checkpoint
-import json
+from importlib.util import find_spec
+

 MODEL_CONFIG_CLASSES = list(FLAX_MODEL_FOR_MASKED_LM_MAPPING.keys())
 MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
@@ -104,8 +101,10 @@ class ModelArguments:
             "help": "Floating-point format in which the model weights should be initialized and trained. Choose one of `[float32, float16, bfloat16]`."
         },
     )
-
-
+    save_optimizer: Optional[bool] = field(
+        default=True,
+        metadata={"help": "Whether to store full train state including optimizer."},
+    )


 @dataclass
@@ -120,11 +119,6 @@ class DataTrainingArguments:
     dataset_config_name: Optional[str] = field(
         default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
     )
-    train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."})
-    validation_file: Optional[str] = field(
-        default=None,
-        metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."},
-    )
     train_ref_file: Optional[str] = field(
         default=None,
         metadata={"help": "An optional input train ref data file for whole word masking in Chinese."},
@@ -136,6 +130,9 @@ class DataTrainingArguments:
     overwrite_cache: bool = field(
         default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
     )
+
+
+
     validation_split_percentage: Optional[int] = field(
         default=5,
         metadata={
@@ -167,6 +164,17 @@ class DataTrainingArguments:
         default=False,
         metadata={"help": "Whether distinct lines of text in the dataset are to be handled as distinct sequences."},
     )
+    max_eval_samples: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
+            "value if set."
+        },
+    )
+
+
+


 @flax.struct.dataclass
@@ -266,33 +274,73 @@ def write_eval_metric(summary_writer, eval_metrics, step):
     for metric_name, value in eval_metrics.items():
         summary_writer.scalar(f"eval_{metric_name}", value, step)

-def mb_item(x):
-    return x.item() if hasattr(x, "item") else x
-
-#checkpoint functions
-
-
-
-
-
-def rotate_checkpoints(ckpt_dir: str, save_total_limit: int):
-    "Removes older checkpoints so that `save_total_limit` checkpoints are kept"
-    # TODO: what to remove is decided using step number only, we might want to improve that
-    ckpts = [str(x) for x in Path(ckpt_dir).glob("ckpt-*")]
-    # sort checkpoints by step
-    ckpts_sorted = sorted(ckpts, key=lambda x: int(x.split('-')[-1]))
-    ckpts_to_delete = ckpts_sorted[:-save_total_limit]
-    for ckpt in ckpts_to_delete:
-        logger.info(f"Deleting older checkpoint [{ckpt}] due to save_total_limit ({save_total_limit})")
-        shutil.rmtree(ckpt)
+
+def _zeros_tree_like(inp_tree):
+    return jax.tree_map(jnp.zeros_like, inp_tree)
+
+def fake_update(state):
+    fake_updates = _zeros_tree_like(state.params)
+    _, new_inner_opt_state = state.tx.inner_opt.update(fake_updates, state.opt_state.inner_opt_state, state.params)
+    opt_state = state.opt_state
+    new_opt_state = optax.MultiStepsState(mini_step=opt_state.mini_step,
+                                          gradient_step=opt_state.gradient_step,
+                                          inner_opt_state=new_inner_opt_state,
+                                          acc_grads=opt_state.acc_grads)
+    return state.replace(opt_state=new_opt_state)
+
+def reinstantiate_states(opt_state):
+    new_state = []
+    for state in opt_state:
+        cls = getattr(optax, type(state).__name__)
+        new_state.append(cls(**{k:getattr(state, k) for k in state._fields}))
+    return new_state
+
+def restore_model_checkpoint(save_dir, state):
+    logger.info(f"RESTORING CHECKPOINT FROM {save_dir}...")
+    with open(os.path.join(save_dir, "flax_model.msgpack"), "rb") as f:
+        params = from_bytes(state.params, f.read())
+
+    with open(os.path.join(save_dir, "opt_state.msgpack"), "rb") as f:
+        opt_state = from_bytes(state.opt_state, f.read())
+
+    with open(os.path.join(save_dir, "training_state.json"), "r") as f:
+        training_state = json.load(f)
+    step = training_state["step"]
+
+    logger.info("checkpoint restored")
+    # reinstantiate inner opt state to avoid type conflict
+    if hasattr(opt_state, "inner_opt_state"):
+        print("restoring state of multisteps optimizer")
+        inner_opt_state = reinstantiate_states(opt_state.inner_opt_state)
+        ms_state_dict = {k:getattr(state.opt_state, k) for k in state.opt_state._fields}
+        ms_state_dict["inner_opt_state"] = inner_opt_state
+        opt_state = optax.MultiStepsState(**ms_state_dict)
+
+    return state.replace(step=step, params=params, opt_state=opt_state)
+
+def save_model_checkpoint(model, save_dir, state, with_opt:bool=True, push_to_hub:bool=False):
+    """
+    If `push_to_hub` is True, will save to `save_dir`. Otherwise will save to `save_dir/ckpt-{step}`.
+    """
+    state = jax_utils.unreplicate(state)
+    logger.info(f"SAVING CHECKPOINT IN {save_dir}...")
+    if not push_to_hub:
+        save_dir = f"{save_dir}/ckpt-{mb_item(state.step)-1}"
+    model.save_pretrained(
+        save_dir,
+        params=state.params,
+        push_to_hub=push_to_hub,
+        commit_message=f"Saving weights and logs at step {mb_item(state.step)-1}",
+    )
+    if with_opt:
+        with open(os.path.join(save_dir, "opt_state.msgpack"), "wb") as f:
+            f.write(to_bytes(state.opt_state))
+        with open(os.path.join(save_dir, "training_state.json"), "w") as f:
+            json.dump({"step": state.step.item()}, f)
+    logger.info("checkpoint saved")

-
-class TrainState(train_state.TrainState):
-    grad_accum: jnp.ndarray

-
 if __name__ == "__main__":
     # See all possible arguments in src/transformers/training_args.py
     # or by passing the --help flag to this script.
@@ -360,52 +408,70 @@ if __name__ == "__main__":
             cache_dir=model_args.cache_dir,
         )
     else:
-        #data_files = {}
-        #if data_args.train_file is not None:
-        #    data_files["train"] = data_args.train_file
-        #if data_args.validation_file is not None:
-        #    data_files["validation"] = data_args.validation_file
-        #extension = data_args.train_file.split(".")[-1]
-        #if extension == "txt":
-        #    extension = "text"
-        #datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
-
-        #data_dir = "/home/yeb"
-        # data_dir = "/home/yeb/Developer/data"
+        import glob
+        import random
         data_files = []
-        def train_val_files():
-            import glob
-            import random
-            SEED = 42
-            def add_jsonlines_dir(path):
-                global data_files
-                data_files += glob.glob(f"{path}/*.gz")
-
-            add_jsonlines_dir("/home/dat/subset_c4_cleannl")
-            add_jsonlines_dir("/data/oscar_nl_cleaned")
-            add_jsonlines_dir("/data/nrc_cleaned_idtextfmt")
-            add_jsonlines_dir("/data/nu_cleaned_idtextfmt")
-            random.Random(SEED).shuffle(data_files)
-            total = len(data_files)
-            val_size = int(0.05 * total)
-            train_size = total - val_size
-            print(f"95%: {train_size}")
-            train = data_files[:train_size]
-            val = data_files[train_size:]
-            print(f"Got {len(train)} training files and {len(val)} validation files")
-            assert list(set(train) & set(val)) == [], "Train overlaps with test"
-            return train, val
-        train, val = train_val_files()
-        datasets = load_dataset('json', data_files={'train': train, 'validation': val})
-        datasets["train"] = datasets["train"].select(range(int(0.8*len(datasets["train"]))))
-        datasets["validation"] = datasets["validation"].select(range(int(0.8*len(datasets["validation"]))))
-        #datasets["train"] = datasets["train"].select(range(10000))
-        #datasets["validation"] = datasets["validation"].select(range(10000))
+        def add_jsonlines_dir(path, filespec):
+            global data_files
+            data_files += glob.glob(f"{path}/{filespec}")
+            data_files = list(set(data_files))
+            print(f"Number of files {len(data_files)} after adding {path} glob {filespec}")
+        add_jsonlines_dir(f"/data/c4_cleaned2", "*.gz")
+        add_jsonlines_dir(f"/data/nrc_uniq_cleaned_20210223", "*.gz")
+        add_jsonlines_dir(f"/data/nu_uniq_cleaned_20210225", "*.gz")
+        random.Random(42).shuffle(data_files)
+        total = len(data_files)
+        print(total)
+        perc = 0.05
+        val_size = int(perc * total)
+        train_size = total - val_size
+        train = data_files[:train_size]
+        val = data_files[train_size:]
+        print(f"Got {len(train)} training files and {perc * 100} % {len(val)} validation files")
+        assert list(set(train) & set(val)) == [], "Train overlaps with test"
+        load_grouped = True
+        if not load_grouped:
+            datasets = load_dataset('json', data_files={'train': train, 'validation': val})
+
+        #from datasets import Dataset
+
+        #dataset = Dataset.from_file("/home/dat/.cache/huggingface/datasets/json/default-9add402b38836560/0.0.0/f92a4de297ac644ad9781979b79064b0e222b3af766f8ea3bee32390dca23723/json-train.arrow")
+        #dataset = Dataset.from_file("/home/dat/.cache/huggingface/datasets/json/default-9add402b38836560/0.0.0/f92a4de297ac644ad9781979b79064b0e222b3af766f8ea3bee32390dca23723/json-validation.arrow")
+
+        def mb_item(x):
+            return x.item() if hasattr(x, "item") else x
+
+        def save_model_checkpoint(model, save_dir, state, with_opt:bool=True, push_to_hub:bool=False):
+            """
+            If `push_to_hub` is True, will save to `save_dir`. Otherwise will save to `save_dir/ckpt-{step}`.
+            """
+            state = jax_utils.unreplicate(state)
+            logger.info(f"SAVING CHECKPOINT IN {save_dir}...")
+            if not push_to_hub:
+                save_dir = f"{save_dir}/ckpt-{mb_item(state.step)-1}"
+            model.save_pretrained(
+                save_dir,
+                params=state.params,
+                push_to_hub=push_to_hub,
+                commit_message=f"Saving weights and logs at step {mb_item(state.step)-1}",
+            )
+            if with_opt:
+                with open(os.path.join(save_dir, "opt_state.msgpack"), "wb") as f:
+                    f.write(to_bytes(state.opt_state))
+                with open(os.path.join(save_dir, "training_state.json"), "w") as f:
+                    json.dump({"step": state.step.item()}, f)
+            logger.info("checkpoint saved")
+
+    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
+    # https://huggingface.co/docs/datasets/loading_datasets.html.
+
+    # Load pretrained model and tokenizer
+
+    # Distributed training:
+    # The .from_pretrained methods guarantee that only one local process can concurrently
+    # download model & vocab.
     if model_args.config_name:
         config = AutoConfig.from_pretrained(model_args.config_name, cache_dir=model_args.cache_dir)
     elif model_args.model_name_or_path:
@@ -430,90 +496,97 @@ if __name__ == "__main__":

     # Preprocessing the datasets.
     # First we tokenize all the texts.
-    if training_args.do_train:
-        column_names = datasets["train"].column_names
-    else:
-        column_names = datasets["validation"].column_names
-    text_column_name = "text" if "text" in column_names else column_names[0]
-
-    max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
-
-    if data_args.line_by_line:
-        # When using line_by_line, we just tokenize each nonempty line.
-        padding = "max_length" if data_args.pad_to_max_length else False
-
-        def tokenize_function(examples):
-            # Remove empty lines
-            examples = [line for line in examples if len(line) > 0 and not line.isspace()]
-            return tokenizer(
-                examples,
-                return_special_tokens_mask=True,
-                padding=padding,
-                truncation=True,
-                max_length=max_seq_length,
-            )
-
-        tokenized_datasets = datasets.map(
-            tokenize_function,
-            input_columns=[text_column_name],
-            batched=True,
-            num_proc=data_args.preprocessing_num_workers,
-            remove_columns=column_names,
-            load_from_cache_file=not data_args.overwrite_cache,
-        )
-
-    else:
-        # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
-        # We use `return_special_tokens_mask=True` because DataCollatorForLanguageModeling (see below) is more
-        # efficient when it receives the `special_tokens_mask`.
-        def tokenize_function(examples):
-            return tokenizer(examples[text_column_name], return_special_tokens_mask=True)
-
-        tokenized_datasets = datasets.map(
-            tokenize_function,
-            batched=True,
-            num_proc=data_args.preprocessing_num_workers,
-            remove_columns=column_names,
-            load_from_cache_file=not data_args.overwrite_cache,
-        )
-
-    # Main data processing function that will concatenate all texts from our dataset and generate chunks of
-    # max_seq_length.
-    def group_texts(examples):
-        # Concatenate all texts.
-        concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
-        total_length = len(concatenated_examples[list(examples.keys())[0]])
-        # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
-        # customize this part to your needs.
-        if total_length >= max_seq_length:
-            total_length = (total_length // max_seq_length) * max_seq_length
-        # Split by chunks of max_len.
-        result = {
-            k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)]
-            for k, t in concatenated_examples.items()
-        }
-        return result
-
-    # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a
-    # remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value
-    # might be slower to preprocess.
-    #
-    # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
-    # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
-    lm_datasets = tokenized_datasets.map(
-        group_texts,
-        batched=True,
-        batch_size=100,
-        num_proc=data_args.preprocessing_num_workers,
-        load_from_cache_file=not data_args.overwrite_cache,
-    )
-    train_dataset = lm_datasets["train"]
-    eval_dataset = lm_datasets["validation"]
-
+    if load_grouped:
+        logger.info("Loading tokenized and grouped dataset")
+        tokenized_datasets = DatasetDict.load_from_disk("/data/tokenized_data")
+        logger.info("Setting max validation examples to ")
+        print(f"Number of validation examples {data_args.max_eval_samples}")
+        tokenized_datasets["train"]= tokenized_datasets["train"].select(range(20000))
+        if data_args.max_eval_samples is not None:
+            tokenized_datasets["validation"] = tokenized_datasets["validation"].select(range(data_args.max_eval_samples))
+    else:
+        if training_args.do_train:
+            column_names = datasets["train"].column_names
+        else:
+            column_names = datasets["validation"].column_names
+        text_column_name = "text" if "text" in column_names else column_names[0]
+
+        max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
+
+        if data_args.line_by_line:
+            # When using line_by_line, we just tokenize each nonempty line.
+            padding = "max_length" if data_args.pad_to_max_length else False
+
+            def tokenize_function(examples):
+                # Remove empty lines
+                examples = [line for line in examples if len(line) > 0 and not line.isspace()]
+                return tokenizer(
+                    examples,
+                    return_special_tokens_mask=True,
+                    padding=padding,
+                    truncation=True,
+                    max_length=max_seq_length,
+                )
+
+            tokenized_datasets = datasets.map(
+                tokenize_function,
+                input_columns=[text_column_name],
+                batched=True,
+                num_proc=data_args.preprocessing_num_workers,
+                remove_columns=column_names,
+                load_from_cache_file=not data_args.overwrite_cache,
+            )
+
+        else:
+            # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
+            # We use `return_special_tokens_mask=True` because DataCollatorForLanguageModeling (see below) is more
+            # efficient when it receives the `special_tokens_mask`.
+            def tokenize_function(examples):
+                return tokenizer(examples[text_column_name], return_special_tokens_mask=True)
+
+            tokenized_datasets = datasets.map(
+                tokenize_function,
+                batched=True,
+                num_proc=data_args.preprocessing_num_workers,
+                remove_columns=column_names,
+                load_from_cache_file=not data_args.overwrite_cache,
+            )
+
+        # Main data processing function that will concatenate all texts from our dataset and generate chunks of
+        # max_seq_length.
+        def group_texts(examples):
+            # Concatenate all texts.
+            concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+            total_length = len(concatenated_examples[list(examples.keys())[0]])
+            # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
+            # customize this part to your needs.
+            if total_length >= max_seq_length:
+                total_length = (total_length // max_seq_length) * max_seq_length
+            # Split by chunks of max_len.
+            result = {
+                k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)]
+                for k, t in concatenated_examples.items()
+            }
+            return result
+
+        # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a
+        # remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value
+        # might be slower to preprocess.
+        #
+        # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
+        # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
+        tokenized_datasets = tokenized_datasets.map(
+            group_texts,
+            batched=True,
+            num_proc=data_args.preprocessing_num_workers,
+            load_from_cache_file=not data_args.overwrite_cache,
+        )
+
+        #tokenized_datasets.save_to_disk("/data/tokenized_data")
+        #print ("tokenized_datasets saved to disk")
+
     # Enable tensorboard only on the master node
     has_tensorboard = is_tensorboard_available()
     if has_tensorboard and jax.process_index() == 0:
@@ -531,7 +604,6 @@ if __name__ == "__main__":
             "Unable to display metrics through TensorBoard because the package is not installed: "
             "Please run pip install tensorboard to enable."
         )
-    # enable wandb tracking
    has_wandb = find_spec("wandb") is not None
    if jax.process_index() == 0 and has_wandb and ("wandb" in training_args.report_to):
        try:
@@ -547,7 +619,6 @@ if __name__ == "__main__":
        except ImportError as e:
            print(e)
            has_wandb = False
-
    # Data collator
    # This one will take care of randomly masking the tokens.
    data_collator = FlaxDataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=data_args.mlm_probability)
@@ -567,10 +638,10 @@ if __name__ == "__main__":

    # Store some constant
    num_epochs = int(training_args.num_train_epochs)
-   train_batch_size = int(training_args.per_device_train_batch_size) * jax.device_count()
+   train_batch_size = int(training_args.per_device_train_batch_size) * jax.device_count() * training_args.gradient_accumulation_steps
    eval_batch_size = int(training_args.per_device_eval_batch_size) * jax.device_count()

-   num_train_steps = len(train_dataset) // train_batch_size * num_epochs
+   num_train_steps = len(tokenized_datasets["train"]) // train_batch_size * num_epochs

    # Create learning rate schedule
    warmup_fn = optax.linear_schedule(
@@ -605,6 +676,7 @@ if __name__ == "__main__":
            learning_rate=linear_decay_lr_schedule_fn,
        )
    else:
+       from optax import clip_by_global_norm
        optimizer = optax.adamw(
            learning_rate=linear_decay_lr_schedule_fn,
            b1=training_args.adam_beta1,
@@ -613,22 +685,26 @@ if __name__ == "__main__":
            weight_decay=training_args.weight_decay,
            mask=decay_mask_fn,
        )
+       optimizer = optax.chain(
+           optax.clip_by_global_norm(1.),
+           optimizer
+       )

-   #if training_args.gradient_accumulation_steps > 1:
-   #    optimizer = optax.MultiSteps(optimizer, training_args.gradient_accumulation_steps)
-   #grad_accum_steps = training_args.gradient_accumulation_steps
+   if training_args.gradient_accumulation_steps > 1:
+       optimizer = optax.MultiSteps(optimizer, training_args.gradient_accumulation_steps)
+   grad_accum_steps = training_args.gradient_accumulation_steps

    # Setup train state
-
-
-   state = TrainState.create(apply_fn=model.__call__, params=model.params, tx=optimizer,grad_accum=jax.tree_map(jnp.zeros_like, model.params))
-
+   state = train_state.TrainState.create(apply_fn=model.__call__, params=model.params, tx=optimizer)
+
    if training_args.resume_from_checkpoint:
-       state = restore_checkpoint(training_args.resume_from_checkpoint, state)
-       resume_step = mb_item(state.step.item())
+       state = restore_model_checkpoint(training_args.resume_from_checkpoint, state)
+       resume_step = mb_item(state.step)
+       if training_args.adafactor:
+           state = fake_update(state)
    else:
        resume_step = 0
-

    # Define gradient update step fn
    def train_step(state, batch, dropout_rng):
@@ -646,30 +722,17 @@ if __name__ == "__main__":
            # take average
            loss = loss.sum() / label_mask.sum()

-           return loss / training_args.gradient_accumulation_steps
+           return loss

        grad_fn = jax.value_and_grad(loss_fn)
-       loss, grads = grad_fn(state.params)
-       grad_accum = jax.tree_multimap(lambda x, y: x + y, grads, state.grad_accum)
-
-       def update_fn():
-           grads = jax.tree_map(lambda x: x / training_args.gradient_accumulation_steps, grad_accum)
-           grads = jax.lax.pmean(grad_accum, "batch")
-           new_state = state.apply_gradients(grads=grads,grad_accum=jax.tree_map(jnp.zeros_like, grads))
-           return new_state
-
-       new_state = jax.lax.cond(
-           state.step % training_args.gradient_accumulation_steps == 0,
-           lambda _: update_fn(),
-           lambda _: state.replace(grad_accum=grad_accum, step=state.step + 1),
-           None,
-       )
-
+       loss, grad = grad_fn(state.params)
+       grad = jax.lax.pmean(grad, "batch")
+       new_state = state.apply_gradients(grads=grad)
+
        metrics = jax.lax.pmean(
-           {"loss": loss, "learning_rate": linear_decay_lr_schedule_fn(state.step)}, axis_name="batch" #
+           {"loss": loss, "learning_rate": linear_decay_lr_schedule_fn(state.step // grad_accum_steps)}, axis_name="batch"
        )

-       #return new_state.replace(new_dropout_rng=new_dropout_rng), metrics
        return new_state, metrics, new_dropout_rng

    # Create parallel version of the train step
@@ -700,7 +763,10 @@ if __name__ == "__main__":
    state = jax_utils.replicate(state)

    train_time = 0
-   epochs = tqdm(range(num_epochs), desc=f"Epoch ... (1/{num_epochs})", position=0)
+   steps_per_epoch = len(tokenized_datasets["train"]) // train_batch_size
+   resume_epoch = resume_step // (steps_per_epoch * grad_accum_steps)
+   epochs = tqdm(range(num_epochs), desc=f"Epoch ... ({resume_epoch+1}/{num_epochs})", position=0)
+   logger.info(f"Skipping to epoch {resume_epoch} step {resume_step // grad_accum_steps}")
    for epoch in epochs:
        # ======================== Training ================================
        train_start = time.time()
@@ -708,54 +774,53 @@ if __name__ == "__main__":

        # Create sampling rng
        rng, input_rng = jax.random.split(rng)
-       steps_per_epoch = len(train_dataset) // train_batch_size

        # Generate an epoch by shuffling sampling indices from the train dataset
-       num_train_samples = len(train_dataset)
+       num_train_samples = len(tokenized_datasets["train"])
        train_samples_idx = jax.random.permutation(input_rng, jnp.arange(num_train_samples))
-       train_batch_idx = generate_batch_splits(train_samples_idx, train_batch_size) #// grad_accum_steps
+       train_batch_idx = generate_batch_splits(train_samples_idx, train_batch_size // grad_accum_steps)

        # Gather the indexes for creating the batch and do a training step
-       for step, batch_idx in enumerate(tqdm(train_batch_idx, desc="Training...", position=1,initial=resume_step)): #grad_accum
-           samples = [train_dataset[int(idx)] for idx in batch_idx]
+       for step, batch_idx in enumerate(tqdm(train_batch_idx, desc="Training...", position=1,initial=resume_step // grad_accum_steps)):
+           samples = [tokenized_datasets["train"][int(idx)] for idx in batch_idx]
            model_inputs = data_collator(samples, pad_to_multiple_of=16)
-

            # Model forward
            model_inputs = shard(model_inputs.data)
            state, train_metric, dropout_rngs = p_train_step(state, model_inputs, dropout_rngs)
            train_metrics.append(train_metric)

-           cur_step = epoch * (num_train_samples // train_batch_size) + step
+           cur_step = epoch * (num_train_samples // train_batch_size * grad_accum_steps) + step
            if cur_step < resume_step:
                continue

-           if (cur_step % training_args.logging_steps) == 0 and cur_step > 0: # * grad_accum_steps
+           if cur_step % training_args.logging_steps * grad_accum_steps == 0 and cur_step > 0:
                # Save metrics
                train_metric = jax_utils.unreplicate(train_metric)
                train_time += time.time() - train_start
                if has_tensorboard and jax.process_index() == 0:
                    write_train_metric(summary_writer, train_metrics, train_time, cur_step)
+
                if has_wandb and jax.process_index() == 0 and ("wandb" in training_args.report_to):
                    # TODO: add accumulation of metrics
                    _metrics = {k if k=="learning_rate" else f"train_{k}":mb_item(v.mean()) for k, v in train_metric.items()}
                    wandb.log({"training_step":cur_step, **_metrics}, commit=True)
-
+
                epochs.write(
                    f"Step... ({cur_step} | Loss: {train_metric['loss']}, Learning Rate: {train_metric['learning_rate']})"
                )

                train_metrics = []

-           if cur_step % (training_args.eval_steps) == 0 and cur_step > 0: #* grad_accum_steps
+           if cur_step % training_args.eval_steps * grad_accum_steps == 0 and cur_step > 0:
                # ======================== Evaluating ==============================
-               num_eval_samples = len(eval_dataset)
+               num_eval_samples = len(tokenized_datasets["validation"])
                eval_samples_idx = jnp.arange(num_eval_samples)
                eval_batch_idx = generate_batch_splits(eval_samples_idx, eval_batch_size)

                eval_metrics = []
                for i, batch_idx in enumerate(tqdm(eval_batch_idx, desc="Evaluating ...", position=2)):
-                   samples = [eval_dataset[int(idx)] for idx in batch_idx]
+                   samples = [tokenized_datasets["validation"][int(idx)] for idx in batch_idx]
                    model_inputs = data_collator(samples, pad_to_multiple_of=16)

                    # Model forward
@@ -775,30 +840,17 @@ if __name__ == "__main__":
                # Save metrics
                if has_tensorboard and jax.process_index() == 0:
                    write_eval_metric(summary_writer, eval_metrics, cur_step)
-
                if has_wandb and jax.process_index() == 0 and ("wandb" in training_args.report_to):
                    _metrics = {f"eval_{k}":mb_item(v) for k, v in eval_metrics.items()}
                    wandb.log({"eval_step":cur_step, **_metrics})

-           if (cur_step % training_args.save_steps == 0 ) and cur_step > 0: #
+           if cur_step % training_args.save_steps == 0 * grad_accum_steps and cur_step > 0:
                # save checkpoint after each epoch and push checkpoint to the hub
                if jax.process_index() == 0:
-                   params = jax.device_get(jax.tree_map(lambda x: x[0], state.params))
-                   model.save_pretrained(
-                       training_args.output_dir,
-                       params=params,
-                       push_to_hub=training_args.push_to_hub,
-                       commit_message=f"Saving weights and logs of step {cur_step}",
-                   )
-                   save_checkpoint(training_args.output_dir, jax_utils.unreplicate(state), cur_step, keep=training_args.save_total_limit, overwrite=True)
+                   save_model_checkpoint(model, training_args.output_dir, state, with_opt=model_args.save_optimizer,
+                                         push_to_hub=training_args.push_to_hub)
                if training_args.save_total_limit is not None:
                    rotate_checkpoints(training_args.output_dir, training_args.save_total_limit)
-
+
    if jax.process_index() == 0:
-       params = jax.device_get(jax.tree_map(lambda x: x[0], state.params))
-       model.save_pretrained(
-           training_args.output_dir,
-           params=params,
-           push_to_hub=training_args.push_to_hub,
-           commit_message=f"Saving weights and logs of step {cur_step}",
-       )
+       save_model_checkpoint(model, training_args.output_dir, state, with_opt=model_args.save_optimizer, push_to_hub=training_args.push_to_hub)

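The rewrite above drops the hand-rolled `TrainState.grad_accum` logic in favor of `optax.MultiSteps`, which accumulates micro-batch gradients internally and only applies the wrapped optimizer on every k-th call. A minimal, self-contained sketch of that pattern, using toy parameters and constant gradients rather than the actual model:

```python
# Minimal sketch of the optax.MultiSteps accumulation pattern used by the updated script.
# Toy parameters and a constant gradient stand in for the real model and loss.
import jax.numpy as jnp
import optax

params = {"w": jnp.ones((3,))}
tx = optax.chain(optax.clip_by_global_norm(1.0), optax.adamw(learning_rate=1e-4))
tx = optax.MultiSteps(tx, every_k_schedule=4)  # apply the inner update every 4 micro-steps

opt_state = tx.init(params)
for micro_step in range(8):
    grads = {"w": jnp.full((3,), 0.1)}              # stand-in for real gradients
    updates, opt_state = tx.update(grads, opt_state, params)
    params = optax.apply_updates(params, updates)   # zero updates except on every 4th call
    print(micro_step, int(opt_state.gradient_step)) # gradient_step advances once per 4 micro-steps
```

With `every_k_schedule=4` this matches `--gradient_accumulation_steps="4"` in run.sh, which is why the script now divides `state.step` and the batch splits by `grad_accum_steps`.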
run_mlm_flax_no_accum.py ADDED
@@ -0,0 +1,776 @@
1
+ #!/usr/bin/env python
2
+ # coding=utf-8
3
+ # Copyright 2021 The HuggingFace Team All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """
17
+ Fine-tuning the library models for masked language modeling (BERT, ALBERT, RoBERTa...) with whole word masking on a
18
+ text file or a dataset.
19
+
20
+ Here is the full list of checkpoints on the hub that can be fine-tuned by this script:
21
+ https://huggingface.co/models?filter=masked-lm
22
+ """
23
+ import logging
24
+ import os
25
+ import sys
26
+ import time
27
+ from dataclasses import dataclass, field
28
+
29
+ # You can also adapt this script on your own masked language modeling task. Pointers for this are left as comments.
30
+ from pathlib import Path
31
+ from typing import Dict, List, Optional, Tuple
32
+
33
+ import numpy as np
34
+ from datasets import load_dataset, DatasetDict
35
+ from tqdm import tqdm
36
+
37
+ import flax
38
+ import jax
39
+ import jax.numpy as jnp
40
+ import optax
41
+ from flax import jax_utils, traverse_util
42
+ from flax.training import train_state
43
+ from flax.training.common_utils import get_metrics, onehot, shard
44
+ from transformers import (
45
+ CONFIG_MAPPING,
46
+ FLAX_MODEL_FOR_MASKED_LM_MAPPING,
47
+ AutoConfig,
48
+ AutoTokenizer,
49
+ FlaxAutoModelForMaskedLM,
50
+ HfArgumentParser,
51
+ PreTrainedTokenizerBase,
52
+ TensorType,
53
+ TrainingArguments,
54
+ is_tensorboard_available,
55
+ set_seed,
56
+ )
57
+ import json
58
+ from flax.training import checkpoints
59
+ from flax.jax_utils import unreplicate
60
+ from flax.training.checkpoints import save_checkpoint, restore_checkpoint
61
+ from importlib.util import find_spec
62
+
63
+
64
+ MODEL_CONFIG_CLASSES = list(FLAX_MODEL_FOR_MASKED_LM_MAPPING.keys())
65
+ MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
66
+
67
+
68
+ @dataclass
69
+ class ModelArguments:
70
+ """
71
+ Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch.
72
+ """
73
+
74
+ model_name_or_path: Optional[str] = field(
75
+ default=None,
76
+ metadata={
77
+ "help": "The model checkpoint for weights initialization."
78
+ "Don't set if you want to train a model from scratch."
79
+ },
80
+ )
81
+ model_type: Optional[str] = field(
82
+ default=None,
83
+ metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)},
84
+ )
85
+ config_name: Optional[str] = field(
86
+ default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
87
+ )
88
+ tokenizer_name: Optional[str] = field(
89
+ default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
90
+ )
91
+ cache_dir: Optional[str] = field(
92
+ default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
93
+ )
94
+ use_fast_tokenizer: bool = field(
95
+ default=True,
96
+ metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."},
97
+ )
98
+ dtype: Optional[str] = field(
99
+ default="float32",
100
+ metadata={
101
+ "help": "Floating-point format in which the model weights should be initialized and trained. Choose one of `[float32, float16, bfloat16]`."
102
+ },
103
+ )
104
+
105
+
106
+ @dataclass
107
+ class DataTrainingArguments:
108
+ """
109
+ Arguments pertaining to what data we are going to input our model for training and eval.
110
+ """
111
+
112
+ dataset_name: Optional[str] = field(
113
+ default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
114
+ )
115
+ dataset_config_name: Optional[str] = field(
116
+ default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
117
+ )
118
+ train_ref_file: Optional[str] = field(
119
+ default=None,
120
+ metadata={"help": "An optional input train ref data file for whole word masking in Chinese."},
121
+ )
122
+ validation_ref_file: Optional[str] = field(
123
+ default=None,
124
+ metadata={"help": "An optional input validation ref data file for whole word masking in Chinese."},
125
+ )
126
+ overwrite_cache: bool = field(
127
+ default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
128
+ )
129
+
130
+
131
+
132
+ validation_split_percentage: Optional[int] = field(
133
+ default=5,
134
+ metadata={
135
+ "help": "The percentage of the train set used as validation set in case there's no validation split"
136
+ },
137
+ )
138
+ max_seq_length: Optional[int] = field(
139
+ default=None,
140
+ metadata={
141
+ "help": "The maximum total input sequence length after tokenization. Sequences longer "
142
+ "than this will be truncated. Default to the max input length of the model."
143
+ },
144
+ )
145
+ preprocessing_num_workers: Optional[int] = field(
146
+ default=None,
147
+ metadata={"help": "The number of processes to use for the preprocessing."},
148
+ )
149
+ mlm_probability: float = field(
150
+ default=0.15, metadata={"help": "Ratio of tokens to mask for masked language modeling loss"}
151
+ )
152
+ pad_to_max_length: bool = field(
153
+ default=False,
154
+ metadata={
155
+ "help": "Whether to pad all samples to `max_seq_length`. "
156
+ "If False, will pad the samples dynamically when batching to the maximum length in the batch."
157
+ },
158
+ )
159
+ line_by_line: bool = field(
160
+ default=False,
161
+ metadata={"help": "Whether distinct lines of text in the dataset are to be handled as distinct sequences."},
162
+ )
163
+ max_eval_samples: Optional[int] = field(
164
+ default=None,
165
+ metadata={
166
+ "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
167
+ "value if set."
168
+ },
169
+ )
170
+
171
+
172
+
173
+
174
+
175
+
176
+ @flax.struct.dataclass
177
+ class FlaxDataCollatorForLanguageModeling:
178
+ """
179
+ Data collator used for language modeling. Inputs are dynamically padded to the maximum length of a batch if they
180
+ are not all of the same length.
181
+
182
+ Args:
183
+ tokenizer (:class:`~transformers.PreTrainedTokenizer` or :class:`~transformers.PreTrainedTokenizerFast`):
184
+ The tokenizer used for encoding the data.
185
+ mlm_probability (:obj:`float`, `optional`, defaults to 0.15):
186
+ The probability with which to (randomly) mask tokens in the input.
187
+
188
+ .. note::
189
+
190
+ For best performance, this data collator should be used with a dataset having items that are dictionaries or
191
+ BatchEncoding, with the :obj:`"special_tokens_mask"` key, as returned by a
192
+ :class:`~transformers.PreTrainedTokenizer` or a :class:`~transformers.PreTrainedTokenizerFast` with the
193
+ argument :obj:`return_special_tokens_mask=True`.
194
+ """
195
+
196
+ tokenizer: PreTrainedTokenizerBase
197
+ mlm_probability: float = 0.15
198
+
199
+ def __post_init__(self):
200
+ if self.tokenizer.mask_token is None:
201
+ raise ValueError(
202
+ "This tokenizer does not have a mask token which is necessary for masked language modeling. "
203
+ "You should pass `mlm=False` to train on causal language modeling instead."
204
+ )
205
+
206
+ def __call__(self, examples: List[Dict[str, np.ndarray]], pad_to_multiple_of: int) -> Dict[str, np.ndarray]:
207
+ # Handle dict or lists with proper padding and conversion to tensor.
208
+ batch = self.tokenizer.pad(examples, pad_to_multiple_of=pad_to_multiple_of, return_tensors=TensorType.NUMPY)
209
+
210
+ # If special token mask has been preprocessed, pop it from the dict.
211
+ special_tokens_mask = batch.pop("special_tokens_mask", None)
212
+
213
+ batch["input_ids"], batch["labels"] = self.mask_tokens(
214
+ batch["input_ids"], special_tokens_mask=special_tokens_mask
215
+ )
216
+ return batch
217
+
218
+ def mask_tokens(
219
+ self, inputs: np.ndarray, special_tokens_mask: Optional[np.ndarray]
220
+ ) -> Tuple[jnp.ndarray, jnp.ndarray]:
221
+ """
222
+ Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.
223
+ """
224
+ labels = inputs.copy()
225
+ # We sample a few tokens in each sequence for MLM training (with probability `self.mlm_probability`)
226
+ probability_matrix = np.full(labels.shape, self.mlm_probability)
227
+ special_tokens_mask = special_tokens_mask.astype("bool")
228
+
229
+ probability_matrix[special_tokens_mask] = 0.0
230
+ masked_indices = np.random.binomial(1, probability_matrix).astype("bool")
231
+ labels[~masked_indices] = -100 # We only compute loss on masked tokens
232
+
233
+ # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
234
+ indices_replaced = np.random.binomial(1, np.full(labels.shape, 0.8)).astype("bool") & masked_indices
235
+ inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token)
236
+
237
+ # 10% of the time, we replace masked input tokens with random word
238
+ indices_random = np.random.binomial(1, np.full(labels.shape, 0.5)).astype("bool")
239
+ indices_random &= masked_indices & ~indices_replaced
240
+
241
+ random_words = np.random.randint(self.tokenizer.vocab_size, size=labels.shape, dtype="i4")
242
+ inputs[indices_random] = random_words[indices_random]
243
+
244
+ # The rest of the time (10% of the time) we keep the masked input tokens unchanged
245
+ return inputs, labels
246
+
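A minimal sketch (not part of the commit) of how the 80/10/10 corruption split described in the mask_tokens docstring falls out of the three Bernoulli draws above; the sample size and seed are illustrative.

import numpy as np

rng = np.random.default_rng(0)
n = 1_000_000
masked = rng.binomial(1, 0.15, n).astype(bool)                          # tokens selected for MLM (~15%)
replaced = rng.binomial(1, 0.8, n).astype(bool) & masked                # 80% of those become [MASK]
randomized = rng.binomial(1, 0.5, n).astype(bool) & masked & ~replaced  # half of the rest get a random id (~10%)
kept = masked & ~replaced & ~randomized                                 # the remainder stays unchanged (~10%)
print(replaced.sum() / masked.sum(), randomized.sum() / masked.sum(), kept.sum() / masked.sum())
# ~0.80, ~0.10, ~0.10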
247
+
248
+ def generate_batch_splits(samples_idx: jnp.ndarray, batch_size: int) -> jnp.ndarray:
249
+ num_samples = len(samples_idx)
250
+ samples_to_remove = num_samples % batch_size
251
+
252
+ if samples_to_remove != 0:
253
+ samples_idx = samples_idx[:-samples_to_remove]
254
+ sections_split = num_samples // batch_size
255
+ batch_idx = np.split(samples_idx, sections_split)
256
+ return batch_idx
257
+
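For reference, a toy run of the generate_batch_splits logic above (the values are illustrative): ten shuffled indices with a batch size of four drop the two-sample remainder and yield two full batches.

import numpy as np

samples_idx = np.arange(10)
batch_size = 4
samples_to_remove = len(samples_idx) % batch_size
if samples_to_remove != 0:
    samples_idx = samples_idx[:-samples_to_remove]
print(np.split(samples_idx, len(samples_idx) // batch_size))
# [array([0, 1, 2, 3]), array([4, 5, 6, 7])]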
258
+
259
+ def write_train_metric(summary_writer, train_metrics, train_time, step):
260
+ summary_writer.scalar("train_time", train_time, step)
261
+
262
+ train_metrics = get_metrics(train_metrics)
263
+ for key, vals in train_metrics.items():
264
+ tag = f"train_{key}"
265
+ for i, val in enumerate(vals):
266
+ summary_writer.scalar(tag, val, step - len(vals) + i + 1)
267
+
268
+
269
+ def write_eval_metric(summary_writer, eval_metrics, step):
270
+ for metric_name, value in eval_metrics.items():
271
+ summary_writer.scalar(f"eval_{metric_name}", value, step)
272
+
273
+ def rotate_checkpoints(ckpt_dir:str, save_total_limit:int):
274
+ "Removes older checkpoints so that `save_total_limit` checkpoints are kept"
275
+ # TODO: what to remove is decided using step number only, we might want to improve that
276
+ ckpts = [str(x) for x in Path(ckpt_dir).glob("ckpt-*")]
277
+ # sort checkpoints by step
278
+ ckpts_sorted = sorted(ckpts, key=lambda x: int(x.split('-')[-1]))
279
+ ckpts_to_delete = ckpts_sorted[:-save_total_limit]
280
+ for ckpt in ckpts_to_delete:
281
+ logger.info(f"Deleting older checkpoint [{ckpt}] due to save_total_limit ({save_total_limit})")
282
+ shutil.rmtree(ckpt)
283
+
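rotate_checkpoints relies on shutil and pathlib.Path, whose imports are not visible in this hunk, so they are assumed to sit in the import block earlier in the script. A small sketch of the sort key it uses, with made-up directory names:

ckpts = ["out/ckpt-2000", "out/ckpt-500", "out/ckpt-10000"]
print(sorted(ckpts, key=lambda x: int(x.split("-")[-1])))
# ['out/ckpt-500', 'out/ckpt-2000', 'out/ckpt-10000'] -- oldest first, so the head of the list gets deleted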
284
+
285
+ if __name__ == "__main__":
286
+ # See all possible arguments in src/transformers/training_args.py
287
+ # or by passing the --help flag to this script.
288
+ # We now keep distinct sets of args, for a cleaner separation of concerns.
289
+
290
+ parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
291
+ if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
292
+ # If we pass only one argument to the script and it's the path to a json file,
293
+ # let's parse it to get our arguments.
294
+ model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
295
+ else:
296
+ model_args, data_args, training_args = parser.parse_args_into_dataclasses()
297
+
298
+ if (
299
+ os.path.exists(training_args.output_dir)
300
+ and os.listdir(training_args.output_dir)
301
+ and training_args.do_train
302
+ and not training_args.overwrite_output_dir
303
+ ):
304
+ raise ValueError(
305
+ f"Output directory ({training_args.output_dir}) already exists and is not empty."
306
+ "Use --overwrite_output_dir to overcome."
307
+ )
308
+
309
+ # Setup logging
310
+ logging.basicConfig(
311
+ format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
312
+ level="NOTSET",
313
+ datefmt="[%X]",
314
+ )
315
+
316
+ # Log on each process the small summary:
317
+ logger = logging.getLogger(__name__)
318
+
319
+ # Set the verbosity to info of the Transformers logger (on main process only):
320
+ logger.info(f"Training/evaluation parameters {training_args}")
321
+
322
+ # Set seed before initializing model.
323
+ set_seed(training_args.seed)
324
+
325
+ # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
326
+ # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
327
+ # (the dataset will be downloaded automatically from the datasets Hub).
328
+ #
329
+ # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
330
+ # 'text' is found. You can easily tweak this behavior (see below).
331
+ #
332
+ # In distributed training, the load_dataset function guarantees that only one local process can concurrently
333
+ # download the dataset.
334
+ if data_args.dataset_name is not None:
335
+ # Downloading and loading a dataset from the hub.
336
+ datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir)
337
+
338
+ if "validation" not in datasets.keys():
339
+ datasets["validation"] = load_dataset(
340
+ data_args.dataset_name,
341
+ data_args.dataset_config_name,
342
+ split=f"train[:{data_args.validation_split_percentage}%]",
343
+ cache_dir=model_args.cache_dir,
344
+ )
345
+ datasets["train"] = load_dataset(
346
+ data_args.dataset_name,
347
+ data_args.dataset_config_name,
348
+ split=f"train[{data_args.validation_split_percentage}%:]",
349
+ cache_dir=model_args.cache_dir,
350
+ )
351
+ else:
352
+ import glob
353
+ import random
354
+ data_files = []
355
+ def add_jsonlines_dir(path, filespec):
356
+ global data_files
357
+ data_files += glob.glob(f"{path}/{filespec}")
358
+ data_files = list(set(data_files))
359
+ print(f"Number of files {len(data_files)} after adding {path} glob {filespec}")
360
+ add_jsonlines_dir(f"/data/c4_cleaned2", "*.gz")
361
+ add_jsonlines_dir(f"/data/nrc_uniq_cleaned_20210223", "*.gz")
362
+ add_jsonlines_dir(f"/data/nu_uniq_cleaned_20210225", "*.gz")
363
+ random.Random(42).shuffle(data_files)
364
+ total = len(data_files)
365
+ print(total)
366
+ perc = 0.05
367
+ val_size = int(perc * total)
368
+ train_size = total - val_size
369
+ train = data_files[:train_size]
370
+ val = data_files[train_size:]
371
+ print(f"Got {len(train)} training files and {perc * 100} % {len(val)} validation files")
372
+ assert list(set(train) & set(val)) == [], "Train overlaps with test"
373
+ load_grouped = True  # NOTE: only set on this branch; the hub-dataset branch above never defines it
374
+ if not load_grouped:
375
+ datasets = load_dataset('json', data_files={'train': train, 'validation': val})
376
+
377
+ #from datasets import Dataset
378
+
379
+ #dataset = Dataset.from_file("/home/dat/.cache/huggingface/datasets/json/default-9add402b38836560/0.0.0/f92a4de297ac644ad9781979b79064b0e222b3af766f8ea3bee32390dca23723/json-train.arrow")
380
+ #dataset = Dataset.from_file("/home/dat/.cache/huggingface/datasets/json/default-9add402b38836560/0.0.0/f92a4de297ac644ad9781979b79064b0e222b3af766f8ea3bee32390dca23723/json-validation.arrow")
381
+
382
+
383
+ def mb_item(x):
384
+ return x.item() if hasattr(x, "item") else x
385
+
386
+ # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
387
+ # https://huggingface.co/docs/datasets/loading_datasets.html.
388
+
389
+ # Load pretrained model and tokenizer
390
+
391
+ # Distributed training:
392
+ # The .from_pretrained methods guarantee that only one local process can concurrently
393
+ # download model & vocab.
394
+ if model_args.config_name:
395
+ config = AutoConfig.from_pretrained(model_args.config_name, cache_dir=model_args.cache_dir)
396
+ elif model_args.model_name_or_path:
397
+ config = AutoConfig.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
398
+ else:
399
+ config = CONFIG_MAPPING[model_args.model_type]()
400
+ logger.warning("You are instantiating a new config instance from scratch.")
401
+
402
+ if model_args.tokenizer_name:
403
+ tokenizer = AutoTokenizer.from_pretrained(
404
+ model_args.tokenizer_name, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer
405
+ )
406
+ elif model_args.model_name_or_path:
407
+ tokenizer = AutoTokenizer.from_pretrained(
408
+ model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer
409
+ )
410
+ else:
411
+ raise ValueError(
412
+ "You are instantiating a new tokenizer from scratch. This is not supported by this script."
413
+ "You can do it from another script, save it, and load it from here, using --tokenizer_name."
414
+ )
415
+
416
+ # Preprocessing the datasets.
417
+ # First we tokenize all the texts.
418
+
419
+ if load_grouped:
420
+ logger.info("Loading tokenized and grouped dataset")
421
+ tokenized_datasets = DatasetDict.load_from_disk("/data/tokenized_data")
422
+ logger.info("Setting max validation examples to ")
423
+ print(f"Number of validation examples {data_args.max_eval_samples}")
424
+ tokenized_datasets["train"]= tokenized_datasets["train"].select(range(20000))
425
+ if data_args.max_eval_samples is not None:
426
+ tokenized_datasets["validation"] = tokenized_datasets["validation"].select(range(data_args.max_eval_samples))
427
+ else:
428
+ if training_args.do_train:
429
+ column_names = datasets["train"].column_names
430
+ else:
431
+ column_names = datasets["validation"].column_names
432
+ text_column_name = "text" if "text" in column_names else column_names[0]
433
+
434
+ max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
435
+
436
+ if data_args.line_by_line:
437
+ # When using line_by_line, we just tokenize each nonempty line.
438
+ padding = "max_length" if data_args.pad_to_max_length else False
439
+
440
+ def tokenize_function(examples):
441
+ # Remove empty lines
442
+ examples = [line for line in examples if len(line) > 0 and not line.isspace()]
443
+ return tokenizer(
444
+ examples,
445
+ return_special_tokens_mask=True,
446
+ padding=padding,
447
+ truncation=True,
448
+ max_length=max_seq_length,
449
+ )
450
+
451
+ tokenized_datasets = datasets.map(
452
+ tokenize_function,
453
+ input_columns=[text_column_name],
454
+ batched=True,
455
+ num_proc=data_args.preprocessing_num_workers,
456
+ remove_columns=column_names,
457
+ load_from_cache_file=not data_args.overwrite_cache,
458
+ )
459
+
460
+ else:
461
+ # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
462
+ # We use `return_special_tokens_mask=True` because DataCollatorForLanguageModeling (see below) is more
463
+ # efficient when it receives the `special_tokens_mask`.
464
+ def tokenize_function(examples):
465
+ return tokenizer(examples[text_column_name], return_special_tokens_mask=True)
466
+
467
+ tokenized_datasets = datasets.map(
468
+ tokenize_function,
469
+ batched=True,
470
+ num_proc=data_args.preprocessing_num_workers,
471
+ remove_columns=column_names,
472
+ load_from_cache_file=not data_args.overwrite_cache,
473
+ )
474
+
475
+ # Main data processing function that will concatenate all texts from our dataset and generate chunks of
476
+ # max_seq_length.
477
+ def group_texts(examples):
478
+ # Concatenate all texts.
479
+ concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
480
+ total_length = len(concatenated_examples[list(examples.keys())[0]])
481
+ # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
482
+ # customize this part to your needs.
483
+ if total_length >= max_seq_length:
484
+ total_length = (total_length // max_seq_length) * max_seq_length
485
+ # Split by chunks of max_len.
486
+ result = {
487
+ k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)]
488
+ for k, t in concatenated_examples.items()
489
+ }
490
+ return result
491
+
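A toy-scale illustration (not from the commit) of what group_texts does with max_seq_length set to 4: eleven concatenated token ids produce two full chunks, and the three-token remainder is dropped.

examples = {"input_ids": [[1, 2, 3], [4, 5, 6, 7, 8], [9, 10, 11]]}
max_seq_length = 4
concatenated = {k: sum(v, []) for k, v in examples.items()}
total_length = (len(concatenated["input_ids"]) // max_seq_length) * max_seq_length
print({k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)]
       for k, t in concatenated.items()})
# {'input_ids': [[1, 2, 3, 4], [5, 6, 7, 8]]}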
492
+ # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a
493
+ # remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value
494
+ # might be slower to preprocess.
495
+ #
496
+ # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
497
+ # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
498
+ tokenized_datasets = tokenized_datasets.map(
499
+ group_texts,
500
+ batched=True,
501
+ num_proc=data_args.preprocessing_num_workers,
502
+ load_from_cache_file=not data_args.overwrite_cache,
503
+ )
504
+
505
+ #tokenized_datasets.save_to_disk("/data/tokenized_data")
506
+ #print ("tokenized_datasets saved to disk")
507
+
508
+
509
+ # Enable tensorboard only on the master node
510
+ has_tensorboard = is_tensorboard_available()
511
+ if has_tensorboard and jax.process_index() == 0:
512
+ try:
513
+ from flax.metrics.tensorboard import SummaryWriter
514
+
515
+ summary_writer = SummaryWriter(log_dir=Path(training_args.output_dir))
516
+ except ImportError as ie:
517
+ has_tensorboard = False
518
+ logger.warning(
519
+ f"Unable to display metrics through TensorBoard because some package are not installed: {ie}"
520
+ )
521
+ else:
522
+ logger.warning(
523
+ "Unable to display metrics through TensorBoard because the package is not installed: "
524
+ "Please run pip install tensorboard to enable."
525
+ )
526
+ has_wandb = find_spec("wandb") is not None
527
+ if jax.process_index() == 0 and has_wandb and ("wandb" in training_args.report_to):
528
+ try:
529
+ import wandb
530
+ wandb.init(
531
+ entity="wandb",
532
+ project="hf-flax-pino-roberta",
533
+ sync_tensorboard=True
534
+ )
535
+ wandb.config.update(training_args)
536
+ wandb.config.update(model_args)
537
+ wandb.config.update(data_args)
538
+ except ImportError as e:
539
+ print(e)
540
+ has_wandb = False
541
+ # Data collator
542
+ # This one will take care of randomly masking the tokens.
543
+ data_collator = FlaxDataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=data_args.mlm_probability)
544
+
545
+ # Initialize our training
546
+ rng = jax.random.PRNGKey(training_args.seed)
547
+ dropout_rngs = jax.random.split(rng, jax.local_device_count())
548
+
549
+ if model_args.model_name_or_path:
550
+ model = FlaxAutoModelForMaskedLM.from_pretrained(
551
+ model_args.model_name_or_path, config=config, seed=training_args.seed, dtype=getattr(jnp, model_args.dtype)
552
+ )
553
+ else:
554
+ model = FlaxAutoModelForMaskedLM.from_config(
555
+ config, seed=training_args.seed, dtype=getattr(jnp, model_args.dtype)
556
+ )
557
+
558
+ # Store some constant
559
+ num_epochs = int(training_args.num_train_epochs)
560
+ train_batch_size = int(training_args.per_device_train_batch_size) * jax.device_count()
561
+ eval_batch_size = int(training_args.per_device_eval_batch_size) * jax.device_count()
562
+
563
+ num_train_steps = len(tokenized_datasets["train"]) // train_batch_size * num_epochs
564
+
565
+ # Create learning rate schedule
566
+ warmup_fn = optax.linear_schedule(
567
+ init_value=0.0, end_value=training_args.learning_rate, transition_steps=training_args.warmup_steps
568
+ )
569
+ decay_fn = optax.linear_schedule(
570
+ init_value=training_args.learning_rate,
571
+ end_value=0,
572
+ transition_steps=num_train_steps - training_args.warmup_steps,
573
+ )
574
+ linear_decay_lr_schedule_fn = optax.join_schedules(
575
+ schedules=[warmup_fn, decay_fn], boundaries=[training_args.warmup_steps]
576
+ )
577
+
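A standalone sketch of the warmup-plus-linear-decay schedule assembled above. The peak learning rate (5e-5) and 5000 warmup steps match the training arguments logged further down in wandb/debug.log; the total step count is an assumption for illustration only.

import optax

peak_lr, warmup_steps, total_steps = 5e-5, 5_000, 100_000   # total_steps is illustrative
warmup = optax.linear_schedule(init_value=0.0, end_value=peak_lr, transition_steps=warmup_steps)
decay = optax.linear_schedule(init_value=peak_lr, end_value=0.0, transition_steps=total_steps - warmup_steps)
schedule = optax.join_schedules(schedules=[warmup, decay], boundaries=[warmup_steps])
for step in (0, 2_500, 5_000, 52_500, 100_000):
    print(step, float(schedule(step)))   # ramps linearly up to 5e-5, then decays linearly back to 0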
578
+ # We use Optax's "masking" functionality to not apply weight decay
579
+ # to bias and LayerNorm scale parameters. decay_mask_fn returns a
580
+ # mask boolean with the same structure as the parameters.
581
+ # The mask is True for parameters that should be decayed.
582
+ # Note that this mask is specifically adapted for FlaxBERT-like models.
583
+ # For other models, one should correct the layer norm parameter naming
584
+ # accordingly.
585
+ def decay_mask_fn(params):
586
+ flat_params = traverse_util.flatten_dict(params)
587
+ flat_mask = {path: (path[-1] != "bias" and path[-2:] != ("LayerNorm", "scale")) for path in flat_params}
588
+ return traverse_util.unflatten_dict(flat_mask)
589
+
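A toy pytree run through decay_mask_fn above (parameter names are made up): biases and LayerNorm scales come back False, i.e. excluded from weight decay, while kernels come back True.

from flax import traverse_util

params = {"dense": {"kernel": 0.0, "bias": 0.0}, "LayerNorm": {"scale": 0.0, "bias": 0.0}}
flat = traverse_util.flatten_dict(params)
mask = {path: (path[-1] != "bias" and path[-2:] != ("LayerNorm", "scale")) for path in flat}
print(traverse_util.unflatten_dict(mask))
# {'dense': {'kernel': True, 'bias': False}, 'LayerNorm': {'scale': False, 'bias': False}}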
590
+ # create adam optimizer
591
+ if training_args.adafactor:
592
+ # We use the default parameters here to initialize adafactor,
593
+ # For more details about the parameters please check https://github.com/deepmind/optax/blob/ed02befef9bf81cbbf236be3d2b0e032e9ed4a40/optax/_src/alias.py#L74
594
+ optimizer = optax.adafactor(
595
+ learning_rate=linear_decay_lr_schedule_fn,
596
+ )
597
+ else:
598
+ optimizer = optax.adamw(
599
+ learning_rate=linear_decay_lr_schedule_fn,
600
+ b1=training_args.adam_beta1,
601
+ b2=training_args.adam_beta2,
602
+ eps=training_args.adam_epsilon,
603
+ weight_decay=training_args.weight_decay,
604
+ mask=decay_mask_fn,
605
+ )
606
+ optimizer = optax.chain(
607
+ optax.clip_grad_by_global_norm(1.),
608
+ optimizer
609
+ )
610
+
611
+ # Setup train state
612
+ state = train_state.TrainState.create(apply_fn=model.__call__, params=model.params, tx=optimizer)
613
+
614
+ if training_args.resume_from_checkpoint:
615
+ state = restore_checkpoint(training_args.resume_from_checkpoint, state)
616
+ resume_step = mb_item(state.step.item())
617
+ else:
618
+ resume_step = 0
619
+
620
+
621
+ # Define gradient update step fn
622
+ def train_step(state, batch, dropout_rng):
623
+ dropout_rng, new_dropout_rng = jax.random.split(dropout_rng)
624
+
625
+ def loss_fn(params):
626
+ labels = batch.pop("labels")
627
+
628
+ logits = state.apply_fn(**batch, params=params, dropout_rng=dropout_rng, train=True)[0]
629
+
630
+ # compute loss, ignore padded input tokens
631
+ label_mask = jnp.where(labels > 0, 1.0, 0.0)
632
+ loss = optax.softmax_cross_entropy(logits, onehot(labels, logits.shape[-1])) * label_mask
633
+
634
+ # take average
635
+ loss = loss.sum() / label_mask.sum()
636
+
637
+ return loss
638
+
639
+ grad_fn = jax.value_and_grad(loss_fn)
640
+ loss, grad = grad_fn(state.params)
641
+ grad = jax.lax.pmean(grad, "batch")
642
+ new_state = state.apply_gradients(grads=grad)
643
+
644
+ metrics = jax.lax.pmean(
645
+ {"loss": loss, "learning_rate": linear_decay_lr_schedule_fn(state.step)}, axis_name="batch"
646
+ )
647
+
648
+ return new_state, metrics, new_dropout_rng
649
+
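A toy-sized check (values invented) of the masked-loss pattern inside loss_fn: positions whose label is -100 contribute nothing, and the mean is taken only over the masked positions.

import jax.numpy as jnp
import optax
from flax.training.common_utils import onehot

logits = jnp.zeros((1, 3, 5))             # 1 sequence, 3 positions, vocab size 5
labels = jnp.array([[2, -100, -100]])     # only position 0 was masked
label_mask = jnp.where(labels > 0, 1.0, 0.0)
loss = optax.softmax_cross_entropy(logits, onehot(labels, 5)) * label_mask
print(float(loss.sum() / label_mask.sum()))   # ~1.609 = ln(5), a uniform guess over the vocabulary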
650
+ # Create parallel version of the train step
651
+ p_train_step = jax.pmap(train_step, "batch", donate_argnums=(0,))
652
+
653
+ # Define eval fn
654
+ def eval_step(params, batch):
655
+ labels = batch.pop("labels")
656
+
657
+ logits = model(**batch, params=params, train=False)[0]
658
+
659
+ # compute loss, ignore padded input tokens
660
+ label_mask = jnp.where(labels > 0, 1.0, 0.0)
661
+ loss = optax.softmax_cross_entropy(logits, onehot(labels, logits.shape[-1])) * label_mask
662
+
663
+ # compute accuracy
664
+ accuracy = jnp.equal(jnp.argmax(logits, axis=-1), labels) * label_mask
665
+
666
+ # summarize metrics
667
+ metrics = {"loss": loss.sum(), "accuracy": accuracy.sum(), "normalizer": label_mask.sum()}
668
+ metrics = jax.lax.psum(metrics, axis_name="batch")
669
+
670
+ return metrics
671
+
672
+ p_eval_step = jax.pmap(eval_step, "batch", donate_argnums=(0,))
673
+
674
+ # Replicate the train state on each device
675
+ state = jax_utils.replicate(state)
676
+
677
+ train_time = 0
678
+ epochs = tqdm(range(num_epochs), desc=f"Epoch ... (1/{num_epochs})", position=0)
679
+ for epoch in epochs:
680
+ # ======================== Training ================================
681
+ train_start = time.time()
682
+ train_metrics = []
683
+
684
+ # Create sampling rng
685
+ rng, input_rng = jax.random.split(rng)
686
+
687
+ # Generate an epoch by shuffling sampling indices from the train dataset
688
+ num_train_samples = len(tokenized_datasets["train"])
689
+ train_samples_idx = jax.random.permutation(input_rng, jnp.arange(num_train_samples))
690
+ train_batch_idx = generate_batch_splits(train_samples_idx, train_batch_size)
691
+
692
+ # Gather the indexes for creating the batch and do a training step
693
+ for step, batch_idx in enumerate(tqdm(train_batch_idx, desc="Training...", position=1,initial=resume_step)):
694
+ samples = [tokenized_datasets["train"][int(idx)] for idx in batch_idx]
695
+ model_inputs = data_collator(samples, pad_to_multiple_of=16)
696
+
697
+ # Model forward
698
+ model_inputs = shard(model_inputs.data)
699
+ state, train_metric, dropout_rngs = p_train_step(state, model_inputs, dropout_rngs)
700
+ train_metrics.append(train_metric)
701
+
702
+ cur_step = epoch * (num_train_samples // train_batch_size) + step
703
+ if cur_step < resume_step:
704
+ continue
705
+
706
+ if cur_step % training_args.logging_steps == 0 and cur_step > 0:
707
+ # Save metrics
708
+ train_metric = jax_utils.unreplicate(train_metric)
709
+ train_time += time.time() - train_start
710
+ if has_tensorboard and jax.process_index() == 0:
711
+ write_train_metric(summary_writer, train_metrics, train_time, cur_step)
712
+
713
+ if has_wandb and jax.process_index() == 0 and ("wandb" in training_args.report_to):
714
+ # TODO: add accumulation of metrics
715
+ _metrics = {k if k=="learning_rate" else f"train_{k}":mb_item(v.mean()) for k, v in train_metric.items()}
716
+ wandb.log({"training_step":cur_step, **_metrics}, commit=True)
717
+
718
+ epochs.write(
719
+ f"Step... ({cur_step} | Loss: {train_metric['loss']}, Learning Rate: {train_metric['learning_rate']})"
720
+ )
721
+
722
+ train_metrics = []
723
+
724
+ if cur_step % training_args.eval_steps == 0 and cur_step > 0:
725
+ # ======================== Evaluating ==============================
726
+ num_eval_samples = len(tokenized_datasets["validation"])
727
+ eval_samples_idx = jnp.arange(num_eval_samples)
728
+ eval_batch_idx = generate_batch_splits(eval_samples_idx, eval_batch_size)
729
+
730
+ eval_metrics = []
731
+ for i, batch_idx in enumerate(tqdm(eval_batch_idx, desc="Evaluating ...", position=2)):
732
+ samples = [tokenized_datasets["validation"][int(idx)] for idx in batch_idx]
733
+ model_inputs = data_collator(samples, pad_to_multiple_of=16)
734
+
735
+ # Model forward
736
+ model_inputs = shard(model_inputs.data)
737
+ metrics = p_eval_step(state.params, model_inputs)
738
+ eval_metrics.append(metrics)
739
+
740
+ # normalize eval metrics
741
+ eval_metrics = get_metrics(eval_metrics)
742
+ eval_metrics = jax.tree_map(jnp.sum, eval_metrics)
743
+ eval_normalizer = eval_metrics.pop("normalizer")
744
+ eval_metrics = jax.tree_map(lambda x: x / eval_normalizer, eval_metrics)
745
+
746
+ # Update progress bar
747
+ epochs.desc = f"Step... ({cur_step} | Loss: {eval_metrics['loss']}, Acc: {eval_metrics['accuracy']})"
748
+
749
+ # Save metrics
750
+ if has_tensorboard and jax.process_index() == 0:
751
+ write_eval_metric(summary_writer, eval_metrics, cur_step)
752
+ if has_wandb and jax.process_index() == 0 and ("wandb" in training_args.report_to):
753
+ _metrics = {f"eval_{k}":mb_item(v) for k, v in eval_metrics.items()}
754
+ wandb.log({"eval_step":cur_step, **_metrics})
755
+
756
+ if cur_step % training_args.save_steps == 0 and cur_step > 0:
757
+ # save checkpoint after each epoch and push checkpoint to the hub
758
+ if jax.process_index() == 0:
759
+ save_checkpoint(training_args.output_dir, jax_utils.unreplicate(state), cur_step, keep=training_args.save_total_limit, overwrite=True)
760
+ params = jax.device_get(jax.tree_map(lambda x: x[0], state.params))
761
+ model.save_pretrained(
762
+ training_args.output_dir,
763
+ params=params,
764
+ push_to_hub=training_args.push_to_hub,
765
+ commit_message=f"Saving weights and logs of step {cur_step}",
766
+ )
767
+ if training_args.save_total_limit is not None:
768
+ rotate_checkpoints(training_args.output_dir, training_args.save_total_limit)
769
+ if jax.process_index() == 0:
770
+ params = jax.device_get(jax.tree_map(lambda x: x[0], state.params))
771
+ model.save_pretrained(
772
+ training_args.output_dir,
773
+ params=params,
774
+ push_to_hub=training_args.push_to_hub,
775
+ commit_message=f"Saving weights and logs of step {cur_step}",
776
+ )
save_tokenized_data.py ADDED
@@ -0,0 +1,484 @@
1
+ #!/usr/bin/env python
2
+ # coding=utf-8
3
+ # Copyright 2021 The HuggingFace Team All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """
17
+ Fine-tuning the library models for masked language modeling (BERT, ALBERT, RoBERTa...) with whole word masking on a
18
+ text file or a dataset.
19
+
20
+ Here is the full list of checkpoints on the hub that can be fine-tuned by this script:
21
+ https://huggingface.co/models?filter=masked-lm
22
+ """
23
+ import logging
24
+ import os
25
+ import sys
26
+ import time
27
+ from dataclasses import dataclass, field
28
+
29
+ # You can also adapt this script on your own masked language modeling task. Pointers for this are left as comments.
30
+ from pathlib import Path
31
+ from typing import Dict, List, Optional, Tuple
32
+
33
+ import numpy as np
34
+ from datasets import load_dataset
35
+ from tqdm import tqdm
36
+
37
+ import flax
38
+ import jax
39
+ import jax.numpy as jnp
40
+ import optax
41
+ from flax import jax_utils, traverse_util
42
+ from flax.training import train_state
43
+ from flax.training.common_utils import get_metrics, onehot, shard
44
+ from transformers import (
45
+ CONFIG_MAPPING,
46
+ FLAX_MODEL_FOR_MASKED_LM_MAPPING,
47
+ AutoConfig,
48
+ AutoTokenizer,
49
+ FlaxAutoModelForMaskedLM,
50
+ HfArgumentParser,
51
+ PreTrainedTokenizerBase,
52
+ TensorType,
53
+ TrainingArguments,
54
+ is_tensorboard_available,
55
+ set_seed,
56
+ )
57
+ import json
58
+ from flax.training import checkpoints
59
+ from flax.jax_utils import unreplicate
60
+ from flax.training.checkpoints import save_checkpoint, restore_checkpoint
61
+ from importlib.util import find_spec
62
+
63
+
64
+ MODEL_CONFIG_CLASSES = list(FLAX_MODEL_FOR_MASKED_LM_MAPPING.keys())
65
+ MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
66
+
67
+
68
+ @dataclass
69
+ class ModelArguments:
70
+ """
71
+ Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch.
72
+ """
73
+
74
+ model_name_or_path: Optional[str] = field(
75
+ default=None,
76
+ metadata={
77
+ "help": "The model checkpoint for weights initialization."
78
+ "Don't set if you want to train a model from scratch."
79
+ },
80
+ )
81
+ model_type: Optional[str] = field(
82
+ default=None,
83
+ metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)},
84
+ )
85
+ config_name: Optional[str] = field(
86
+ default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
87
+ )
88
+ tokenizer_name: Optional[str] = field(
89
+ default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
90
+ )
91
+ cache_dir: Optional[str] = field(
92
+ default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
93
+ )
94
+ use_fast_tokenizer: bool = field(
95
+ default=True,
96
+ metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."},
97
+ )
98
+ dtype: Optional[str] = field(
99
+ default="float32",
100
+ metadata={
101
+ "help": "Floating-point format in which the model weights should be initialized and trained. Choose one of `[float32, float16, bfloat16]`."
102
+ },
103
+ )
104
+
105
+
106
+ @dataclass
107
+ class DataTrainingArguments:
108
+ """
109
+ Arguments pertaining to what data we are going to input our model for training and eval.
110
+ """
111
+
112
+ dataset_name: Optional[str] = field(
113
+ default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
114
+ )
115
+ dataset_config_name: Optional[str] = field(
116
+ default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
117
+ )
118
+ train_ref_file: Optional[str] = field(
119
+ default=None,
120
+ metadata={"help": "An optional input train ref data file for whole word masking in Chinese."},
121
+ )
122
+ validation_ref_file: Optional[str] = field(
123
+ default=None,
124
+ metadata={"help": "An optional input validation ref data file for whole word masking in Chinese."},
125
+ )
126
+ overwrite_cache: bool = field(
127
+ default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
128
+ )
129
+
130
+
131
+
132
+ validation_split_percentage: Optional[int] = field(
133
+ default=5,
134
+ metadata={
135
+ "help": "The percentage of the train set used as validation set in case there's no validation split"
136
+ },
137
+ )
138
+ max_seq_length: Optional[int] = field(
139
+ default=None,
140
+ metadata={
141
+ "help": "The maximum total input sequence length after tokenization. Sequences longer "
142
+ "than this will be truncated. Default to the max input length of the model."
143
+ },
144
+ )
145
+ preprocessing_num_workers: Optional[int] = field(
146
+ default=None,
147
+ metadata={"help": "The number of processes to use for the preprocessing."},
148
+ )
149
+ mlm_probability: float = field(
150
+ default=0.15, metadata={"help": "Ratio of tokens to mask for masked language modeling loss"}
151
+ )
152
+ pad_to_max_length: bool = field(
153
+ default=False,
154
+ metadata={
155
+ "help": "Whether to pad all samples to `max_seq_length`. "
156
+ "If False, will pad the samples dynamically when batching to the maximum length in the batch."
157
+ },
158
+ )
159
+ line_by_line: bool = field(
160
+ default=False,
161
+ metadata={"help": "Whether distinct lines of text in the dataset are to be handled as distinct sequences."},
162
+ )
163
+ max_eval_samples: Optional[int] = field(
164
+ default=None,
165
+ metadata={
166
+ "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
167
+ "value if set."
168
+ },
169
+ )
170
+
171
+
172
+
173
+
174
+
175
+
176
+ @flax.struct.dataclass
177
+ class FlaxDataCollatorForLanguageModeling:
178
+ """
179
+ Data collator used for language modeling. Inputs are dynamically padded to the maximum length of a batch if they
180
+ are not all of the same length.
181
+
182
+ Args:
183
+ tokenizer (:class:`~transformers.PreTrainedTokenizer` or :class:`~transformers.PreTrainedTokenizerFast`):
184
+ The tokenizer used for encoding the data.
185
+ mlm_probability (:obj:`float`, `optional`, defaults to 0.15):
186
+ The probability with which to (randomly) mask tokens in the input.
187
+
188
+ .. note::
189
+
190
+ For best performance, this data collator should be used with a dataset having items that are dictionaries or
191
+ BatchEncoding, with the :obj:`"special_tokens_mask"` key, as returned by a
192
+ :class:`~transformers.PreTrainedTokenizer` or a :class:`~transformers.PreTrainedTokenizerFast` with the
193
+ argument :obj:`return_special_tokens_mask=True`.
194
+ """
195
+
196
+ tokenizer: PreTrainedTokenizerBase
197
+ mlm_probability: float = 0.15
198
+
199
+ def __post_init__(self):
200
+ if self.tokenizer.mask_token is None:
201
+ raise ValueError(
202
+ "This tokenizer does not have a mask token which is necessary for masked language modeling. "
203
+ "You should pass `mlm=False` to train on causal language modeling instead."
204
+ )
205
+
206
+ def __call__(self, examples: List[Dict[str, np.ndarray]], pad_to_multiple_of: int) -> Dict[str, np.ndarray]:
207
+ # Handle dict or lists with proper padding and conversion to tensor.
208
+ batch = self.tokenizer.pad(examples, pad_to_multiple_of=pad_to_multiple_of, return_tensors=TensorType.NUMPY)
209
+
210
+ # If special token mask has been preprocessed, pop it from the dict.
211
+ special_tokens_mask = batch.pop("special_tokens_mask", None)
212
+
213
+ batch["input_ids"], batch["labels"] = self.mask_tokens(
214
+ batch["input_ids"], special_tokens_mask=special_tokens_mask
215
+ )
216
+ return batch
217
+
218
+ def mask_tokens(
219
+ self, inputs: np.ndarray, special_tokens_mask: Optional[np.ndarray]
220
+ ) -> Tuple[jnp.ndarray, jnp.ndarray]:
221
+ """
222
+ Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.
223
+ """
224
+ labels = inputs.copy()
225
+ # We sample a few tokens in each sequence for MLM training (with probability `self.mlm_probability`)
226
+ probability_matrix = np.full(labels.shape, self.mlm_probability)
227
+ special_tokens_mask = special_tokens_mask.astype("bool")
228
+
229
+ probability_matrix[special_tokens_mask] = 0.0
230
+ masked_indices = np.random.binomial(1, probability_matrix).astype("bool")
231
+ labels[~masked_indices] = -100 # We only compute loss on masked tokens
232
+
233
+ # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
234
+ indices_replaced = np.random.binomial(1, np.full(labels.shape, 0.8)).astype("bool") & masked_indices
235
+ inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token)
236
+
237
+ # 10% of the time, we replace masked input tokens with random word
238
+ indices_random = np.random.binomial(1, np.full(labels.shape, 0.5)).astype("bool")
239
+ indices_random &= masked_indices & ~indices_replaced
240
+
241
+ random_words = np.random.randint(self.tokenizer.vocab_size, size=labels.shape, dtype="i4")
242
+ inputs[indices_random] = random_words[indices_random]
243
+
244
+ # The rest of the time (10% of the time) we keep the masked input tokens unchanged
245
+ return inputs, labels
246
+
247
+
248
+ def generate_batch_splits(samples_idx: jnp.ndarray, batch_size: int) -> jnp.ndarray:
249
+ num_samples = len(samples_idx)
250
+ samples_to_remove = num_samples % batch_size
251
+
252
+ if samples_to_remove != 0:
253
+ samples_idx = samples_idx[:-samples_to_remove]
254
+ sections_split = num_samples // batch_size
255
+ batch_idx = np.split(samples_idx, sections_split)
256
+ return batch_idx
257
+
258
+
259
+ def write_train_metric(summary_writer, train_metrics, train_time, step):
260
+ summary_writer.scalar("train_time", train_time, step)
261
+
262
+ train_metrics = get_metrics(train_metrics)
263
+ for key, vals in train_metrics.items():
264
+ tag = f"train_{key}"
265
+ for i, val in enumerate(vals):
266
+ summary_writer.scalar(tag, val, step - len(vals) + i + 1)
267
+
268
+
269
+ def write_eval_metric(summary_writer, eval_metrics, step):
270
+ for metric_name, value in eval_metrics.items():
271
+ summary_writer.scalar(f"eval_{metric_name}", value, step)
272
+
273
+
274
+ if __name__ == "__main__":
275
+ # See all possible arguments in src/transformers/training_args.py
276
+ # or by passing the --help flag to this script.
277
+ # We now keep distinct sets of args, for a cleaner separation of concerns.
278
+
279
+ parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
280
+ if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
281
+ # If we pass only one argument to the script and it's the path to a json file,
282
+ # let's parse it to get our arguments.
283
+ model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
284
+ else:
285
+ model_args, data_args, training_args = parser.parse_args_into_dataclasses()
286
+
287
+ if (
288
+ os.path.exists(training_args.output_dir)
289
+ and os.listdir(training_args.output_dir)
290
+ and training_args.do_train
291
+ and not training_args.overwrite_output_dir
292
+ ):
293
+ raise ValueError(
294
+ f"Output directory ({training_args.output_dir}) already exists and is not empty."
295
+ "Use --overwrite_output_dir to overcome."
296
+ )
297
+
298
+ # Setup logging
299
+ logging.basicConfig(
300
+ format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
301
+ level="NOTSET",
302
+ datefmt="[%X]",
303
+ )
304
+
305
+ # Log on each process the small summary:
306
+ logger = logging.getLogger(__name__)
307
+
308
+ # Set the verbosity to info of the Transformers logger (on main process only):
309
+ logger.info(f"Training/evaluation parameters {training_args}")
310
+
311
+ # Set seed before initializing model.
312
+ set_seed(training_args.seed)
313
+
314
+ # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
315
+ # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
316
+ # (the dataset will be downloaded automatically from the datasets Hub).
317
+ #
318
+ # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
319
+ # 'text' is found. You can easily tweak this behavior (see below).
320
+ #
321
+ # In distributed training, the load_dataset function guarantees that only one local process can concurrently
322
+ # download the dataset.
323
+ if data_args.dataset_name is not None:
324
+ # Downloading and loading a dataset from the hub.
325
+ datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir)
326
+
327
+ if "validation" not in datasets.keys():
328
+ datasets["validation"] = load_dataset(
329
+ data_args.dataset_name,
330
+ data_args.dataset_config_name,
331
+ split=f"train[:{data_args.validation_split_percentage}%]",
332
+ cache_dir=model_args.cache_dir,
333
+ )
334
+ datasets["train"] = load_dataset(
335
+ data_args.dataset_name,
336
+ data_args.dataset_config_name,
337
+ split=f"train[{data_args.validation_split_percentage}%:]",
338
+ cache_dir=model_args.cache_dir,
339
+ )
340
+ else:
341
+ import glob
342
+ import random
343
+ data_files = []
344
+ def add_jsonlines_dir(path, filespec):
345
+ global data_files
346
+ data_files += glob.glob(f"{path}/{filespec}")
347
+ data_files = list(set(data_files))
348
+ print(f"Number of files {len(data_files)} after adding {path} glob {filespec}")
349
+ #add_jsonlines_dir(f"/data/c4_cleaned2", "*.gz")
350
+ #add_jsonlines_dir(f"/data/nrc_uniq_cleaned_20210223", "*.gz")
351
+ add_jsonlines_dir(f"/data/nu_uniq_cleaned_20210225", "*.gz")
352
+ random.Random(42).shuffle(data_files)
353
+ total = len(data_files)
354
+ print(total)
355
+ perc = 0.05
356
+ val_size = int(perc * total)
357
+ train_size = total - val_size
358
+ train = data_files[5:8]
359
+ val = data_files[1:3]
360
+ print(f"Got {len(train)} training files and {perc * 100} % {len(val)} validation files")
361
+ assert list(set(train) & set(val)) == [], "Train overlaps with test"
362
+ datasets = load_dataset('json', data_files={'train': train, 'validation': val},cache_dir="/home/dat/.cache/huggingface/datasets/json/default-9add402b38836560/0.0.0/f92a4de297ac644ad9781979b79064b0e222b3af766f8ea3bee32390dca23723")
363
+
364
+ #from datasets import Dataset
365
+
366
+ #dataset = Dataset.from_file("/home/dat/.cache/huggingface/datasets/json/default-9add402b38836560/0.0.0/f92a4de297ac644ad9781979b79064b0e222b3af766f8ea3bee32390dca23723/json-train.arrow")
367
+ #dataset = Dataset.from_file("/home/dat/.cache/huggingface/datasets/json/default-9add402b38836560/0.0.0/f92a4de297ac644ad9781979b79064b0e222b3af766f8ea3bee32390dca23723/json-validation.arrow")
368
+
369
+
370
+ def mb_item(x):
371
+ return x.item() if hasattr(x, "item") else x
372
+
373
+ # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
374
+ # https://huggingface.co/docs/datasets/loading_datasets.html.
375
+
376
+ # Load pretrained model and tokenizer
377
+
378
+ # Distributed training:
379
+ # The .from_pretrained methods guarantee that only one local process can concurrently
380
+ # download model & vocab.
381
+ if model_args.config_name:
382
+ config = AutoConfig.from_pretrained(model_args.config_name, cache_dir=model_args.cache_dir)
383
+ elif model_args.model_name_or_path:
384
+ config = AutoConfig.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
385
+ else:
386
+ config = CONFIG_MAPPING[model_args.model_type]()
387
+ logger.warning("You are instantiating a new config instance from scratch.")
388
+
389
+ if model_args.tokenizer_name:
390
+ tokenizer = AutoTokenizer.from_pretrained(
391
+ model_args.tokenizer_name, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer
392
+ )
393
+ elif model_args.model_name_or_path:
394
+ tokenizer = AutoTokenizer.from_pretrained(
395
+ model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer
396
+ )
397
+ else:
398
+ raise ValueError(
399
+ "You are instantiating a new tokenizer from scratch. This is not supported by this script."
400
+ "You can do it from another script, save it, and load it from here, using --tokenizer_name."
401
+ )
402
+
403
+ # Preprocessing the datasets.
404
+ # First we tokenize all the texts.
405
+ if training_args.do_train:
406
+ column_names = datasets["train"].column_names
407
+ else:
408
+ column_names = datasets["validation"].column_names
409
+ text_column_name = "text" if "text" in column_names else column_names[0]
410
+
411
+ max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
412
+
413
+ if data_args.line_by_line:
414
+ # When using line_by_line, we just tokenize each nonempty line.
415
+ padding = "max_length" if data_args.pad_to_max_length else False
416
+
417
+ def tokenize_function(examples):
418
+ # Remove empty lines
419
+ examples = [line for line in examples if len(line) > 0 and not line.isspace()]
420
+ return tokenizer(
421
+ examples,
422
+ return_special_tokens_mask=True,
423
+ padding=padding,
424
+ truncation=True,
425
+ max_length=max_seq_length,
426
+ )
427
+
428
+ tokenized_datasets = datasets.map(
429
+ tokenize_function,
430
+ input_columns=[text_column_name],
431
+ batched=True,
432
+ num_proc=data_args.preprocessing_num_workers,
433
+ remove_columns=column_names,
434
+ load_from_cache_file=not data_args.overwrite_cache,
435
+ )
436
+ tokenized_datasets.save_to_disk("/data/tokenized_data")
437
+ print ("save data")
438
+ else:
439
+ # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
440
+ # We use `return_special_tokens_mask=True` because DataCollatorForLanguageModeling (see below) is more
441
+ # efficient when it receives the `special_tokens_mask`.
442
+ def tokenize_function(examples):
443
+ return tokenizer(examples[text_column_name], return_special_tokens_mask=True)
444
+
445
+ tokenized_datasets = datasets.map(
446
+ tokenize_function,
447
+ batched=True,
448
+ num_proc=data_args.preprocessing_num_workers,
449
+ remove_columns=column_names,
450
+ load_from_cache_file=not data_args.overwrite_cache,
451
+ )
452
+
453
+ # Main data processing function that will concatenate all texts from our dataset and generate chunks of
454
+ # max_seq_length.
455
+ def group_texts(examples):
456
+ # Concatenate all texts.
457
+ concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
458
+ total_length = len(concatenated_examples[list(examples.keys())[0]])
459
+ # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
460
+ # customize this part to your needs.
461
+ if total_length >= max_seq_length:
462
+ total_length = (total_length // max_seq_length) * max_seq_length
463
+ # Split by chunks of max_len.
464
+ result = {
465
+ k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)]
466
+ for k, t in concatenated_examples.items()
467
+ }
468
+ return result
469
+
470
+ # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a
471
+ # remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value
472
+ # might be slower to preprocess.
473
+ #
474
+ # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
475
+ # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
476
+ tokenized_datasets = tokenized_datasets.map(
477
+ group_texts,
478
+ batched=True,
479
+ num_proc=data_args.preprocessing_num_workers,
480
+ load_from_cache_file=not data_args.overwrite_cache,
481
+ )
482
+
483
+ tokenized_datasets.save_to_disk("/data/tokenized_data")
484
+ print ("save data")
train_tokenizer.py ADDED
@@ -0,0 +1,43 @@
1
+ import glob
2
+ import random
3
+ from tokenizers import ByteLevelBPETokenizer
4
+ from datasets import load_dataset
5
+
6
+ data_files = []
7
+ def add_jsonlines_dir(path, filespec):
8
+ global data_files
9
+ data_files += glob.glob(f"{path}/{filespec}")
10
+ data_files = list(set(data_files))
11
+ print(f"Number of files {len(data_files)} after adding {path} glob {filespec}")
12
+ add_jsonlines_dir(f"/data/c4_cleaned2", "*.gz")
13
+ add_jsonlines_dir(f"/data/nrc_uniq_cleaned_20210223", "*.gz")
14
+ add_jsonlines_dir(f"/data/nu_uniq_cleaned_20210225", "*.gz")
15
+ random.Random(42).shuffle(data_files)
16
+ total = len(data_files)
17
+ print(total)
18
+ perc = 0.05
19
+ val_size = int(perc * total)
20
+ train_size = total - val_size
21
+ train = data_files[:train_size]
22
+ val = data_files[train_size:]
23
+ print(f"Got {len(train)} training files and {perc * 100} % {len(val)} validation files")
24
+ assert list(set(train) & set(val)) == [], "Train overlaps with test"
25
+ datasets = load_dataset('json', data_files={'train': train, 'validation': val})
26
+
27
+
28
+
29
+ tokenizer = ByteLevelBPETokenizer()
30
+
31
+ def batch_iterator(batch_size=1000):
32
+ for i in range(0, len(datasets["train"]), batch_size):  # iterate over training examples, not over the DatasetDict splits
33
+ yield datasets["train"][i: i + batch_size]["text"]
34
+
35
+ tokenizer.train_from_iterator(batch_iterator(), vocab_size=50358, min_frequency=2, special_tokens=[
36
+ "<s>",
37
+ "<pad>",
38
+ "</s>",
39
+ "<unk>",
40
+ "<mask>",
41
+ ])
42
+
43
+ tokenizer.save("tokenizer.json")
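A hedged usage note for the script above: the saved tokenizer.json can be reloaded with the tokenizers library (the sample sentence below is arbitrary), or wrapped for transformers via PreTrainedTokenizerFast(tokenizer_file="tokenizer.json") when it is later used for model training.

from tokenizers import Tokenizer

tok = Tokenizer.from_file("tokenizer.json")
print(tok.encode("een kleine proefzin").tokens)   # byte-level BPE pieces for a short Dutch sentence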
wandb/debug-internal.log CHANGED
@@ -1 +1 @@
1
- run-20210713_010630-14xhiyhf/logs/debug-internal.log
1
+ run-20210714_210351-1msvb4w4/logs/debug-internal.log
wandb/debug.log CHANGED
@@ -1 +1 @@
1
- run-20210713_010630-14xhiyhf/logs/debug.log
1
+ run-20210714_210351-1msvb4w4/logs/debug.log
wandb/latest-run CHANGED
@@ -1 +1 @@
1
- run-20210713_010630-14xhiyhf
1
+ run-20210714_210351-1msvb4w4
wandb/run-20210713_010630-14xhiyhf/files/output.log CHANGED
@@ -16222,3 +16222,12 @@ Training...: 64%|████████████▊ | 59500/92767 [9
16222
 
16223
  Training...: 65%|████████████▉ | 60000/92767 [9:35:07<5:11:39, 1.75it/s]
16224
  git-lfs/2.9.2 (GitHub; linux amd64; go 1.13.5)92767 [9:35:07<5:11:39, 1.75it/s]
16225
+ [10:43:30] - DEBUG - huggingface_hub.repository - [Repository] is a valid git repo
16226
+ [10:44:08] - INFO - huggingface_hub.repository - Uploading LFS objects: 100% (3/3), 1.0 GB | 43 MB/s, done.
16227
+ [10:44:09] - INFO - absl - Saving checkpoint at step: 60000
16228
+ tcmalloc: large alloc 1363968000 bytes == 0x2ed6e2000 @ 0x7f170bb8c680 0x7f170bbacbdd 0x7f143fe0e20d 0x7f143fe1c340 0x7f143fe1be87 0x7f143fe1be87 0x7f143fe1be87 0x7f143fe1be87 0x7f143fe1be87 0x7f143fe1be87 0x7f143fe1be87 0x7f143fe1be87 0x7f143fe1be87 0x7f143fe1be87 0x7f143fe1be87 0x7f143fe17bd3 0x7f143fe181fe 0x504d56 0x56acb6 0x568d9a 0x5f5b33 0x56bc9b 0x5f5956 0x56aadf 0x5f5956 0x56fb87 0x568d9a 0x5f5b33 0x56bc9b 0x568d9a 0x68cdc7
16229
+ [10:44:13] - INFO - absl - Saved checkpoint at checkpoint_60000
16230
+
16231
+
16232
+
16233
+
wandb/run-20210713_010630-14xhiyhf/logs/debug-internal.log CHANGED
@@ -22396,3 +22396,27 @@
22396
  2021-07-13 10:43:28,960 INFO Thread-8 :332390 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_010630-14xhiyhf/files/wandb-summary.json
22397
  2021-07-13 10:43:29,961 INFO Thread-8 :332390 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_010630-14xhiyhf/files/output.log
22398
  2021-07-13 10:43:31,962 INFO Thread-8 :332390 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_010630-14xhiyhf/files/output.log
22399
+ 2021-07-13 10:43:36,601 DEBUG HandlerThread:332390 [handler.py:handle_request():124] handle_request: stop_status
22400
+ 2021-07-13 10:43:36,601 DEBUG SenderThread:332390 [sender.py:send_request():193] send_request: stop_status
22401
+ 2021-07-13 10:43:51,734 DEBUG HandlerThread:332390 [handler.py:handle_request():124] handle_request: stop_status
22402
+ 2021-07-13 10:43:51,734 DEBUG SenderThread:332390 [sender.py:send_request():193] send_request: stop_status
22403
+ 2021-07-13 10:43:55,447 DEBUG SenderThread:332390 [sender.py:send():179] send: stats
22404
+ 2021-07-13 10:44:06,865 DEBUG HandlerThread:332390 [handler.py:handle_request():124] handle_request: stop_status
22405
+ 2021-07-13 10:44:06,866 DEBUG SenderThread:332390 [sender.py:send_request():193] send_request: stop_status
22406
+ 2021-07-13 10:44:09,977 INFO Thread-8 :332390 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_010630-14xhiyhf/files/output.log
22407
+ 2021-07-13 10:44:14,979 INFO Thread-8 :332390 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_010630-14xhiyhf/files/output.log
22408
+ 2021-07-13 10:44:16,979 INFO Thread-8 :332390 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_010630-14xhiyhf/files/output.log
22409
+ 2021-07-13 10:44:18,980 INFO Thread-8 :332390 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_010630-14xhiyhf/files/output.log
22410
+ 2021-07-13 10:44:20,981 INFO Thread-8 :332390 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_010630-14xhiyhf/files/output.log
22411
+ 2021-07-13 10:44:22,005 DEBUG HandlerThread:332390 [handler.py:handle_request():124] handle_request: stop_status
22412
+ 2021-07-13 10:44:22,005 DEBUG SenderThread:332390 [sender.py:send_request():193] send_request: stop_status
22413
+ 2021-07-13 10:44:22,982 INFO Thread-8 :332390 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_010630-14xhiyhf/files/output.log
22414
+ 2021-07-13 10:44:23,482 WARNING MainThread:332390 [internal.py:wandb_internal():147] Internal process interrupt: 1
22415
+ 2021-07-13 10:44:24,702 WARNING MainThread:332390 [internal.py:wandb_internal():147] Internal process interrupt: 2
22416
+ 2021-07-13 10:44:24,703 ERROR MainThread:332390 [internal.py:wandb_internal():150] Internal process interrupted.
22417
+ 2021-07-13 10:44:24,982 INFO Thread-8 :332390 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_010630-14xhiyhf/files/output.log
22418
+ 2021-07-13 10:44:25,021 INFO SenderThread:332390 [sender.py:finish():945] shutting down sender
22419
+ 2021-07-13 10:44:25,022 INFO SenderThread:332390 [dir_watcher.py:finish():282] shutting down directory watcher
22420
+ 2021-07-13 10:44:25,022 INFO WriterThread:332390 [datastore.py:close():288] close: /home/dat/pino-roberta-base/wandb/run-20210713_010630-14xhiyhf/run-14xhiyhf.wandb
22421
+ 2021-07-13 10:44:25,022 INFO HandlerThread:332390 [handler.py:finish():638] shutting down handler
22422
+ 2021-07-13 10:44:25,103 INFO MainThread:332390 [internal.py:handle_exit():78] Internal process exited
wandb/run-20210713_010630-14xhiyhf/logs/debug.log CHANGED
@@ -23,3 +23,5 @@ config: {}
23
  2021-07-13 01:06:32,711 INFO MainThread:330819 [wandb_run.py:_config_callback():872] config_cb None None {'output_dir': './', 'overwrite_output_dir': True, 'do_train': False, 'do_eval': False, 'do_predict': False, 'evaluation_strategy': 'IntervalStrategy.NO', 'prediction_loss_only': False, 'per_device_train_batch_size': 2, 'per_device_eval_batch_size': 2, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'learning_rate': 5e-05, 'weight_decay': 0.0095, 'adam_beta1': 0.9, 'adam_beta2': 0.98, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 5.0, 'max_steps': -1, 'lr_scheduler_type': 'SchedulerType.LINEAR', 'warmup_ratio': 0.0, 'warmup_steps': 5000, 'log_level': -1, 'log_level_replica': -1, 'log_on_each_node': True, 'logging_dir': './runs/Jul13_01-05-41_t1v-n-f5c06ea1-w-0', 'logging_strategy': 'IntervalStrategy.STEPS', 'logging_first_step': False, 'logging_steps': 500, 'save_strategy': 'IntervalStrategy.STEPS', 'save_steps': 20000, 'save_total_limit': 5, 'save_on_each_node': False, 'no_cuda': False, 'seed': 42, 'fp16': False, 'fp16_opt_level': 'O1', 'fp16_backend': 'auto', 'fp16_full_eval': False, 'local_rank': -1, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 92768, 'dataloader_num_workers': 0, 'past_index': -1, 'run_name': './', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'sharded_ddp': [], 'deepspeed': None, 'label_smoothing_factor': 0.0, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['tensorboard', 'wandb'], 'ddp_find_unused_parameters': None, 'dataloader_pin_memory': True, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': True, 'resume_from_checkpoint': None, 'push_to_hub_model_id': '', 'push_to_hub_organization': None, 'push_to_hub_token': None, 'mp_parameters': ''}
24
  2021-07-13 01:06:32,712 INFO MainThread:330819 [wandb_run.py:_config_callback():872] config_cb None None {'model_name_or_path': None, 'model_type': 'big_bird', 'config_name': './', 'tokenizer_name': './', 'cache_dir': None, 'use_fast_tokenizer': True, 'dtype': 'bfloat16'}
25
  2021-07-13 01:06:32,714 INFO MainThread:330819 [wandb_run.py:_config_callback():872] config_cb None None {'dataset_name': None, 'dataset_config_name': None, 'train_file': None, 'validation_file': None, 'train_ref_file': None, 'validation_ref_file': None, 'overwrite_cache': False, 'validation_split_percentage': 5, 'max_seq_length': 4096, 'preprocessing_num_workers': 64, 'mlm_probability': 0.15, 'pad_to_max_length': False, 'line_by_line': False}
26
+ 2021-07-13 10:44:23,634 INFO MainThread:330819 [wandb_run.py:_atexit_cleanup():1593] got exitcode: 255
27
+ 2021-07-13 10:44:23,634 INFO MainThread:330819 [wandb_run.py:_restore():1565] restore
wandb/run-20210713_010630-14xhiyhf/run-14xhiyhf.wandb CHANGED
Binary files a/wandb/run-20210713_010630-14xhiyhf/run-14xhiyhf.wandb and b/wandb/run-20210713_010630-14xhiyhf/run-14xhiyhf.wandb differ
wandb/run-20210713_104745-1rl2j7or/files/config.yaml ADDED
@@ -0,0 +1,304 @@
1
+ wandb_version: 1
2
+
3
+ _wandb:
4
+ desc: null
5
+ value:
6
+ cli_version: 0.10.33
7
+ framework: huggingface
8
+ huggingface_version: 4.9.0.dev0
9
+ is_jupyter_run: false
10
+ is_kaggle_kernel: false
11
+ python_version: 3.8.10
12
+ t:
13
+ 1:
14
+ - 3
15
+ - 11
16
+ 4: 3.8.10
17
+ 5: 0.10.33
18
+ 6: 4.9.0.dev0
19
+ 8:
20
+ - 5
21
+ adafactor:
22
+ desc: null
23
+ value: false
24
+ adam_beta1:
25
+ desc: null
26
+ value: 0.9
27
+ adam_beta2:
28
+ desc: null
29
+ value: 0.98
30
+ adam_epsilon:
31
+ desc: null
32
+ value: 1.0e-08
33
+ cache_dir:
34
+ desc: null
35
+ value: null
36
+ config_name:
37
+ desc: null
38
+ value: ./
39
+ dataloader_drop_last:
40
+ desc: null
41
+ value: false
42
+ dataloader_num_workers:
43
+ desc: null
44
+ value: 0
45
+ dataloader_pin_memory:
46
+ desc: null
47
+ value: true
48
+ dataset_config_name:
49
+ desc: null
50
+ value: null
51
+ dataset_name:
52
+ desc: null
53
+ value: null
54
+ ddp_find_unused_parameters:
55
+ desc: null
56
+ value: null
57
+ debug:
58
+ desc: null
59
+ value: []
60
+ deepspeed:
61
+ desc: null
62
+ value: null
63
+ disable_tqdm:
64
+ desc: null
65
+ value: false
66
+ do_eval:
67
+ desc: null
68
+ value: false
69
+ do_predict:
70
+ desc: null
71
+ value: false
72
+ do_train:
73
+ desc: null
74
+ value: false
75
+ dtype:
76
+ desc: null
77
+ value: float32
78
+ eval_accumulation_steps:
79
+ desc: null
80
+ value: null
81
+ eval_steps:
82
+ desc: null
83
+ value: 100001
84
+ evaluation_strategy:
85
+ desc: null
86
+ value: IntervalStrategy.NO
87
+ fp16:
88
+ desc: null
89
+ value: false
90
+ fp16_backend:
91
+ desc: null
92
+ value: auto
93
+ fp16_full_eval:
94
+ desc: null
95
+ value: false
96
+ fp16_opt_level:
97
+ desc: null
98
+ value: O1
99
+ gradient_accumulation_steps:
100
+ desc: null
101
+ value: 2
102
+ greater_is_better:
103
+ desc: null
104
+ value: null
105
+ group_by_length:
106
+ desc: null
107
+ value: false
108
+ ignore_data_skip:
109
+ desc: null
110
+ value: false
111
+ label_names:
112
+ desc: null
113
+ value: null
114
+ label_smoothing_factor:
115
+ desc: null
116
+ value: 0.0
117
+ learning_rate:
118
+ desc: null
119
+ value: 5.0e-05
120
+ length_column_name:
121
+ desc: null
122
+ value: length
123
+ line_by_line:
124
+ desc: null
125
+ value: false
126
+ load_best_model_at_end:
127
+ desc: null
128
+ value: false
129
+ local_rank:
130
+ desc: null
131
+ value: -1
132
+ log_level:
133
+ desc: null
134
+ value: -1
135
+ log_level_replica:
136
+ desc: null
137
+ value: -1
138
+ log_on_each_node:
139
+ desc: null
140
+ value: true
141
+ logging_dir:
142
+ desc: null
143
+ value: ./runs/Jul13_10-47-16_t1v-n-f5c06ea1-w-0
144
+ logging_first_step:
145
+ desc: null
146
+ value: false
147
+ logging_steps:
148
+ desc: null
149
+ value: 50
150
+ logging_strategy:
151
+ desc: null
152
+ value: IntervalStrategy.STEPS
153
+ lr_scheduler_type:
154
+ desc: null
155
+ value: SchedulerType.LINEAR
156
+ max_grad_norm:
157
+ desc: null
158
+ value: 1.0
159
+ max_seq_length:
160
+ desc: null
161
+ value: 4096
162
+ max_steps:
163
+ desc: null
164
+ value: -1
165
+ metric_for_best_model:
166
+ desc: null
167
+ value: null
168
+ mlm_probability:
169
+ desc: null
170
+ value: 0.15
171
+ model_name_or_path:
172
+ desc: null
173
+ value: null
174
+ model_type:
175
+ desc: null
176
+ value: big_bird
177
+ mp_parameters:
178
+ desc: null
179
+ value: ''
180
+ no_cuda:
181
+ desc: null
182
+ value: false
183
+ num_train_epochs:
184
+ desc: null
185
+ value: 5.0
186
+ output_dir:
187
+ desc: null
188
+ value: ./
189
+ overwrite_cache:
190
+ desc: null
191
+ value: false
192
+ overwrite_output_dir:
193
+ desc: null
194
+ value: true
195
+ pad_to_max_length:
196
+ desc: null
197
+ value: false
198
+ past_index:
199
+ desc: null
200
+ value: -1
201
+ per_device_eval_batch_size:
202
+ desc: null
203
+ value: 2
204
+ per_device_train_batch_size:
205
+ desc: null
206
+ value: 2
207
+ per_gpu_eval_batch_size:
208
+ desc: null
209
+ value: null
210
+ per_gpu_train_batch_size:
211
+ desc: null
212
+ value: null
213
+ prediction_loss_only:
214
+ desc: null
215
+ value: false
216
+ preprocessing_num_workers:
217
+ desc: null
218
+ value: 64
219
+ push_to_hub:
220
+ desc: null
221
+ value: true
222
+ push_to_hub_model_id:
223
+ desc: null
224
+ value: ''
225
+ push_to_hub_organization:
226
+ desc: null
227
+ value: null
228
+ push_to_hub_token:
229
+ desc: null
230
+ value: null
231
+ remove_unused_columns:
232
+ desc: null
233
+ value: true
234
+ report_to:
235
+ desc: null
236
+ value:
237
+ - tensorboard
238
+ - wandb
239
+ resume_from_checkpoint:
240
+ desc: null
241
+ value: null
242
+ run_name:
243
+ desc: null
244
+ value: ./
245
+ save_on_each_node:
246
+ desc: null
247
+ value: false
248
+ save_steps:
249
+ desc: null
250
+ value: 20000
251
+ save_strategy:
252
+ desc: null
253
+ value: IntervalStrategy.STEPS
254
+ save_total_limit:
255
+ desc: null
256
+ value: 5
257
+ seed:
258
+ desc: null
259
+ value: 42
260
+ sharded_ddp:
261
+ desc: null
262
+ value: []
263
+ skip_memory_metrics:
264
+ desc: null
265
+ value: true
266
+ tokenizer_name:
267
+ desc: null
268
+ value: ./
269
+ tpu_metrics_debug:
270
+ desc: null
271
+ value: false
272
+ tpu_num_cores:
273
+ desc: null
274
+ value: null
275
+ train_file:
276
+ desc: null
277
+ value: null
278
+ train_ref_file:
279
+ desc: null
280
+ value: null
281
+ use_fast_tokenizer:
282
+ desc: null
283
+ value: true
284
+ use_legacy_prediction_loop:
285
+ desc: null
286
+ value: false
287
+ validation_file:
288
+ desc: null
289
+ value: null
290
+ validation_ref_file:
291
+ desc: null
292
+ value: null
293
+ validation_split_percentage:
294
+ desc: null
295
+ value: 5
296
+ warmup_ratio:
297
+ desc: null
298
+ value: 0.0
299
+ warmup_steps:
300
+ desc: null
301
+ value: 10
302
+ weight_decay:
303
+ desc: null
304
+ value: 0.0095
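The config.yaml above is wandb's flat dump of the run arguments, with each key stored as a desc/value pair. As a minimal sketch (assuming PyYAML, pinned in requirements.txt as pyyaml==5.4.1, and the file path shown above), a dump of this shape can be flattened back into a plain argument dict for inspection:

    import yaml  # PyYAML (pyyaml==5.4.1 in requirements.txt)

    # Path of the dumped run config shown above (assumed unchanged).
    path = "wandb/run-20210713_104745-1rl2j7or/files/config.yaml"
    with open(path) as f:
        raw = yaml.safe_load(f)

    # Every entry except the internal _wandb block is a {desc, value} pair;
    # keeping only the values recovers the original argument dict.
    config = {k: v["value"] for k, v in raw.items()
              if isinstance(v, dict) and "value" in v and k != "_wandb"}

    print(config["per_device_train_batch_size"], config["gradient_accumulation_steps"])

For this run the printed pair is 2 and 2, i.e. per-device batch size 2 with 2 gradient-accumulation steps, matching the values recorded above.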
wandb/run-20210713_104745-1rl2j7or/files/output.log ADDED
@@ -0,0 +1,57 @@
1
+ /home/dat/pino/lib/python3.8/site-packages/jax/_src/numpy/lax_numpy.py:3114: UserWarning: Explicitly requested dtype <class 'jax._src.numpy.lax_numpy.int64'> requested in zeros is not available, and will be truncated to dtype int32. To enable more dtypes, set the jax_enable_x64 configuration option or the JAX_ENABLE_X64 shell environment variable. See https://github.com/google/jax#current-gotchas for more.
2
+ lax._check_user_dtype_supported(dtype, "zeros")
3
+ /home/dat/pino/lib/python3.8/site-packages/jax/lib/xla_bridge.py:382: UserWarning: jax.host_count has been renamed to jax.process_count. This alias will eventually be removed; please update your code.
4
+ warnings.warn(
5
+ /home/dat/pino/lib/python3.8/site-packages/jax/lib/xla_bridge.py:369: UserWarning: jax.host_id has been renamed to jax.process_index. This alias will eventually be removed; please update your code.
6
+ warnings.warn(
7
+ Epoch ... (1/5): 0%| | 0/5 [00:00<?, ?it/s]
8
+
9
+
10
+
11
+
12
+
13
+
14
+
15
+ Training...: 60%|██████████████████ | 50/83 [01:32<00:23, 1.40it/s]
16
+
17
+
18
+
19
+ Epoch ... (1/5): 20%|█████▍ | 1/5 [02:00<08:02, 120.70s/it]
20
+
21
+ Training...: 16%|████▋ | 13/83 [00:07<00:53, 1.32it/s]
22
+
23
+
24
+
25
+
26
+
27
+
28
+
29
+ Training...: 78%|███████████████████████▍ | 65/83 [00:44<00:24, 1.38s/it]
30
+
31
+ Epoch ... (1/5): 40%|███████████▏ | 2/5 [03:06<04:25, 88.56s/it]
32
+
33
+ Training...: 22%|██████▉ | 18/83 [00:01<00:07, 9.26it/s]
34
+
35
+
36
+
37
+
38
+
39
+
40
+
41
+
42
+ Epoch ... (1/5): 60%|████████████████▊ | 3/5 [04:12<02:36, 78.08s/it]s]
43
+ Step... (150 | Loss: 7.8581647872924805, Learning Rate: 2.256410152767785e-05)
44
+
45
+ Training...: 33%|███████████ | 27/83 [00:03<00:06, 9.31it/s]
46
+
47
+
48
+
49
+
50
+
51
+
52
+
53
+ Training...: 93%|███████████████████████████████▌ | 77/83 [00:32<00:04, 1.41it/s]
54
+
55
+ Epoch ... (1/5): 80%|██████████████████████▍ | 4/5 [05:18<01:13, 73.25s/it]/it]
56
+
57
+
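The two deprecation warnings at the top of this log refer to JAX's host-to-process rename. A minimal sketch of the replacement calls the warnings point to (standard JAX API, not code from this repo):

    import jax

    # Replacements suggested by the warnings above:
    n_processes = jax.process_count()   # formerly jax.host_count()
    process_id = jax.process_index()    # formerly jax.host_id()
    print(n_processes, process_id)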
wandb/run-20210713_104745-1rl2j7or/files/requirements.txt ADDED
@@ -0,0 +1,92 @@
1
+ absl-py==0.13.0
2
+ aiohttp==3.7.4.post0
3
+ astunparse==1.6.3
4
+ async-timeout==3.0.1
5
+ attrs==21.2.0
6
+ cachetools==4.2.2
7
+ certifi==2021.5.30
8
+ chardet==4.0.0
9
+ chex==0.0.8
10
+ click==8.0.1
11
+ configparser==5.0.2
12
+ cycler==0.10.0
13
+ datasets==1.9.1.dev0
14
+ dill==0.3.4
15
+ dm-tree==0.1.6
16
+ docker-pycreds==0.4.0
17
+ filelock==3.0.12
18
+ flatbuffers==1.12
19
+ flax==0.3.4
20
+ fsspec==2021.6.1
21
+ gast==0.4.0
22
+ gitdb==4.0.7
23
+ gitpython==3.1.18
24
+ google-auth-oauthlib==0.4.4
25
+ google-auth==1.32.1
26
+ google-pasta==0.2.0
27
+ grpcio==1.34.1
28
+ h5py==3.1.0
29
+ huggingface-hub==0.0.12
30
+ idna==2.10
31
+ jax==0.2.16
32
+ jaxlib==0.1.68
33
+ joblib==1.0.1
34
+ keras-nightly==2.5.0.dev2021032900
35
+ keras-preprocessing==1.1.2
36
+ kiwisolver==1.3.1
37
+ libtpu-nightly==0.1.dev20210615
38
+ markdown==3.3.4
39
+ matplotlib==3.4.2
40
+ msgpack==1.0.2
41
+ multidict==5.1.0
42
+ multiprocess==0.70.12.2
43
+ numpy==1.19.5
44
+ oauthlib==3.1.1
45
+ opt-einsum==3.3.0
46
+ optax==0.0.9
47
+ packaging==21.0
48
+ pandas==1.3.0
49
+ pathtools==0.1.2
50
+ pillow==8.3.1
51
+ pip==20.0.2
52
+ pkg-resources==0.0.0
53
+ promise==2.3
54
+ protobuf==3.17.3
55
+ psutil==5.8.0
56
+ pyarrow==4.0.1
57
+ pyasn1-modules==0.2.8
58
+ pyasn1==0.4.8
59
+ pyparsing==2.4.7
60
+ python-dateutil==2.8.1
61
+ pytz==2021.1
62
+ pyyaml==5.4.1
63
+ regex==2021.7.6
64
+ requests-oauthlib==1.3.0
65
+ requests==2.25.1
66
+ rsa==4.7.2
67
+ sacremoses==0.0.45
68
+ scipy==1.7.0
69
+ sentry-sdk==1.3.0
70
+ setuptools==44.0.0
71
+ shortuuid==1.0.1
72
+ six==1.15.0
73
+ smmap==4.0.0
74
+ subprocess32==3.5.4
75
+ tensorboard-data-server==0.6.1
76
+ tensorboard-plugin-wit==1.8.0
77
+ tensorboard==2.5.0
78
+ tensorflow-estimator==2.5.0
79
+ tensorflow==2.5.0
80
+ termcolor==1.1.0
81
+ tokenizers==0.10.3
82
+ toolz==0.11.1
83
+ tqdm==4.61.2
84
+ transformers==4.9.0.dev0
85
+ typing-extensions==3.7.4.3
86
+ urllib3==1.26.6
87
+ wandb==0.10.33
88
+ werkzeug==2.0.1
89
+ wheel==0.36.2
90
+ wrapt==1.12.1
91
+ xxhash==2.0.2
92
+ yarl==1.6.3
wandb/run-20210713_104745-1rl2j7or/files/wandb-metadata.json ADDED
@@ -0,0 +1,44 @@
1
+ {
2
+ "os": "Linux-5.4.0-1043-gcp-x86_64-with-glibc2.29",
3
+ "python": "3.8.10",
4
+ "heartbeatAt": "2021-07-13T10:47:47.215746",
5
+ "startedAt": "2021-07-13T10:47:45.129053",
6
+ "docker": null,
7
+ "cpu_count": 96,
8
+ "cuda": null,
9
+ "args": [
10
+ "--push_to_hub",
11
+ "--output_dir=./",
12
+ "--model_type=big_bird",
13
+ "--config_name=./",
14
+ "--tokenizer_name=./",
15
+ "--max_seq_length=4096",
16
+ "--weight_decay=0.0095",
17
+ "--warmup_steps=10",
18
+ "--overwrite_output_dir",
19
+ "--adam_beta1=0.9",
20
+ "--adam_beta2=0.98",
21
+ "--logging_steps=50",
22
+ "--eval_steps=100001",
23
+ "--num_train_epochs=5",
24
+ "--preprocessing_num_workers=64",
25
+ "--save_steps=20000",
26
+ "--learning_rate=5e-5",
27
+ "--per_device_train_batch_size=2",
28
+ "--per_device_eval_batch_size=2",
29
+ "--save_total_limit=5",
30
+ "--gradient_accumulation_steps=2"
31
+ ],
32
+ "state": "running",
33
+ "program": "./run_mlm_flax.py",
34
+ "codePath": "run_mlm_flax.py",
35
+ "git": {
36
+ "remote": "https://huggingface.co/flax-community/pino-roberta-base",
37
+ "commit": "bc11ccfe77236f87575711b26034b9751449de4b"
38
+ },
39
+ "email": null,
40
+ "root": "/home/dat/pino-roberta-base",
41
+ "host": "t1v-n-f5c06ea1-w-0",
42
+ "username": "dat",
43
+ "executable": "/home/dat/pino/bin/python"
44
+ }
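The args, program, and executable fields above record exactly how this run was launched. As an illustrative sketch (not part of the repo), the full command line can be reconstructed from this metadata file:

    import json
    import shlex

    # Metadata written by wandb for this run (path as shown in the diff above).
    with open("wandb/run-20210713_104745-1rl2j7or/files/wandb-metadata.json") as f:
        meta = json.load(f)

    # e.g. /home/dat/pino/bin/python ./run_mlm_flax.py --push_to_hub --output_dir=./ ...
    cmd = " ".join(shlex.quote(part)
                   for part in [meta["executable"], meta["program"], *meta["args"]])
    print(cmd)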
wandb/run-20210713_104745-1rl2j7or/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
1
+ {"training_step": 200, "learning_rate": 1.0769229447760154e-05, "train_loss": 7.618040084838867, "_runtime": 333, "_timestamp": 1626173598, "_step": 6}
wandb/run-20210713_104745-1rl2j7or/logs/debug-internal.log ADDED
@@ -0,0 +1,181 @@
1
+ 2021-07-13 10:47:45,828 INFO MainThread:342403 [internal.py:wandb_internal():88] W&B internal server running at pid: 342403, started at: 2021-07-13 10:47:45.828158
2
+ 2021-07-13 10:47:45,830 DEBUG HandlerThread:342403 [handler.py:handle_request():124] handle_request: check_version
3
+ 2021-07-13 10:47:45,830 INFO WriterThread:342403 [datastore.py:open_for_write():80] open: /home/dat/pino-roberta-base/wandb/run-20210713_104745-1rl2j7or/run-1rl2j7or.wandb
4
+ 2021-07-13 10:47:45,831 DEBUG SenderThread:342403 [sender.py:send():179] send: header
5
+ 2021-07-13 10:47:45,831 DEBUG SenderThread:342403 [sender.py:send_request():193] send_request: check_version
6
+ 2021-07-13 10:47:45,871 DEBUG SenderThread:342403 [sender.py:send():179] send: run
7
+ 2021-07-13 10:47:46,041 INFO SenderThread:342403 [dir_watcher.py:__init__():168] watching files in: /home/dat/pino-roberta-base/wandb/run-20210713_104745-1rl2j7or/files
8
+ 2021-07-13 10:47:46,041 INFO SenderThread:342403 [sender.py:_start_run_threads():716] run started: 1rl2j7or with start time 1626173265
9
+ 2021-07-13 10:47:46,041 DEBUG SenderThread:342403 [sender.py:send():179] send: summary
10
+ 2021-07-13 10:47:46,041 DEBUG HandlerThread:342403 [handler.py:handle_request():124] handle_request: run_start
11
+ 2021-07-13 10:47:46,042 INFO SenderThread:342403 [sender.py:_save_file():841] saving file wandb-summary.json with policy end
12
+ 2021-07-13 10:47:47,043 INFO Thread-8 :342403 [dir_watcher.py:_on_file_created():216] file/dir created: /home/dat/pino-roberta-base/wandb/run-20210713_104745-1rl2j7or/files/wandb-summary.json
13
+ 2021-07-13 10:47:47,215 DEBUG HandlerThread:342403 [meta.py:__init__():39] meta init
14
+ 2021-07-13 10:47:47,215 DEBUG HandlerThread:342403 [meta.py:__init__():53] meta init done
15
+ 2021-07-13 10:47:47,215 DEBUG HandlerThread:342403 [meta.py:probe():210] probe
16
+ 2021-07-13 10:47:47,217 DEBUG HandlerThread:342403 [meta.py:_setup_git():200] setup git
17
+ 2021-07-13 10:47:47,250 DEBUG HandlerThread:342403 [meta.py:_setup_git():207] setup git done
18
+ 2021-07-13 10:47:47,250 DEBUG HandlerThread:342403 [meta.py:_save_pip():57] save pip
19
+ 2021-07-13 10:47:47,251 DEBUG HandlerThread:342403 [meta.py:_save_pip():71] save pip done
20
+ 2021-07-13 10:47:47,251 DEBUG HandlerThread:342403 [meta.py:probe():252] probe done
21
+ 2021-07-13 10:47:47,255 DEBUG SenderThread:342403 [sender.py:send():179] send: files
22
+ 2021-07-13 10:47:47,255 INFO SenderThread:342403 [sender.py:_save_file():841] saving file wandb-metadata.json with policy now
23
+ 2021-07-13 10:47:47,262 DEBUG HandlerThread:342403 [handler.py:handle_request():124] handle_request: stop_status
24
+ 2021-07-13 10:47:47,262 DEBUG SenderThread:342403 [sender.py:send_request():193] send_request: stop_status
25
+ 2021-07-13 10:47:47,394 DEBUG SenderThread:342403 [sender.py:send():179] send: config
26
+ 2021-07-13 10:47:47,394 DEBUG SenderThread:342403 [sender.py:send():179] send: config
27
+ 2021-07-13 10:47:47,394 DEBUG SenderThread:342403 [sender.py:send():179] send: config
28
+ 2021-07-13 10:47:47,719 INFO Thread-11 :342403 [upload_job.py:push():137] Uploaded file /tmp/tmpta17r5ywwandb/1f1555en-wandb-metadata.json
29
+ 2021-07-13 10:47:48,042 INFO Thread-8 :342403 [dir_watcher.py:_on_file_created():216] file/dir created: /home/dat/pino-roberta-base/wandb/run-20210713_104745-1rl2j7or/files/wandb-metadata.json
30
+ 2021-07-13 10:47:48,042 INFO Thread-8 :342403 [dir_watcher.py:_on_file_created():216] file/dir created: /home/dat/pino-roberta-base/wandb/run-20210713_104745-1rl2j7or/files/requirements.txt
31
+ 2021-07-13 10:47:48,042 INFO Thread-8 :342403 [dir_watcher.py:_on_file_created():216] file/dir created: /home/dat/pino-roberta-base/wandb/run-20210713_104745-1rl2j7or/files/output.log
32
+ 2021-07-13 10:48:02,047 INFO Thread-8 :342403 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_104745-1rl2j7or/files/output.log
33
+ 2021-07-13 10:48:02,398 DEBUG HandlerThread:342403 [handler.py:handle_request():124] handle_request: stop_status
34
+ 2021-07-13 10:48:02,398 DEBUG SenderThread:342403 [sender.py:send_request():193] send_request: stop_status
35
+ 2021-07-13 10:48:04,048 INFO Thread-8 :342403 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_104745-1rl2j7or/files/output.log
36
+ 2021-07-13 10:48:15,296 DEBUG SenderThread:342403 [sender.py:send():179] send: stats
37
+ 2021-07-13 10:48:17,054 INFO Thread-8 :342403 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_104745-1rl2j7or/files/config.yaml
38
+ 2021-07-13 10:48:17,555 DEBUG HandlerThread:342403 [handler.py:handle_request():124] handle_request: stop_status
39
+ 2021-07-13 10:48:17,556 DEBUG SenderThread:342403 [sender.py:send_request():193] send_request: stop_status
40
+ 2021-07-13 10:48:32,709 DEBUG HandlerThread:342403 [handler.py:handle_request():124] handle_request: stop_status
41
+ 2021-07-13 10:48:32,710 DEBUG SenderThread:342403 [sender.py:send_request():193] send_request: stop_status
42
+ 2021-07-13 10:48:45,371 DEBUG SenderThread:342403 [sender.py:send():179] send: stats
43
+ 2021-07-13 10:48:47,840 DEBUG HandlerThread:342403 [handler.py:handle_request():124] handle_request: stop_status
44
+ 2021-07-13 10:48:47,840 DEBUG SenderThread:342403 [sender.py:send_request():193] send_request: stop_status
45
+ 2021-07-13 10:49:02,980 DEBUG HandlerThread:342403 [handler.py:handle_request():124] handle_request: stop_status
46
+ 2021-07-13 10:49:02,980 DEBUG SenderThread:342403 [sender.py:send_request():193] send_request: stop_status
47
+ 2021-07-13 10:49:15,445 DEBUG SenderThread:342403 [sender.py:send():179] send: stats
48
+ 2021-07-13 10:49:18,113 DEBUG HandlerThread:342403 [handler.py:handle_request():124] handle_request: stop_status
49
+ 2021-07-13 10:49:18,113 DEBUG SenderThread:342403 [sender.py:send_request():193] send_request: stop_status
50
+ 2021-07-13 10:49:24,080 INFO Thread-8 :342403 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_104745-1rl2j7or/files/output.log
51
+ 2021-07-13 10:49:26,080 INFO Thread-8 :342403 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_104745-1rl2j7or/files/output.log
52
+ 2021-07-13 10:49:28,081 INFO Thread-8 :342403 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_104745-1rl2j7or/files/output.log
53
+ 2021-07-13 10:49:30,082 INFO Thread-8 :342403 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_104745-1rl2j7or/files/output.log
54
+ 2021-07-13 10:49:32,083 INFO Thread-8 :342403 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_104745-1rl2j7or/files/output.log
55
+ 2021-07-13 10:49:33,242 DEBUG HandlerThread:342403 [handler.py:handle_request():124] handle_request: stop_status
56
+ 2021-07-13 10:49:33,243 DEBUG SenderThread:342403 [sender.py:send_request():193] send_request: stop_status
57
+ 2021-07-13 10:49:34,084 INFO Thread-8 :342403 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_104745-1rl2j7or/files/output.log
58
+ 2021-07-13 10:49:36,084 INFO Thread-8 :342403 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_104745-1rl2j7or/files/output.log
59
+ 2021-07-13 10:49:45,514 DEBUG SenderThread:342403 [sender.py:send():179] send: stats
60
+ 2021-07-13 10:49:48,375 DEBUG HandlerThread:342403 [handler.py:handle_request():124] handle_request: stop_status
61
+ 2021-07-13 10:49:48,375 DEBUG SenderThread:342403 [sender.py:send_request():193] send_request: stop_status
62
+ 2021-07-13 10:49:58,179 DEBUG SenderThread:342403 [sender.py:send():179] send: history
63
+ 2021-07-13 10:49:58,180 DEBUG SenderThread:342403 [sender.py:send():179] send: summary
64
+ 2021-07-13 10:49:58,180 INFO SenderThread:342403 [sender.py:_save_file():841] saving file wandb-summary.json with policy end
65
+ 2021-07-13 10:49:59,093 INFO Thread-8 :342403 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_104745-1rl2j7or/files/wandb-summary.json
66
+ 2021-07-13 10:50:00,093 INFO Thread-8 :342403 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_104745-1rl2j7or/files/output.log
67
+ 2021-07-13 10:50:02,094 INFO Thread-8 :342403 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_104745-1rl2j7or/files/output.log
68
+ 2021-07-13 10:50:03,510 DEBUG HandlerThread:342403 [handler.py:handle_request():124] handle_request: stop_status
69
+ 2021-07-13 10:50:03,510 DEBUG SenderThread:342403 [sender.py:send_request():193] send_request: stop_status
70
+ 2021-07-13 10:50:04,095 INFO Thread-8 :342403 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_104745-1rl2j7or/files/output.log
71
+ 2021-07-13 10:50:15,583 DEBUG SenderThread:342403 [sender.py:send():179] send: stats
72
+ 2021-07-13 10:50:18,643 DEBUG HandlerThread:342403 [handler.py:handle_request():124] handle_request: stop_status
73
+ 2021-07-13 10:50:18,643 DEBUG SenderThread:342403 [sender.py:send_request():193] send_request: stop_status
74
+ 2021-07-13 10:50:24,102 INFO Thread-8 :342403 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_104745-1rl2j7or/files/output.log
75
+ 2021-07-13 10:50:28,758 DEBUG SenderThread:342403 [sender.py:send():179] send: history
76
+ 2021-07-13 10:50:28,759 DEBUG SenderThread:342403 [sender.py:send():179] send: summary
77
+ 2021-07-13 10:50:28,763 INFO SenderThread:342403 [sender.py:_save_file():841] saving file wandb-summary.json with policy end
78
+ 2021-07-13 10:50:29,104 INFO Thread-8 :342403 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_104745-1rl2j7or/files/wandb-summary.json
79
+ 2021-07-13 10:50:30,105 INFO Thread-8 :342403 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_104745-1rl2j7or/files/output.log
80
+ 2021-07-13 10:50:32,106 INFO Thread-8 :342403 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_104745-1rl2j7or/files/output.log
81
+ 2021-07-13 10:50:33,775 DEBUG HandlerThread:342403 [handler.py:handle_request():124] handle_request: stop_status
82
+ 2021-07-13 10:50:33,776 DEBUG SenderThread:342403 [sender.py:send_request():193] send_request: stop_status
83
+ 2021-07-13 10:50:34,107 INFO Thread-8 :342403 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_104745-1rl2j7or/files/output.log
84
+ 2021-07-13 10:50:36,107 INFO Thread-8 :342403 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_104745-1rl2j7or/files/output.log
85
+ 2021-07-13 10:50:38,108 INFO Thread-8 :342403 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_104745-1rl2j7or/files/output.log
86
+ 2021-07-13 10:50:40,109 INFO Thread-8 :342403 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_104745-1rl2j7or/files/output.log
87
+ 2021-07-13 10:50:42,110 INFO Thread-8 :342403 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_104745-1rl2j7or/files/output.log
88
+ 2021-07-13 10:50:45,653 DEBUG SenderThread:342403 [sender.py:send():179] send: stats
89
+ 2021-07-13 10:50:48,905 DEBUG HandlerThread:342403 [handler.py:handle_request():124] handle_request: stop_status
90
+ 2021-07-13 10:50:48,906 DEBUG SenderThread:342403 [sender.py:send_request():193] send_request: stop_status
91
+ 2021-07-13 10:51:04,035 DEBUG HandlerThread:342403 [handler.py:handle_request():124] handle_request: stop_status
92
+ 2021-07-13 10:51:04,035 DEBUG SenderThread:342403 [sender.py:send_request():193] send_request: stop_status
93
+ 2021-07-13 10:51:04,964 DEBUG SenderThread:342403 [sender.py:send():179] send: history
94
+ 2021-07-13 10:51:04,964 DEBUG SenderThread:342403 [sender.py:send():179] send: summary
95
+ 2021-07-13 10:51:04,964 INFO SenderThread:342403 [sender.py:_save_file():841] saving file wandb-summary.json with policy end
96
+ 2021-07-13 10:51:05,119 INFO Thread-8 :342403 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_104745-1rl2j7or/files/wandb-summary.json
97
+ 2021-07-13 10:51:06,119 INFO Thread-8 :342403 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_104745-1rl2j7or/files/output.log
98
+ 2021-07-13 10:51:08,120 INFO Thread-8 :342403 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_104745-1rl2j7or/files/output.log
99
+ 2021-07-13 10:51:15,726 DEBUG SenderThread:342403 [sender.py:send():179] send: stats
100
+ 2021-07-13 10:51:19,168 DEBUG HandlerThread:342403 [handler.py:handle_request():124] handle_request: stop_status
101
+ 2021-07-13 10:51:19,168 DEBUG SenderThread:342403 [sender.py:send_request():193] send_request: stop_status
102
+ 2021-07-13 10:51:24,126 INFO Thread-8 :342403 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_104745-1rl2j7or/files/output.log
103
+ 2021-07-13 10:51:26,127 INFO Thread-8 :342403 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_104745-1rl2j7or/files/output.log
104
+ 2021-07-13 10:51:34,303 DEBUG HandlerThread:342403 [handler.py:handle_request():124] handle_request: stop_status
105
+ 2021-07-13 10:51:34,303 DEBUG SenderThread:342403 [sender.py:send_request():193] send_request: stop_status
106
+ 2021-07-13 10:51:35,557 DEBUG SenderThread:342403 [sender.py:send():179] send: history
107
+ 2021-07-13 10:51:35,558 DEBUG SenderThread:342403 [sender.py:send():179] send: summary
108
+ 2021-07-13 10:51:35,558 INFO SenderThread:342403 [sender.py:_save_file():841] saving file wandb-summary.json with policy end
109
+ 2021-07-13 10:51:36,131 INFO Thread-8 :342403 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_104745-1rl2j7or/files/wandb-summary.json
110
+ 2021-07-13 10:51:36,132 INFO Thread-8 :342403 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_104745-1rl2j7or/files/output.log
111
+ 2021-07-13 10:51:38,132 INFO Thread-8 :342403 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_104745-1rl2j7or/files/output.log
112
+ 2021-07-13 10:51:40,133 INFO Thread-8 :342403 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_104745-1rl2j7or/files/output.log
113
+ 2021-07-13 10:51:42,134 INFO Thread-8 :342403 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_104745-1rl2j7or/files/output.log
114
+ 2021-07-13 10:51:44,135 INFO Thread-8 :342403 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_104745-1rl2j7or/files/output.log
115
+ 2021-07-13 10:51:45,797 DEBUG SenderThread:342403 [sender.py:send():179] send: stats
116
+ 2021-07-13 10:51:46,136 INFO Thread-8 :342403 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_104745-1rl2j7or/files/output.log
117
+ 2021-07-13 10:51:48,137 INFO Thread-8 :342403 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_104745-1rl2j7or/files/output.log
118
+ 2021-07-13 10:51:49,438 DEBUG HandlerThread:342403 [handler.py:handle_request():124] handle_request: stop_status
119
+ 2021-07-13 10:51:49,438 DEBUG SenderThread:342403 [sender.py:send_request():193] send_request: stop_status
120
+ 2021-07-13 10:51:50,137 INFO Thread-8 :342403 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_104745-1rl2j7or/files/output.log
121
+ 2021-07-13 10:52:04,579 DEBUG HandlerThread:342403 [handler.py:handle_request():124] handle_request: stop_status
122
+ 2021-07-13 10:52:04,580 DEBUG SenderThread:342403 [sender.py:send_request():193] send_request: stop_status
123
+ 2021-07-13 10:52:11,761 DEBUG SenderThread:342403 [sender.py:send():179] send: history
124
+ 2021-07-13 10:52:11,762 DEBUG SenderThread:342403 [sender.py:send():179] send: summary
125
+ 2021-07-13 10:52:11,763 INFO SenderThread:342403 [sender.py:_save_file():841] saving file wandb-summary.json with policy end
126
+ 2021-07-13 10:52:12,146 INFO Thread-8 :342403 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_104745-1rl2j7or/files/wandb-summary.json
127
+ 2021-07-13 10:52:14,147 INFO Thread-8 :342403 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_104745-1rl2j7or/files/output.log
128
+ 2021-07-13 10:52:15,867 DEBUG SenderThread:342403 [sender.py:send():179] send: stats
129
+ 2021-07-13 10:52:19,709 DEBUG HandlerThread:342403 [handler.py:handle_request():124] handle_request: stop_status
130
+ 2021-07-13 10:52:19,710 DEBUG SenderThread:342403 [sender.py:send_request():193] send_request: stop_status
131
+ 2021-07-13 10:52:24,150 INFO Thread-8 :342403 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_104745-1rl2j7or/files/output.log
132
+ 2021-07-13 10:52:26,151 INFO Thread-8 :342403 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_104745-1rl2j7or/files/output.log
133
+ 2021-07-13 10:52:34,838 DEBUG HandlerThread:342403 [handler.py:handle_request():124] handle_request: stop_status
134
+ 2021-07-13 10:52:34,839 DEBUG SenderThread:342403 [sender.py:send_request():193] send_request: stop_status
135
+ 2021-07-13 10:52:42,378 DEBUG SenderThread:342403 [sender.py:send():179] send: history
136
+ 2021-07-13 10:52:42,378 DEBUG SenderThread:342403 [sender.py:send():179] send: summary
137
+ 2021-07-13 10:52:42,379 INFO SenderThread:342403 [sender.py:_save_file():841] saving file wandb-summary.json with policy end
138
+ 2021-07-13 10:52:43,158 INFO Thread-8 :342403 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_104745-1rl2j7or/files/wandb-summary.json
139
+ 2021-07-13 10:52:45,159 INFO Thread-8 :342403 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_104745-1rl2j7or/files/output.log
140
+ 2021-07-13 10:52:45,939 DEBUG SenderThread:342403 [sender.py:send():179] send: stats
141
+ 2021-07-13 10:52:47,160 INFO Thread-8 :342403 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_104745-1rl2j7or/files/output.log
142
+ 2021-07-13 10:52:49,161 INFO Thread-8 :342403 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_104745-1rl2j7or/files/output.log
143
+ 2021-07-13 10:52:49,969 DEBUG HandlerThread:342403 [handler.py:handle_request():124] handle_request: stop_status
144
+ 2021-07-13 10:52:49,970 DEBUG SenderThread:342403 [sender.py:send_request():193] send_request: stop_status
145
+ 2021-07-13 10:52:51,161 INFO Thread-8 :342403 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_104745-1rl2j7or/files/output.log
146
+ 2021-07-13 10:52:53,162 INFO Thread-8 :342403 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_104745-1rl2j7or/files/output.log
147
+ 2021-07-13 10:52:55,163 INFO Thread-8 :342403 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_104745-1rl2j7or/files/output.log
148
+ 2021-07-13 10:52:57,164 INFO Thread-8 :342403 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_104745-1rl2j7or/files/output.log
149
+ 2021-07-13 10:53:05,101 DEBUG HandlerThread:342403 [handler.py:handle_request():124] handle_request: stop_status
150
+ 2021-07-13 10:53:05,101 DEBUG SenderThread:342403 [sender.py:send_request():193] send_request: stop_status
151
+ 2021-07-13 10:53:16,014 DEBUG SenderThread:342403 [sender.py:send():179] send: stats
152
+ 2021-07-13 10:53:18,580 DEBUG SenderThread:342403 [sender.py:send():179] send: history
153
+ 2021-07-13 10:53:18,580 DEBUG SenderThread:342403 [sender.py:send():179] send: summary
154
+ 2021-07-13 10:53:18,580 INFO SenderThread:342403 [sender.py:_save_file():841] saving file wandb-summary.json with policy end
155
+ 2021-07-13 10:53:19,173 INFO Thread-8 :342403 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_104745-1rl2j7or/files/wandb-summary.json
156
+ 2021-07-13 10:53:20,233 DEBUG HandlerThread:342403 [handler.py:handle_request():124] handle_request: stop_status
157
+ 2021-07-13 10:53:20,234 DEBUG SenderThread:342403 [sender.py:send_request():193] send_request: stop_status
158
+ 2021-07-13 10:53:21,173 INFO Thread-8 :342403 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_104745-1rl2j7or/files/output.log
159
+ 2021-07-13 10:53:25,175 INFO Thread-8 :342403 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_104745-1rl2j7or/files/output.log
160
+ 2021-07-13 10:53:27,176 INFO Thread-8 :342403 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_104745-1rl2j7or/files/output.log
161
+ 2021-07-13 10:53:29,177 INFO Thread-8 :342403 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_104745-1rl2j7or/files/output.log
162
+ 2021-07-13 10:53:34,237 WARNING MainThread:342403 [internal.py:wandb_internal():147] Internal process interrupt: 1
163
+ 2021-07-13 10:53:34,484 WARNING MainThread:342403 [internal.py:wandb_internal():147] Internal process interrupt: 2
164
+ 2021-07-13 10:53:34,484 ERROR MainThread:342403 [internal.py:wandb_internal():150] Internal process interrupted.
165
+ 2021-07-13 10:53:35,385 INFO WriterThread:342403 [datastore.py:close():288] close: /home/dat/pino-roberta-base/wandb/run-20210713_104745-1rl2j7or/run-1rl2j7or.wandb
166
+ 2021-07-13 10:53:35,409 INFO SenderThread:342403 [sender.py:finish():945] shutting down sender
167
+ 2021-07-13 10:53:35,409 INFO SenderThread:342403 [dir_watcher.py:finish():282] shutting down directory watcher
168
+ 2021-07-13 10:53:35,414 INFO HandlerThread:342403 [handler.py:finish():638] shutting down handler
169
+ 2021-07-13 10:53:36,180 INFO SenderThread:342403 [dir_watcher.py:finish():312] scan: /home/dat/pino-roberta-base/wandb/run-20210713_104745-1rl2j7or/files
170
+ 2021-07-13 10:53:36,180 INFO SenderThread:342403 [dir_watcher.py:finish():318] scan save: /home/dat/pino-roberta-base/wandb/run-20210713_104745-1rl2j7or/files/requirements.txt requirements.txt
171
+ 2021-07-13 10:53:36,180 INFO SenderThread:342403 [dir_watcher.py:finish():318] scan save: /home/dat/pino-roberta-base/wandb/run-20210713_104745-1rl2j7or/files/output.log output.log
172
+ 2021-07-13 10:53:36,180 INFO SenderThread:342403 [dir_watcher.py:finish():318] scan save: /home/dat/pino-roberta-base/wandb/run-20210713_104745-1rl2j7or/files/wandb-metadata.json wandb-metadata.json
173
+ 2021-07-13 10:53:36,180 INFO SenderThread:342403 [dir_watcher.py:finish():318] scan save: /home/dat/pino-roberta-base/wandb/run-20210713_104745-1rl2j7or/files/config.yaml config.yaml
174
+ 2021-07-13 10:53:36,181 INFO SenderThread:342403 [dir_watcher.py:finish():318] scan save: /home/dat/pino-roberta-base/wandb/run-20210713_104745-1rl2j7or/files/wandb-summary.json wandb-summary.json
175
+ 2021-07-13 10:53:36,181 INFO SenderThread:342403 [file_pusher.py:finish():177] shutting down file pusher
176
+ 2021-07-13 10:53:36,181 INFO SenderThread:342403 [file_pusher.py:join():182] waiting for file pusher
177
+ 2021-07-13 10:53:36,622 INFO Thread-14 :342403 [upload_job.py:push():137] Uploaded file /home/dat/pino-roberta-base/wandb/run-20210713_104745-1rl2j7or/files/config.yaml
178
+ 2021-07-13 10:53:36,624 INFO Thread-15 :342403 [upload_job.py:push():137] Uploaded file /home/dat/pino-roberta-base/wandb/run-20210713_104745-1rl2j7or/files/wandb-summary.json
179
+ 2021-07-13 10:53:36,634 INFO Thread-13 :342403 [upload_job.py:push():137] Uploaded file /home/dat/pino-roberta-base/wandb/run-20210713_104745-1rl2j7or/files/output.log
180
+ 2021-07-13 10:53:36,654 INFO Thread-12 :342403 [upload_job.py:push():137] Uploaded file /home/dat/pino-roberta-base/wandb/run-20210713_104745-1rl2j7or/files/requirements.txt
181
+ 2021-07-13 10:53:37,518 INFO MainThread:342403 [internal.py:handle_exit():78] Internal process exited
wandb/run-20210713_104745-1rl2j7or/logs/debug.log ADDED
@@ -0,0 +1,27 @@
1
+ 2021-07-13 10:47:45,130 INFO MainThread:340852 [wandb_setup.py:_flush():69] setting env: {}
2
+ 2021-07-13 10:47:45,130 INFO MainThread:340852 [wandb_setup.py:_flush():69] setting login settings: {}
3
+ 2021-07-13 10:47:45,130 INFO MainThread:340852 [wandb_init.py:_log_setup():337] Logging user logs to /home/dat/pino-roberta-base/wandb/run-20210713_104745-1rl2j7or/logs/debug.log
4
+ 2021-07-13 10:47:45,130 INFO MainThread:340852 [wandb_init.py:_log_setup():338] Logging internal logs to /home/dat/pino-roberta-base/wandb/run-20210713_104745-1rl2j7or/logs/debug-internal.log
5
+ 2021-07-13 10:47:45,131 INFO MainThread:340852 [wandb_init.py:init():370] calling init triggers
6
+ 2021-07-13 10:47:45,131 INFO MainThread:340852 [wandb_init.py:init():375] wandb.init called with sweep_config: {}
7
+ config: {}
8
+ 2021-07-13 10:47:45,131 INFO MainThread:340852 [wandb_init.py:init():419] starting backend
9
+ 2021-07-13 10:47:45,131 INFO MainThread:340852 [backend.py:_multiprocessing_setup():70] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
10
+ 2021-07-13 10:47:45,179 INFO MainThread:340852 [backend.py:ensure_launched():135] starting backend process...
11
+ 2021-07-13 10:47:45,225 INFO MainThread:340852 [backend.py:ensure_launched():139] started backend process with pid: 342403
12
+ 2021-07-13 10:47:45,228 INFO MainThread:340852 [wandb_init.py:init():424] backend started and connected
13
+ 2021-07-13 10:47:45,231 INFO MainThread:340852 [wandb_init.py:init():472] updated telemetry
14
+ 2021-07-13 10:47:45,231 INFO MainThread:340852 [wandb_init.py:init():491] communicating current version
15
+ 2021-07-13 10:47:45,870 INFO MainThread:340852 [wandb_init.py:init():496] got version response
16
+ 2021-07-13 10:47:45,870 INFO MainThread:340852 [wandb_init.py:init():504] communicating run to backend with 30 second timeout
17
+ 2021-07-13 10:47:46,040 INFO MainThread:340852 [wandb_init.py:init():529] starting run threads in backend
18
+ 2021-07-13 10:47:47,259 INFO MainThread:340852 [wandb_run.py:_console_start():1623] atexit reg
19
+ 2021-07-13 10:47:47,260 INFO MainThread:340852 [wandb_run.py:_redirect():1497] redirect: SettingsConsole.REDIRECT
20
+ 2021-07-13 10:47:47,261 INFO MainThread:340852 [wandb_run.py:_redirect():1502] Redirecting console.
21
+ 2021-07-13 10:47:47,262 INFO MainThread:340852 [wandb_run.py:_redirect():1558] Redirects installed.
22
+ 2021-07-13 10:47:47,262 INFO MainThread:340852 [wandb_init.py:init():554] run started, returning control to user process
23
+ 2021-07-13 10:47:47,268 INFO MainThread:340852 [wandb_run.py:_config_callback():872] config_cb None None {'output_dir': './', 'overwrite_output_dir': True, 'do_train': False, 'do_eval': False, 'do_predict': False, 'evaluation_strategy': 'IntervalStrategy.NO', 'prediction_loss_only': False, 'per_device_train_batch_size': 2, 'per_device_eval_batch_size': 2, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 2, 'eval_accumulation_steps': None, 'learning_rate': 5e-05, 'weight_decay': 0.0095, 'adam_beta1': 0.9, 'adam_beta2': 0.98, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 5.0, 'max_steps': -1, 'lr_scheduler_type': 'SchedulerType.LINEAR', 'warmup_ratio': 0.0, 'warmup_steps': 10, 'log_level': -1, 'log_level_replica': -1, 'log_on_each_node': True, 'logging_dir': './runs/Jul13_10-47-16_t1v-n-f5c06ea1-w-0', 'logging_strategy': 'IntervalStrategy.STEPS', 'logging_first_step': False, 'logging_steps': 50, 'save_strategy': 'IntervalStrategy.STEPS', 'save_steps': 20000, 'save_total_limit': 5, 'save_on_each_node': False, 'no_cuda': False, 'seed': 42, 'fp16': False, 'fp16_opt_level': 'O1', 'fp16_backend': 'auto', 'fp16_full_eval': False, 'local_rank': -1, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 100001, 'dataloader_num_workers': 0, 'past_index': -1, 'run_name': './', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'sharded_ddp': [], 'deepspeed': None, 'label_smoothing_factor': 0.0, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['tensorboard', 'wandb'], 'ddp_find_unused_parameters': None, 'dataloader_pin_memory': True, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': True, 'resume_from_checkpoint': None, 'push_to_hub_model_id': '', 'push_to_hub_organization': None, 'push_to_hub_token': None, 'mp_parameters': ''}
24
+ 2021-07-13 10:47:47,270 INFO MainThread:340852 [wandb_run.py:_config_callback():872] config_cb None None {'model_name_or_path': None, 'model_type': 'big_bird', 'config_name': './', 'tokenizer_name': './', 'cache_dir': None, 'use_fast_tokenizer': True, 'dtype': 'float32'}
25
+ 2021-07-13 10:47:47,271 INFO MainThread:340852 [wandb_run.py:_config_callback():872] config_cb None None {'dataset_name': None, 'dataset_config_name': None, 'train_file': None, 'validation_file': None, 'train_ref_file': None, 'validation_ref_file': None, 'overwrite_cache': False, 'validation_split_percentage': 5, 'max_seq_length': 4096, 'preprocessing_num_workers': 64, 'mlm_probability': 0.15, 'pad_to_max_length': False, 'line_by_line': False}
26
+ 2021-07-13 10:53:34,760 INFO MainThread:340852 [wandb_run.py:_atexit_cleanup():1593] got exitcode: 255
27
+ 2021-07-13 10:53:34,761 INFO MainThread:340852 [wandb_run.py:_restore():1565] restore
wandb/run-20210713_104745-1rl2j7or/run-1rl2j7or.wandb ADDED
Binary file (14.8 kB)
wandb/run-20210713_110212-594z6oo0/files/config.yaml ADDED
@@ -0,0 +1,307 @@
1
+ wandb_version: 1
2
+
3
+ _wandb:
4
+ desc: null
5
+ value:
6
+ cli_version: 0.10.33
7
+ framework: huggingface
8
+ huggingface_version: 4.9.0.dev0
9
+ is_jupyter_run: false
10
+ is_kaggle_kernel: false
11
+ python_version: 3.8.10
12
+ t:
13
+ 1:
14
+ - 3
15
+ - 11
16
+ 2:
17
+ - 3
18
+ - 11
19
+ 4: 3.8.10
20
+ 5: 0.10.33
21
+ 6: 4.9.0.dev0
22
+ 8:
23
+ - 5
24
+ adafactor:
25
+ desc: null
26
+ value: false
27
+ adam_beta1:
28
+ desc: null
29
+ value: 0.9
30
+ adam_beta2:
31
+ desc: null
32
+ value: 0.98
33
+ adam_epsilon:
34
+ desc: null
35
+ value: 1.0e-08
36
+ cache_dir:
37
+ desc: null
38
+ value: null
39
+ config_name:
40
+ desc: null
41
+ value: ./
42
+ dataloader_drop_last:
43
+ desc: null
44
+ value: false
45
+ dataloader_num_workers:
46
+ desc: null
47
+ value: 0
48
+ dataloader_pin_memory:
49
+ desc: null
50
+ value: true
51
+ dataset_config_name:
52
+ desc: null
53
+ value: null
54
+ dataset_name:
55
+ desc: null
56
+ value: null
57
+ ddp_find_unused_parameters:
58
+ desc: null
59
+ value: null
60
+ debug:
61
+ desc: null
62
+ value: []
63
+ deepspeed:
64
+ desc: null
65
+ value: null
66
+ disable_tqdm:
67
+ desc: null
68
+ value: false
69
+ do_eval:
70
+ desc: null
71
+ value: false
72
+ do_predict:
73
+ desc: null
74
+ value: false
75
+ do_train:
76
+ desc: null
77
+ value: false
78
+ dtype:
79
+ desc: null
80
+ value: float32
81
+ eval_accumulation_steps:
82
+ desc: null
83
+ value: null
84
+ eval_steps:
85
+ desc: null
86
+ value: 100001
87
+ evaluation_strategy:
88
+ desc: null
89
+ value: IntervalStrategy.NO
90
+ fp16:
91
+ desc: null
92
+ value: false
93
+ fp16_backend:
94
+ desc: null
95
+ value: auto
96
+ fp16_full_eval:
97
+ desc: null
98
+ value: false
99
+ fp16_opt_level:
100
+ desc: null
101
+ value: O1
102
+ gradient_accumulation_steps:
103
+ desc: null
104
+ value: 2
105
+ greater_is_better:
106
+ desc: null
107
+   value: null
+ group_by_length:
+   desc: null
+   value: false
+ ignore_data_skip:
+   desc: null
+   value: false
+ label_names:
+   desc: null
+   value: null
+ label_smoothing_factor:
+   desc: null
+   value: 0.0
+ learning_rate:
+   desc: null
+   value: 5.0e-05
+ length_column_name:
+   desc: null
+   value: length
+ line_by_line:
+   desc: null
+   value: false
+ load_best_model_at_end:
+   desc: null
+   value: false
+ local_rank:
+   desc: null
+   value: -1
+ log_level:
+   desc: null
+   value: -1
+ log_level_replica:
+   desc: null
+   value: -1
+ log_on_each_node:
+   desc: null
+   value: true
+ logging_dir:
+   desc: null
+   value: ./runs/Jul13_11-01-24_t1v-n-f5c06ea1-w-0
+ logging_first_step:
+   desc: null
+   value: false
+ logging_steps:
+   desc: null
+   value: 500
+ logging_strategy:
+   desc: null
+   value: IntervalStrategy.STEPS
+ lr_scheduler_type:
+   desc: null
+   value: SchedulerType.LINEAR
+ max_grad_norm:
+   desc: null
+   value: 1.0
+ max_seq_length:
+   desc: null
+   value: 4096
+ max_steps:
+   desc: null
+   value: -1
+ metric_for_best_model:
+   desc: null
+   value: null
+ mlm_probability:
+   desc: null
+   value: 0.15
+ model_name_or_path:
+   desc: null
+   value: null
+ model_type:
+   desc: null
+   value: big_bird
+ mp_parameters:
+   desc: null
+   value: ''
+ no_cuda:
+   desc: null
+   value: false
+ num_train_epochs:
+   desc: null
+   value: 5.0
+ output_dir:
+   desc: null
+   value: ./
+ overwrite_cache:
+   desc: null
+   value: false
+ overwrite_output_dir:
+   desc: null
+   value: true
+ pad_to_max_length:
+   desc: null
+   value: false
+ past_index:
+   desc: null
+   value: -1
+ per_device_eval_batch_size:
+   desc: null
+   value: 2
+ per_device_train_batch_size:
+   desc: null
+   value: 2
+ per_gpu_eval_batch_size:
+   desc: null
+   value: null
+ per_gpu_train_batch_size:
+   desc: null
+   value: null
+ prediction_loss_only:
+   desc: null
+   value: false
+ preprocessing_num_workers:
+   desc: null
+   value: 64
+ push_to_hub:
+   desc: null
+   value: true
+ push_to_hub_model_id:
+   desc: null
+   value: ''
+ push_to_hub_organization:
+   desc: null
+   value: null
+ push_to_hub_token:
+   desc: null
+   value: null
+ remove_unused_columns:
+   desc: null
+   value: true
+ report_to:
+   desc: null
+   value:
+   - tensorboard
+   - wandb
+ resume_from_checkpoint:
+   desc: null
+   value: null
+ run_name:
+   desc: null
+   value: ./
+ save_on_each_node:
+   desc: null
+   value: false
+ save_steps:
+   desc: null
+   value: 20000
+ save_strategy:
+   desc: null
+   value: IntervalStrategy.STEPS
+ save_total_limit:
+   desc: null
+   value: 5
+ seed:
+   desc: null
+   value: 42
+ sharded_ddp:
+   desc: null
+   value: []
+ skip_memory_metrics:
+   desc: null
+   value: true
+ tokenizer_name:
+   desc: null
+   value: ./
+ tpu_metrics_debug:
+   desc: null
+   value: false
+ tpu_num_cores:
+   desc: null
+   value: null
+ train_file:
+   desc: null
+   value: null
+ train_ref_file:
+   desc: null
+   value: null
+ use_fast_tokenizer:
+   desc: null
+   value: true
+ use_legacy_prediction_loop:
+   desc: null
+   value: false
+ validation_file:
+   desc: null
+   value: null
+ validation_ref_file:
+   desc: null
+   value: null
+ validation_split_percentage:
+   desc: null
+   value: 5
+ warmup_ratio:
+   desc: null
+   value: 0.0
+ warmup_steps:
+   desc: null
+   value: 10
+ weight_decay:
+   desc: null
+   value: 0.0095
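
The config block above is W&B's capture of this run's training arguments. As a minimal, hypothetical sketch (not the repo's actual launch code), the logged values map onto `transformers.TrainingArguments` roughly as shown below; entries such as max_seq_length, mlm_probability, model_type and preprocessing_num_workers belong to the script's own model/data argument dataclasses rather than to TrainingArguments.

```python
# Hedged sketch: reconstructing the logged trainer settings as TrainingArguments.
# Values are copied from the wandb config above; this is illustrative only and
# is not necessarily how run_mlm_flax.py builds its arguments.
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./",
    overwrite_output_dir=True,
    num_train_epochs=5.0,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    learning_rate=5e-05,
    weight_decay=0.0095,
    warmup_steps=10,
    logging_steps=500,
    save_steps=20000,
    save_total_limit=5,
    seed=42,
    push_to_hub=True,
    report_to=["tensorboard", "wandb"],
)
```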
wandb/run-20210713_110212-594z6oo0/files/output.log ADDED
@@ -0,0 +1,39 @@
+ /home/dat/pino/lib/python3.8/site-packages/jax/_src/numpy/lax_numpy.py:3114: UserWarning: Explicitly requested dtype <class 'jax._src.numpy.lax_numpy.int64'> requested in zeros is not available, and will be truncated to dtype int32. To enable more dtypes, set the jax_enable_x64 configuration option or the JAX_ENABLE_X64 shell environment variable. See https://github.com/google/jax#current-gotchas for more.
+   lax._check_user_dtype_supported(dtype, "zeros")
+ /home/dat/pino/lib/python3.8/site-packages/jax/lib/xla_bridge.py:382: UserWarning: jax.host_count has been renamed to jax.process_count. This alias will eventually be removed; please update your code.
+   warnings.warn(
+ /home/dat/pino/lib/python3.8/site-packages/jax/lib/xla_bridge.py:369: UserWarning: jax.host_id has been renamed to jax.process_index. This alias will eventually be removed; please update your code.
+   warnings.warn(
+ Epoch ... (1/5): 0%| | 0/5 [00:00<?, ?it/s]
+ Training...: 0%| | 0/92767 [01:25<?, ?it/s]
+ Epoch ... (1/5): 0%| | 0/5 [02:57<?, ?it/s]
+ Traceback (most recent call last):
+   File "./run_mlm_flax.py", line 712, in <module>
+     state, train_metric, dropout_rngs = p_train_step(state, model_inputs, dropout_rngs)
+   File "/home/dat/pino/lib/python3.8/site-packages/jax/_src/traceback_util.py", line 183, in reraise_with_filtered_traceback
+     return fun(*args, **kwargs)
+   File "/home/dat/pino/lib/python3.8/site-packages/jax/_src/api.py", line 1647, in f_pmapped
+     out = pxla.xla_pmap(
+   File "/home/dat/pino/lib/python3.8/site-packages/jax/core.py", line 1620, in bind
+     return call_bind(self, fun, *args, **params)
+   File "/home/dat/pino/lib/python3.8/site-packages/jax/core.py", line 1551, in call_bind
+     outs = primitive.process(top_trace, fun, tracers, params)
+   File "/home/dat/pino/lib/python3.8/site-packages/jax/core.py", line 1623, in process
+     return trace.process_map(self, fun, tracers, params)
+   File "/home/dat/pino/lib/python3.8/site-packages/jax/core.py", line 606, in process_call
+     return primitive.impl(f, *tracers, **params)
+   File "/home/dat/pino/lib/python3.8/site-packages/jax/interpreters/pxla.py", line 637, in xla_pmap_impl
+     return compiled_fun(*args)
+   File "/home/dat/pino/lib/python3.8/site-packages/jax/interpreters/pxla.py", line 1152, in execute_replicated
+     out_bufs = compiled.execute_sharded_on_local_devices(input_bufs)
+ jax._src.traceback_util.UnfilteredStackTrace: RuntimeError: Resource exhausted: Attempting to reserve 12.60G at the bottom of memory. That was not possible. There are 12.15G free, 0B reserved, and 12.13G reservable.: while running replica 0 and partition 0 of a replicated computation (other replicas may have failed as well).
+ The stack trace below excludes JAX-internal frames.
+ The preceding is the original exception that occurred, unmodified.
+ --------------------
+ The above exception was the direct cause of the following exception:
+ Traceback (most recent call last):
+   File "./run_mlm_flax.py", line 712, in <module>
+     state, train_metric, dropout_rngs = p_train_step(state, model_inputs, dropout_rngs)
+   File "/home/dat/pino/lib/python3.8/site-packages/jax/interpreters/pxla.py", line 1152, in execute_replicated
+     out_bufs = compiled.execute_sharded_on_local_devices(input_bufs)
+ RuntimeError: Resource exhausted: Attempting to reserve 12.60G at the bottom of memory. That was not possible. There are 12.15G free, 0B reserved, and 12.13G reservable.: while running replica 0 and partition 0 of a replicated computation (other replicas may have failed as well).
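
The run fails at the first pmapped training step with a TPU out-of-memory error: each device tries to reserve 12.60G with only about 12.15G reservable, which is plausible for a big_bird MLM at max_seq_length 4096 with per_device_train_batch_size 2. Two common mitigations are lowering the per-device batch size and accumulating gradients over micro-batches so that only one micro-batch's activations are live at a time. Below is a minimal, self-contained JAX sketch of micro-batch gradient accumulation with `lax.scan`; the names (`loss_fn`, `params`, `batch`) are hypothetical stand-ins, not the script's actual train step.

```python
# Hedged sketch: gradient accumulation over micro-batches to cut peak memory.
# A toy regression loss stands in for the real MLM loss; nothing here is taken
# from run_mlm_flax.py.
import jax
import jax.numpy as jnp


def loss_fn(params, x, y):
    # Toy loss standing in for the masked-LM loss.
    pred = x @ params["w"] + params["b"]
    return jnp.mean((pred - y) ** 2)


def accumulated_grads(params, batch, num_micro_batches):
    # Split the per-device batch into micro-batches along the leading axis.
    micro = jax.tree_util.tree_map(
        lambda t: t.reshape((num_micro_batches, -1) + t.shape[1:]), batch)

    def step(acc, mb):
        grads = jax.grad(loss_fn)(params, mb["x"], mb["y"])
        # Keep a running sum; only one micro-batch is materialized at a time.
        return jax.tree_util.tree_map(jnp.add, acc, grads), None

    zeros = jax.tree_util.tree_map(jnp.zeros_like, params)
    summed, _ = jax.lax.scan(step, zeros, micro)
    return jax.tree_util.tree_map(lambda g: g / num_micro_batches, summed)


if __name__ == "__main__":
    key = jax.random.PRNGKey(0)
    params = {"w": jnp.ones((8, 1)), "b": jnp.zeros((1,))}
    batch = {"x": jax.random.normal(key, (16, 8)), "y": jnp.ones((16, 1))}
    grads = jax.jit(accumulated_grads, static_argnums=2)(params, batch, 4)
    print(jax.tree_util.tree_map(jnp.shape, grads))
```

In a pmapped setup the same pattern applies per device: the effective batch size stays per-device batch × micro-batch count × device count, while peak activation memory scales only with the micro-batch.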