dat committed on
Commit
9bd1fec
1 Parent(s): 4229c91
Files changed (33)
  1. config.json +1 -1
  2. events.out.tfevents.1626137349.t1v-n-f5c06ea1-w-0.323744.3.v2 +3 -0
  3. events.out.tfevents.1626137580.t1v-n-f5c06ea1-w-0.325900.3.v2 +3 -0
  4. events.out.tfevents.1626137871.t1v-n-f5c06ea1-w-0.327810.3.v2 +3 -0
  5. run.sh +3 -2
  6. run_mlm_flax.py +10 -10
  7. wandb/debug-internal.log +1 -1
  8. wandb/debug.log +1 -1
  9. wandb/latest-run +1 -1
  10. wandb/run-20210713_004910-3mu9pog5/files/config.yaml +307 -0
  11. wandb/run-20210713_004910-3mu9pog5/files/output.log +376 -0
  12. wandb/run-20210713_004910-3mu9pog5/files/requirements.txt +92 -0
  13. wandb/run-20210713_004910-3mu9pog5/files/wandb-metadata.json +46 -0
  14. wandb/run-20210713_004910-3mu9pog5/files/wandb-summary.json +1 -0
  15. wandb/run-20210713_004910-3mu9pog5/logs/debug-internal.log +166 -0
  16. wandb/run-20210713_004910-3mu9pog5/logs/debug.log +119 -0
  17. wandb/run-20210713_004910-3mu9pog5/run-3mu9pog5.wandb +0 -0
  18. wandb/run-20210713_005301-2ilkub1o/files/config.yaml +307 -0
  19. wandb/run-20210713_005301-2ilkub1o/files/output.log +376 -0
  20. wandb/run-20210713_005301-2ilkub1o/files/requirements.txt +92 -0
  21. wandb/run-20210713_005301-2ilkub1o/files/wandb-metadata.json +46 -0
  22. wandb/run-20210713_005301-2ilkub1o/files/wandb-summary.json +1 -0
  23. wandb/run-20210713_005301-2ilkub1o/logs/debug-internal.log +168 -0
  24. wandb/run-20210713_005301-2ilkub1o/logs/debug.log +127 -0
  25. wandb/run-20210713_005301-2ilkub1o/run-2ilkub1o.wandb +0 -0
  26. wandb/run-20210713_005751-1wnn0lyf/files/config.yaml +304 -0
  27. wandb/run-20210713_005751-1wnn0lyf/files/output.log +216 -0
  28. wandb/run-20210713_005751-1wnn0lyf/files/requirements.txt +92 -0
  29. wandb/run-20210713_005751-1wnn0lyf/files/wandb-metadata.json +44 -0
  30. wandb/run-20210713_005751-1wnn0lyf/files/wandb-summary.json +1 -0
  31. wandb/run-20210713_005751-1wnn0lyf/logs/debug-internal.log +61 -0
  32. wandb/run-20210713_005751-1wnn0lyf/logs/debug.log +28 -0
  33. wandb/run-20210713_005751-1wnn0lyf/run-1wnn0lyf.wandb +0 -0
config.json CHANGED
@@ -4,7 +4,7 @@
  ],
  "attention_probs_dropout_prob": 0.1,
  "attention_type": "block_sparse",
- "block_size": 64,
+ "block_size": 128,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
events.out.tfevents.1626137349.t1v-n-f5c06ea1-w-0.323744.3.v2 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f159e4108512bc68b8363ca06b6026ff0844d045b08ba76516f2764b90277292
+ size 40
events.out.tfevents.1626137580.t1v-n-f5c06ea1-w-0.325900.3.v2 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:72867ca0c2d013977242562e1efa683ba957c1b4c3352c0547c72dcd0e611de8
+ size 40
events.out.tfevents.1626137871.t1v-n-f5c06ea1-w-0.327810.3.v2 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f4e6248f4d9c467b9b97ff9829c4847d2f568eaf3b4c6b79865519f1e98780a9
+ size 40
run.sh CHANGED
@@ -19,12 +19,13 @@ python ./run_mlm_flax.py \
  --num_train_epochs="5" \
  --preprocessing_num_workers="64" \
  --save_steps="20000" \
- --adafactor \
  --learning_rate="5e-5" \
  --per_device_train_batch_size="2" \
  --per_device_eval_batch_size="2" \
  --save_total_limit="5"\
  --dtype="bfloat16" \
+ #--adafactor \
+ #--gradient_accumulation_steps="8" \
  #--resume_from_checkpoint="./"\
- #--gradient_accumulation_steps="4" \
+
 
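
run.sh now drops --adafactor (presumably falling back to the script's AdamW optimizer) and keeps gradient accumulation commented out while running with a per-device batch size of 2. A rough, hedged check of the resulting effective batch size; the 8-device count (a single TPU v3-8 host) is an assumption, not something stated in the script:

# Back-of-the-envelope effective batch size under the new flags (assumed 8 devices).
per_device_train_batch_size = 2
device_count = 8                   # assumed value of jax.device_count() on a v3-8
gradient_accumulation_steps = 1    # the flag is commented out, so no accumulation
print(per_device_train_batch_size * device_count * gradient_accumulation_steps)  # 16 sequences per optimizer step
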
run_mlm_flax.py CHANGED
@@ -563,7 +563,7 @@ if __name__ == "__main__":
 
  # Store some constant
  num_epochs = int(training_args.num_train_epochs)
- train_batch_size = int(training_args.per_device_train_batch_size) * jax.device_count() * training_args.gradient_accumulation_steps
+ train_batch_size = int(training_args.per_device_train_batch_size) * jax.device_count() #* training_args.gradient_accumulation_steps
  eval_batch_size = int(training_args.per_device_eval_batch_size) * jax.device_count()
 
  num_train_steps = len(train_dataset) // train_batch_size * num_epochs
@@ -610,9 +610,9 @@ if __name__ == "__main__":
  mask=decay_mask_fn,
  )
 
- if training_args.gradient_accumulation_steps > 1:
- optimizer = optax.MultiSteps(optimizer, training_args.gradient_accumulation_steps)
- grad_accum_steps = training_args.gradient_accumulation_steps
+ #if training_args.gradient_accumulation_steps > 1:
+ # optimizer = optax.MultiSteps(optimizer, training_args.gradient_accumulation_steps)
+ #grad_accum_steps = training_args.gradient_accumulation_steps
 
  # Setup train state
 
@@ -650,7 +650,7 @@ if __name__ == "__main__":
  new_state = state.apply_gradients(grads=grad)
 
  metrics = jax.lax.pmean(
- {"loss": loss, "learning_rate": linear_decay_lr_schedule_fn(state.step // grad_accum_steps)}, axis_name="batch"
+ {"loss": loss, "learning_rate": linear_decay_lr_schedule_fn(state.step )}, axis_name="batch" #// grad_accum_steps
  )
 
  return new_state, metrics, new_dropout_rng
@@ -696,10 +696,10 @@ if __name__ == "__main__":
  # Generate an epoch by shuffling sampling indices from the train dataset
  num_train_samples = len(train_dataset)
  train_samples_idx = jax.random.permutation(input_rng, jnp.arange(num_train_samples))
- train_batch_idx = generate_batch_splits(train_samples_idx, train_batch_size // grad_accum_steps)
+ train_batch_idx = generate_batch_splits(train_samples_idx, train_batch_size) #// grad_accum_steps
 
  # Gather the indexes for creating the batch and do a training step
- for step, batch_idx in enumerate(tqdm(train_batch_idx, desc="Training...", position=1,initial=resume_step // grad_accum_steps )):
+ for step, batch_idx in enumerate(tqdm(train_batch_idx, desc="Training...", position=1,initial=resume_step)): #// grad_accum_steps
  samples = [train_dataset[int(idx)] for idx in batch_idx]
  model_inputs = data_collator(samples, pad_to_multiple_of=16)
 
@@ -713,7 +713,7 @@ if __name__ == "__main__":
  if cur_step < resume_step:
  continue
 
- if (cur_step % training_args.logging_steps * grad_accum_steps) == 0 and cur_step > 0:
+ if (cur_step % training_args.logging_steps) == 0 and cur_step > 0: #* grad_accum_steps
  # Save metrics
  train_metric = jax_utils.unreplicate(train_metric)
  train_time += time.time() - train_start
@@ -730,7 +730,7 @@ if __name__ == "__main__":
 
  train_metrics = []
 
- if cur_step % (training_args.eval_steps * grad_accum_steps) == 0 and cur_step > 0:
+ if cur_step % (training_args.eval_steps) == 0 and cur_step > 0: #* grad_accum_steps
  # ======================== Evaluating ==============================
  num_eval_samples = len(eval_dataset)
  eval_samples_idx = jnp.arange(num_eval_samples)
@@ -763,7 +763,7 @@ if __name__ == "__main__":
  _metrics = {f"eval_{k}":mb_item(v) for k, v in eval_metrics.items()}
  wandb.log({"eval_step":cur_step, **_metrics})
 
- if (cur_step % training_args.save_steps == 0 * grad_accum_steps) and cur_step > 0:
+ if (cur_step % training_args.save_steps == 0) and cur_step > 0: #* grad_accum_steps
  # save checkpoint after each epoch and push checkpoint to the hub
  if jax.process_index() == 0:
  params = jax.device_get(jax.tree_map(lambda x: x[0], state.params))
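
This is the script-side counterpart of the run.sh change: the optax.MultiSteps wrapper and every "// grad_accum_steps" / "* grad_accum_steps" correction (learning-rate schedule, logging, eval and save intervals) are commented out, so each micro-batch now becomes a full optimizer step. A hedged sketch of the mechanism being disabled (toy shapes and loss, not this repo's code):

# optax.MultiSteps accumulates gradients over k calls and only applies a real
# update on every k-th call; in between, the returned updates are zeros.
import jax
import jax.numpy as jnp
import optax

k = 8                                               # gradient_accumulation_steps
base = optax.adamw(learning_rate=5e-5, weight_decay=0.0095)
opt = optax.MultiSteps(base, every_k_schedule=k)

params = {"w": jnp.zeros((3,))}
opt_state = opt.init(params)

def loss_fn(p, x):
    return jnp.sum((p["w"] - x) ** 2)

for step in range(k):
    grads = jax.grad(loss_fn)(params, jnp.ones((3,)))
    updates, opt_state = opt.update(grads, opt_state, params)
    params = optax.apply_updates(params, updates)   # effectively a no-op until the k-th micro-step
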
wandb/debug-internal.log CHANGED
@@ -1 +1 @@
- run-20210713_002031-11bfig2u/logs/debug-internal.log
+ run-20210713_005751-1wnn0lyf/logs/debug-internal.log
wandb/debug.log CHANGED
@@ -1 +1 @@
- run-20210713_002031-11bfig2u/logs/debug.log
+ run-20210713_005751-1wnn0lyf/logs/debug.log
wandb/latest-run CHANGED
@@ -1 +1 @@
- run-20210713_002031-11bfig2u
+ run-20210713_005751-1wnn0lyf
wandb/run-20210713_004910-3mu9pog5/files/config.yaml ADDED
@@ -0,0 +1,307 @@
1
+ wandb_version: 1
2
+
3
+ _wandb:
4
+ desc: null
5
+ value:
6
+ cli_version: 0.10.33
7
+ framework: huggingface
8
+ huggingface_version: 4.9.0.dev0
9
+ is_jupyter_run: false
10
+ is_kaggle_kernel: false
11
+ python_version: 3.8.10
12
+ t:
13
+ 1:
14
+ - 3
15
+ - 11
16
+ 2:
17
+ - 3
18
+ - 11
19
+ 4: 3.8.10
20
+ 5: 0.10.33
21
+ 6: 4.9.0.dev0
22
+ 8:
23
+ - 5
24
+ adafactor:
25
+ desc: null
26
+ value: true
27
+ adam_beta1:
28
+ desc: null
29
+ value: 0.9
30
+ adam_beta2:
31
+ desc: null
32
+ value: 0.98
33
+ adam_epsilon:
34
+ desc: null
35
+ value: 1.0e-08
36
+ cache_dir:
37
+ desc: null
38
+ value: null
39
+ config_name:
40
+ desc: null
41
+ value: ./
42
+ dataloader_drop_last:
43
+ desc: null
44
+ value: false
45
+ dataloader_num_workers:
46
+ desc: null
47
+ value: 0
48
+ dataloader_pin_memory:
49
+ desc: null
50
+ value: true
51
+ dataset_config_name:
52
+ desc: null
53
+ value: null
54
+ dataset_name:
55
+ desc: null
56
+ value: null
57
+ ddp_find_unused_parameters:
58
+ desc: null
59
+ value: null
60
+ debug:
61
+ desc: null
62
+ value: []
63
+ deepspeed:
64
+ desc: null
65
+ value: null
66
+ disable_tqdm:
67
+ desc: null
68
+ value: false
69
+ do_eval:
70
+ desc: null
71
+ value: false
72
+ do_predict:
73
+ desc: null
74
+ value: false
75
+ do_train:
76
+ desc: null
77
+ value: false
78
+ dtype:
79
+ desc: null
80
+ value: bfloat16
81
+ eval_accumulation_steps:
82
+ desc: null
83
+ value: null
84
+ eval_steps:
85
+ desc: null
86
+ value: 92768
87
+ evaluation_strategy:
88
+ desc: null
89
+ value: IntervalStrategy.NO
90
+ fp16:
91
+ desc: null
92
+ value: false
93
+ fp16_backend:
94
+ desc: null
95
+ value: auto
96
+ fp16_full_eval:
97
+ desc: null
98
+ value: false
99
+ fp16_opt_level:
100
+ desc: null
101
+ value: O1
102
+ gradient_accumulation_steps:
103
+ desc: null
104
+ value: 8
105
+ greater_is_better:
106
+ desc: null
107
+ value: null
108
+ group_by_length:
109
+ desc: null
110
+ value: false
111
+ ignore_data_skip:
112
+ desc: null
113
+ value: false
114
+ label_names:
115
+ desc: null
116
+ value: null
117
+ label_smoothing_factor:
118
+ desc: null
119
+ value: 0.0
120
+ learning_rate:
121
+ desc: null
122
+ value: 5.0e-05
123
+ length_column_name:
124
+ desc: null
125
+ value: length
126
+ line_by_line:
127
+ desc: null
128
+ value: false
129
+ load_best_model_at_end:
130
+ desc: null
131
+ value: false
132
+ local_rank:
133
+ desc: null
134
+ value: -1
135
+ log_level:
136
+ desc: null
137
+ value: -1
138
+ log_level_replica:
139
+ desc: null
140
+ value: -1
141
+ log_on_each_node:
142
+ desc: null
143
+ value: true
144
+ logging_dir:
145
+ desc: null
146
+ value: ./runs/Jul13_00-48-19_t1v-n-f5c06ea1-w-0
147
+ logging_first_step:
148
+ desc: null
149
+ value: false
150
+ logging_steps:
151
+ desc: null
152
+ value: 500
153
+ logging_strategy:
154
+ desc: null
155
+ value: IntervalStrategy.STEPS
156
+ lr_scheduler_type:
157
+ desc: null
158
+ value: SchedulerType.LINEAR
159
+ max_grad_norm:
160
+ desc: null
161
+ value: 1.0
162
+ max_seq_length:
163
+ desc: null
164
+ value: 4096
165
+ max_steps:
166
+ desc: null
167
+ value: -1
168
+ metric_for_best_model:
169
+ desc: null
170
+ value: null
171
+ mlm_probability:
172
+ desc: null
173
+ value: 0.15
174
+ model_name_or_path:
175
+ desc: null
176
+ value: null
177
+ model_type:
178
+ desc: null
179
+ value: big_bird
180
+ mp_parameters:
181
+ desc: null
182
+ value: ''
183
+ no_cuda:
184
+ desc: null
185
+ value: false
186
+ num_train_epochs:
187
+ desc: null
188
+ value: 5.0
189
+ output_dir:
190
+ desc: null
191
+ value: ./
192
+ overwrite_cache:
193
+ desc: null
194
+ value: false
195
+ overwrite_output_dir:
196
+ desc: null
197
+ value: true
198
+ pad_to_max_length:
199
+ desc: null
200
+ value: false
201
+ past_index:
202
+ desc: null
203
+ value: -1
204
+ per_device_eval_batch_size:
205
+ desc: null
206
+ value: 4
207
+ per_device_train_batch_size:
208
+ desc: null
209
+ value: 4
210
+ per_gpu_eval_batch_size:
211
+ desc: null
212
+ value: null
213
+ per_gpu_train_batch_size:
214
+ desc: null
215
+ value: null
216
+ prediction_loss_only:
217
+ desc: null
218
+ value: false
219
+ preprocessing_num_workers:
220
+ desc: null
221
+ value: 64
222
+ push_to_hub:
223
+ desc: null
224
+ value: true
225
+ push_to_hub_model_id:
226
+ desc: null
227
+ value: ''
228
+ push_to_hub_organization:
229
+ desc: null
230
+ value: null
231
+ push_to_hub_token:
232
+ desc: null
233
+ value: null
234
+ remove_unused_columns:
235
+ desc: null
236
+ value: true
237
+ report_to:
238
+ desc: null
239
+ value:
240
+ - tensorboard
241
+ - wandb
242
+ resume_from_checkpoint:
243
+ desc: null
244
+ value: null
245
+ run_name:
246
+ desc: null
247
+ value: ./
248
+ save_on_each_node:
249
+ desc: null
250
+ value: false
251
+ save_steps:
252
+ desc: null
253
+ value: 20000
254
+ save_strategy:
255
+ desc: null
256
+ value: IntervalStrategy.STEPS
257
+ save_total_limit:
258
+ desc: null
259
+ value: 5
260
+ seed:
261
+ desc: null
262
+ value: 42
263
+ sharded_ddp:
264
+ desc: null
265
+ value: []
266
+ skip_memory_metrics:
267
+ desc: null
268
+ value: true
269
+ tokenizer_name:
270
+ desc: null
271
+ value: ./
272
+ tpu_metrics_debug:
273
+ desc: null
274
+ value: false
275
+ tpu_num_cores:
276
+ desc: null
277
+ value: null
278
+ train_file:
279
+ desc: null
280
+ value: null
281
+ train_ref_file:
282
+ desc: null
283
+ value: null
284
+ use_fast_tokenizer:
285
+ desc: null
286
+ value: true
287
+ use_legacy_prediction_loop:
288
+ desc: null
289
+ value: false
290
+ validation_file:
291
+ desc: null
292
+ value: null
293
+ validation_ref_file:
294
+ desc: null
295
+ value: null
296
+ validation_split_percentage:
297
+ desc: null
298
+ value: 5
299
+ warmup_ratio:
300
+ desc: null
301
+ value: 0.0
302
+ warmup_steps:
303
+ desc: null
304
+ value: 5000
305
+ weight_decay:
306
+ desc: null
307
+ value: 0.0095
wandb/run-20210713_004910-3mu9pog5/files/output.log ADDED
@@ -0,0 +1,376 @@
1
+ /home/dat/pino/lib/python3.8/site-packages/jax/_src/numpy/lax_numpy.py:3114: UserWarning: Explicitly requested dtype <class 'jax._src.numpy.lax_numpy.int64'> requested in zeros is not available, and will be truncated to dtype int32. To enable more dtypes, set the jax_enable_x64 configuration option or the JAX_ENABLE_X64 shell environment variable. See https://github.com/google/jax#current-gotchas for more.
2
+ lax._check_user_dtype_supported(dtype, "zeros")
3
+ /home/dat/pino/lib/python3.8/site-packages/jax/lib/xla_bridge.py:382: UserWarning: jax.host_count has been renamed to jax.process_count. This alias will eventually be removed; please update your code.
4
+ warnings.warn(
5
+ /home/dat/pino/lib/python3.8/site-packages/jax/lib/xla_bridge.py:369: UserWarning: jax.host_id has been renamed to jax.process_index. This alias will eventually be removed; please update your code.
6
+ warnings.warn(
7
+ Epoch ... (1/5): 0%| | 0/5 [00:00<?, ?it/s]
8
+ Epoch ... (1/5): 0%| | 0/5 [02:22<?, ?it/s]
9
+ Traceback (most recent call last):
10
+ File "./run_mlm_flax.py", line 709, in <module>
11
+ state, train_metric, dropout_rngs = p_train_step(state, model_inputs, dropout_rngs)
12
+ File "/home/dat/pino/lib/python3.8/site-packages/jax/_src/traceback_util.py", line 183, in reraise_with_filtered_traceback
13
+ return fun(*args, **kwargs)
14
+ File "/home/dat/pino/lib/python3.8/site-packages/jax/_src/api.py", line 1647, in f_pmapped
15
+ out = pxla.xla_pmap(
16
+ File "/home/dat/pino/lib/python3.8/site-packages/jax/core.py", line 1620, in bind
17
+ return call_bind(self, fun, *args, **params)
18
+ File "/home/dat/pino/lib/python3.8/site-packages/jax/core.py", line 1551, in call_bind
19
+ outs = primitive.process(top_trace, fun, tracers, params)
20
+ File "/home/dat/pino/lib/python3.8/site-packages/jax/core.py", line 1623, in process
21
+ return trace.process_map(self, fun, tracers, params)
22
+ File "/home/dat/pino/lib/python3.8/site-packages/jax/core.py", line 606, in process_call
23
+ return primitive.impl(f, *tracers, **params)
24
+ File "/home/dat/pino/lib/python3.8/site-packages/jax/interpreters/pxla.py", line 624, in xla_pmap_impl
25
+ compiled_fun, fingerprint = parallel_callable(fun, backend, axis_name, axis_size,
26
+ File "/home/dat/pino/lib/python3.8/site-packages/jax/linear_util.py", line 262, in memoized_fun
27
+ ans = call(fun, *args)
28
+ File "/home/dat/pino/lib/python3.8/site-packages/jax/interpreters/pxla.py", line 899, in parallel_callable
29
+ compiled = xla.backend_compile(backend, built, compile_options)
30
+ File "/home/dat/pino/lib/python3.8/site-packages/jax/interpreters/xla.py", line 360, in backend_compile
31
+ return backend.compile(built_c, compile_options=options)
32
+ jax._src.traceback_util.UnfilteredStackTrace: RuntimeError: Resource exhausted: Ran out of memory in memory space hbm. Used 20.61G of 15.48G hbm. Exceeded hbm capacity by 5.13G.
33
+ Total hbm usage >= 21.13G:
34
+ reserved 530.00M
35
+ program 20.61G
36
+ arguments 0B
37
+ Output size 0B; shares 0B with arguments.
38
+ Program hbm requirement 20.61G:
39
+ global 900.0K
40
+ scoped 924.0K
41
+ HLO temp 20.61G (63.0% utilization: Unpadded (12.43G) Padded (19.71G), 4.4% fragmentation (918.84M))
42
+ Largest program allocations in hbm:
43
+ 1. Size: 1.54G
44
+ Operator: op_type="dot_general" op_name="pmap(train_step)/dot_general[ dimension_numbers=(((2,), (0,)), ((), ()))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/pino/lib/python3.8/site-packages/flax/linen/linear.py" source_line=175
45
+ Shape: bf16[4,4096,50358]{1,2,0:T(8,128)(2,1)}
46
+ Unpadded size: 1.54G
47
+ Extra memory due to padding: 64.0K (1.0x expansion)
48
+ XLA label: %fusion.3615.remat4 = bf16[4,4096,50358]{1,2,0:T(8,128)(2,1)} fusion(bf16[50358,768]{1,0:T(8,128)(2,1)} %get-tuple-element.22628, f32[768]{0:T(1024)} %fusion.10158, f32[768]{0:T(1024)} %fusion.10159, f32[4,4096]{1,0:T(4,128)} %get-tuple-element.20129, f32[...
49
+ Allocation type: HLO temp
50
+ ==========================
51
+ 2. Size: 360.00M
52
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
53
+ Unpadded size: 180.00M
54
+ Extra memory due to padding: 180.00M (2.0x expansion)
55
+ XLA label: %fusion.2444.remat_uncompressed = bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)} copy(bf16[4,12,60,64,512]{4,3,2,1,0:T(8,128)(2,1)} %fusion.2444.remat_compressed)
56
+ Allocation type: HLO temp
57
+ ==========================
58
+ 3. Size: 360.00M
59
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
60
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
61
+ Unpadded size: 180.00M
62
+ Extra memory due to padding: 180.00M (2.0x expansion)
63
+ XLA label: %fusion.2454.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2804, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7916, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
64
+ Allocation type: HLO temp
65
+ ==========================
66
+ 4. Size: 360.00M
67
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
68
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
69
+ Unpadded size: 180.00M
70
+ Extra memory due to padding: 180.00M (2.0x expansion)
71
+ XLA label: %fusion.2453.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2803, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7915, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
72
+ Allocation type: HLO temp
73
+ ==========================
74
+ 5. Size: 360.00M
75
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
76
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
77
+ Unpadded size: 180.00M
78
+ Extra memory due to padding: 180.00M (2.0x expansion)
79
+ XLA label: %fusion.2452.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2802, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7914, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
80
+ Allocation type: HLO temp
81
+ ==========================
82
+ 6. Size: 360.00M
83
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
84
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
85
+ Unpadded size: 180.00M
86
+ Extra memory due to padding: 180.00M (2.0x expansion)
87
+ XLA label: %fusion.2451.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2801, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7913, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
88
+ Allocation type: HLO temp
89
+ ==========================
90
+ 7. Size: 360.00M
91
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
92
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
93
+ Unpadded size: 180.00M
94
+ Extra memory due to padding: 180.00M (2.0x expansion)
95
+ XLA label: %fusion.2445 = bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)} fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2795, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7907, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)} %get-tuple-element.20342, f32[4,12,60,64,192]{3,4,2,1...
96
+ Allocation type: HLO temp
97
+ ==========================
98
+ 8. Size: 360.00M
99
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
100
+ Unpadded size: 180.00M
101
+ Extra memory due to padding: 180.00M (2.0x expansion)
102
+ XLA label: %fusion.2443.remat_uncompressed = bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)} copy(bf16[4,12,60,64,512]{4,3,2,1,0:T(8,128)(2,1)} %fusion.2443.remat_compressed)
103
+ Allocation type: HLO temp
104
+ ==========================
105
+ 9. Size: 360.00M
106
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
107
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
108
+ Unpadded size: 180.00M
109
+ Extra memory due to padding: 180.00M (2.0x expansion)
110
+ XLA label: %fusion.2450.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2800, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7912, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
111
+ Allocation type: HLO temp
112
+ ==========================
113
+ 10. Size: 360.00M
114
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
115
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
116
+ Unpadded size: 180.00M
117
+ Extra memory due to padding: 180.00M (2.0x expansion)
118
+ XLA label: %fusion.2449.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2799, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7911, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
119
+ Allocation type: HLO temp
120
+ ==========================
121
+ 11. Size: 360.00M
122
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
123
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
124
+ Unpadded size: 180.00M
125
+ Extra memory due to padding: 180.00M (2.0x expansion)
126
+ XLA label: %fusion.2448.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2798, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7910, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
127
+ Allocation type: HLO temp
128
+ ==========================
129
+ 12. Size: 360.00M
130
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
131
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
132
+ Unpadded size: 180.00M
133
+ Extra memory due to padding: 180.00M (2.0x expansion)
134
+ XLA label: %fusion.2447.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2797, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7909, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
135
+ Allocation type: HLO temp
136
+ ==========================
137
+ 13. Size: 360.00M
138
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
139
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
140
+ Unpadded size: 180.00M
141
+ Extra memory due to padding: 180.00M (2.0x expansion)
142
+ XLA label: %fusion.2446.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2796, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7908, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
143
+ Allocation type: HLO temp
144
+ ==========================
145
+ 14. Size: 270.00M
146
+ Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=584
147
+ Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
148
+ Unpadded size: 135.00M
149
+ Extra memory due to padding: 135.00M (2.0x expansion)
150
+ XLA label: %fusion.2689.remat3 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} fusion(f32[4,60,64,192]{2,3,1,0:T(8,128)} %get-tuple-element.20556, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.14362, bf16[4,12,60,192,64]{3,2,4,1,0:T(8,128)(2,1)} %fusion.2964), kind=kO...
151
+ Allocation type: HLO temp
152
+ ==========================
153
+ 15. Size: 270.00M
154
+ Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=584
155
+ Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
156
+ Unpadded size: 135.00M
157
+ Extra memory due to padding: 135.00M (2.0x expansion)
158
+ XLA label: %fusion.2690.remat3 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} fusion(f32[4,60,64,192]{2,3,1,0:T(8,128)} %get-tuple-element.20556, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.14296, bf16[4,12,60,192,64]{3,2,4,1,0:T(8,128)(2,1)} %fusion.2962), kind=kO...
159
+ Allocation type: HLO temp
160
+ ==========================
161
+ 16. Size: 270.00M
162
+ Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=584
163
+ Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
164
+ Unpadded size: 135.00M
165
+ Extra memory due to padding: 135.00M (2.0x expansion)
166
+ XLA label: %fusion.2688.remat3 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} fusion(f32[4,60,64,192]{2,3,1,0:T(8,128)} %get-tuple-element.20556, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.14428, bf16[4,12,60,192,64]{3,2,4,1,0:T(8,128)(2,1)} %fusion.2966), kind=kO...
167
+ Allocation type: HLO temp
168
+ ==========================
169
+ 17. Size: 270.00M
170
+ Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=584
171
+ Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
172
+ Unpadded size: 135.00M
173
+ Extra memory due to padding: 135.00M (2.0x expansion)
174
+ XLA label: %fusion.2691.remat3 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} fusion(f32[4,60,64,192]{2,3,1,0:T(8,128)} %get-tuple-element.20556, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.14230, bf16[4,12,60,192,64]{3,2,4,1,0:T(8,128)(2,1)} %fusion.2960), kind=kO...
175
+ Allocation type: HLO temp
176
+ ==========================
177
+ 18. Size: 270.00M
178
+ Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=584
179
+ Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
180
+ Unpadded size: 135.00M
181
+ Extra memory due to padding: 135.00M (2.0x expansion)
182
+ XLA label: %fusion.2692.remat3 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} fusion(f32[4,60,64,192]{2,3,1,0:T(8,128)} %get-tuple-element.20556, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.14164, bf16[4,12,60,192,64]{3,2,4,1,0:T(8,128)(2,1)} %fusion.2958), kind=kO...
183
+ Allocation type: HLO temp
184
+ ==========================
185
+ 19. Size: 270.00M
186
+ Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=584
187
+ Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
188
+ Unpadded size: 135.00M
189
+ Extra memory due to padding: 135.00M (2.0x expansion)
190
+ XLA label: %fusion.2693.remat3 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} fusion(f32[4,60,64,192]{2,3,1,0:T(8,128)} %get-tuple-element.20556, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.14098, bf16[4,12,60,192,64]{3,2,4,1,0:T(8,128)(2,1)} %fusion.2956), kind=kO...
191
+ Allocation type: HLO temp
192
+ ==========================
193
+ 20. Size: 270.00M
194
+ Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
195
+ Unpadded size: 135.00M
196
+ Extra memory due to padding: 135.00M (2.0x expansion)
197
+ XLA label: %fusion.2616.remat_uncompressed = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} copy(f32[4,12,60,64,192]{4,3,2,1,0:T(8,128)} %fusion.2616.remat_compressed)
198
+ Allocation type: HLO temp
199
+ ==========================
200
+ The stack trace below excludes JAX-internal frames.
201
+ The preceding is the original exception that occurred, unmodified.
202
+ --------------------
203
+ The above exception was the direct cause of the following exception:
204
+ Traceback (most recent call last):
205
+ File "./run_mlm_flax.py", line 709, in <module>
206
+ state, train_metric, dropout_rngs = p_train_step(state, model_inputs, dropout_rngs)
207
+ File "/home/dat/pino/lib/python3.8/site-packages/jax/interpreters/xla.py", line 360, in backend_compile
208
+ return backend.compile(built_c, compile_options=options)
209
+ RuntimeError: Resource exhausted: Ran out of memory in memory space hbm. Used 20.61G of 15.48G hbm. Exceeded hbm capacity by 5.13G.
210
+ Total hbm usage >= 21.13G:
211
+ reserved 530.00M
212
+ program 20.61G
213
+ arguments 0B
214
+ Output size 0B; shares 0B with arguments.
215
+ Program hbm requirement 20.61G:
216
+ global 900.0K
217
+ scoped 924.0K
218
+ HLO temp 20.61G (63.0% utilization: Unpadded (12.43G) Padded (19.71G), 4.4% fragmentation (918.84M))
219
+ Largest program allocations in hbm:
220
+ 1. Size: 1.54G
221
+ Operator: op_type="dot_general" op_name="pmap(train_step)/dot_general[ dimension_numbers=(((2,), (0,)), ((), ()))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/pino/lib/python3.8/site-packages/flax/linen/linear.py" source_line=175
222
+ Shape: bf16[4,4096,50358]{1,2,0:T(8,128)(2,1)}
223
+ Unpadded size: 1.54G
224
+ Extra memory due to padding: 64.0K (1.0x expansion)
225
+ XLA label: %fusion.3615.remat4 = bf16[4,4096,50358]{1,2,0:T(8,128)(2,1)} fusion(bf16[50358,768]{1,0:T(8,128)(2,1)} %get-tuple-element.22628, f32[768]{0:T(1024)} %fusion.10158, f32[768]{0:T(1024)} %fusion.10159, f32[4,4096]{1,0:T(4,128)} %get-tuple-element.20129, f32[...
226
+ Allocation type: HLO temp
227
+ ==========================
228
+ 2. Size: 360.00M
229
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
230
+ Unpadded size: 180.00M
231
+ Extra memory due to padding: 180.00M (2.0x expansion)
232
+ XLA label: %fusion.2444.remat_uncompressed = bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)} copy(bf16[4,12,60,64,512]{4,3,2,1,0:T(8,128)(2,1)} %fusion.2444.remat_compressed)
233
+ Allocation type: HLO temp
234
+ ==========================
235
+ 3. Size: 360.00M
236
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
237
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
238
+ Unpadded size: 180.00M
239
+ Extra memory due to padding: 180.00M (2.0x expansion)
240
+ XLA label: %fusion.2454.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2804, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7916, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
241
+ Allocation type: HLO temp
242
+ ==========================
243
+ 4. Size: 360.00M
244
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
245
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
246
+ Unpadded size: 180.00M
247
+ Extra memory due to padding: 180.00M (2.0x expansion)
248
+ XLA label: %fusion.2453.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2803, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7915, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
249
+ Allocation type: HLO temp
250
+ ==========================
251
+ 5. Size: 360.00M
252
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
253
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
254
+ Unpadded size: 180.00M
255
+ Extra memory due to padding: 180.00M (2.0x expansion)
256
+ XLA label: %fusion.2452.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2802, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7914, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
257
+ Allocation type: HLO temp
258
+ ==========================
259
+ 6. Size: 360.00M
260
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
261
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
262
+ Unpadded size: 180.00M
263
+ Extra memory due to padding: 180.00M (2.0x expansion)
264
+ XLA label: %fusion.2451.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2801, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7913, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
265
+ Allocation type: HLO temp
266
+ ==========================
267
+ 7. Size: 360.00M
268
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
269
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
270
+ Unpadded size: 180.00M
271
+ Extra memory due to padding: 180.00M (2.0x expansion)
272
+ XLA label: %fusion.2445 = bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)} fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2795, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7907, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)} %get-tuple-element.20342, f32[4,12,60,64,192]{3,4,2,1...
273
+ Allocation type: HLO temp
274
+ ==========================
275
+ 8. Size: 360.00M
276
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
277
+ Unpadded size: 180.00M
278
+ Extra memory due to padding: 180.00M (2.0x expansion)
279
+ XLA label: %fusion.2443.remat_uncompressed = bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)} copy(bf16[4,12,60,64,512]{4,3,2,1,0:T(8,128)(2,1)} %fusion.2443.remat_compressed)
280
+ Allocation type: HLO temp
281
+ ==========================
282
+ 9. Size: 360.00M
283
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
284
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
285
+ Unpadded size: 180.00M
286
+ Extra memory due to padding: 180.00M (2.0x expansion)
287
+ XLA label: %fusion.2450.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2800, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7912, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
288
+ Allocation type: HLO temp
289
+ ==========================
290
+ 10. Size: 360.00M
291
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
292
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
293
+ Unpadded size: 180.00M
294
+ Extra memory due to padding: 180.00M (2.0x expansion)
295
+ XLA label: %fusion.2449.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2799, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7911, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
296
+ Allocation type: HLO temp
297
+ ==========================
298
+ 11. Size: 360.00M
299
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
300
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
301
+ Unpadded size: 180.00M
302
+ Extra memory due to padding: 180.00M (2.0x expansion)
303
+ XLA label: %fusion.2448.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2798, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7910, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
304
+ Allocation type: HLO temp
305
+ ==========================
306
+ 12. Size: 360.00M
307
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
308
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
309
+ Unpadded size: 180.00M
310
+ Extra memory due to padding: 180.00M (2.0x expansion)
311
+ XLA label: %fusion.2447.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2797, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7909, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
312
+ Allocation type: HLO temp
313
+ ==========================
314
+ 13. Size: 360.00M
315
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
316
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
317
+ Unpadded size: 180.00M
318
+ Extra memory due to padding: 180.00M (2.0x expansion)
319
+ XLA label: %fusion.2446.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2796, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7908, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
320
+ Allocation type: HLO temp
321
+ ==========================
322
+ 14. Size: 270.00M
323
+ Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=584
324
+ Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
325
+ Unpadded size: 135.00M
326
+ Extra memory due to padding: 135.00M (2.0x expansion)
327
+ XLA label: %fusion.2689.remat3 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} fusion(f32[4,60,64,192]{2,3,1,0:T(8,128)} %get-tuple-element.20556, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.14362, bf16[4,12,60,192,64]{3,2,4,1,0:T(8,128)(2,1)} %fusion.2964), kind=kO...
328
+ Allocation type: HLO temp
329
+ ==========================
330
+ 15. Size: 270.00M
331
+ Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=584
332
+ Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
333
+ Unpadded size: 135.00M
334
+ Extra memory due to padding: 135.00M (2.0x expansion)
335
+ XLA label: %fusion.2690.remat3 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} fusion(f32[4,60,64,192]{2,3,1,0:T(8,128)} %get-tuple-element.20556, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.14296, bf16[4,12,60,192,64]{3,2,4,1,0:T(8,128)(2,1)} %fusion.2962), kind=kO...
336
+ Allocation type: HLO temp
337
+ ==========================
338
+ 16. Size: 270.00M
339
+ Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=584
340
+ Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
341
+ Unpadded size: 135.00M
342
+ Extra memory due to padding: 135.00M (2.0x expansion)
343
+ XLA label: %fusion.2688.remat3 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} fusion(f32[4,60,64,192]{2,3,1,0:T(8,128)} %get-tuple-element.20556, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.14428, bf16[4,12,60,192,64]{3,2,4,1,0:T(8,128)(2,1)} %fusion.2966), kind=kO...
344
+ Allocation type: HLO temp
345
+ ==========================
346
+ 17. Size: 270.00M
347
+ Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=584
348
+ Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
349
+ Unpadded size: 135.00M
350
+ Extra memory due to padding: 135.00M (2.0x expansion)
351
+ XLA label: %fusion.2691.remat3 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} fusion(f32[4,60,64,192]{2,3,1,0:T(8,128)} %get-tuple-element.20556, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.14230, bf16[4,12,60,192,64]{3,2,4,1,0:T(8,128)(2,1)} %fusion.2960), kind=kO...
352
+ Allocation type: HLO temp
353
+ ==========================
354
+ 18. Size: 270.00M
355
+ Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=584
356
+ Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
357
+ Unpadded size: 135.00M
358
+ Extra memory due to padding: 135.00M (2.0x expansion)
359
+ XLA label: %fusion.2692.remat3 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} fusion(f32[4,60,64,192]{2,3,1,0:T(8,128)} %get-tuple-element.20556, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.14164, bf16[4,12,60,192,64]{3,2,4,1,0:T(8,128)(2,1)} %fusion.2958), kind=kO...
360
+ Allocation type: HLO temp
361
+ ==========================
362
+ 19. Size: 270.00M
363
+ Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=584
364
+ Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
365
+ Unpadded size: 135.00M
366
+ Extra memory due to padding: 135.00M (2.0x expansion)
367
+ XLA label: %fusion.2693.remat3 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} fusion(f32[4,60,64,192]{2,3,1,0:T(8,128)} %get-tuple-element.20556, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.14098, bf16[4,12,60,192,64]{3,2,4,1,0:T(8,128)(2,1)} %fusion.2956), kind=kO...
368
+ Allocation type: HLO temp
369
+ ==========================
370
+ 20. Size: 270.00M
371
+ Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
372
+ Unpadded size: 135.00M
373
+ Extra memory due to padding: 135.00M (2.0x expansion)
374
+ XLA label: %fusion.2616.remat_uncompressed = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} copy(f32[4,12,60,64,192]{4,3,2,1,0:T(8,128)} %fusion.2616.remat_compressed)
375
+ Allocation type: HLO temp
376
+ ==========================
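
The largest allocation in the OOM report above is a bf16 tensor of shape [4, 4096, 50358], which looks like the masked-LM logits (per-device batch x sequence length x vocabulary). A hedged sanity check of the reported 1.54G, and why reducing the per-device batch size (run.sh uses 2, while this failed run used 4) shrinks it proportionally:

# Size of the bf16[4,4096,50358] allocation flagged as entry 1 in the log above.
batch, seq_len, vocab, bytes_per_bf16 = 4, 4096, 50358, 2
print(batch * seq_len * vocab * bytes_per_bf16 / 2**30)  # ~1.54 GiB; scales linearly with batch
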
wandb/run-20210713_004910-3mu9pog5/files/requirements.txt ADDED
@@ -0,0 +1,92 @@
+ absl-py==0.13.0
+ aiohttp==3.7.4.post0
+ astunparse==1.6.3
+ async-timeout==3.0.1
+ attrs==21.2.0
+ cachetools==4.2.2
+ certifi==2021.5.30
+ chardet==4.0.0
+ chex==0.0.8
+ click==8.0.1
+ configparser==5.0.2
+ cycler==0.10.0
+ datasets==1.9.1.dev0
+ dill==0.3.4
+ dm-tree==0.1.6
+ docker-pycreds==0.4.0
+ filelock==3.0.12
+ flatbuffers==1.12
+ flax==0.3.4
+ fsspec==2021.6.1
+ gast==0.4.0
+ gitdb==4.0.7
+ gitpython==3.1.18
+ google-auth-oauthlib==0.4.4
+ google-auth==1.32.1
+ google-pasta==0.2.0
+ grpcio==1.34.1
+ h5py==3.1.0
+ huggingface-hub==0.0.12
+ idna==2.10
+ jax==0.2.16
+ jaxlib==0.1.68
+ joblib==1.0.1
+ keras-nightly==2.5.0.dev2021032900
+ keras-preprocessing==1.1.2
+ kiwisolver==1.3.1
+ libtpu-nightly==0.1.dev20210615
+ markdown==3.3.4
+ matplotlib==3.4.2
+ msgpack==1.0.2
+ multidict==5.1.0
+ multiprocess==0.70.12.2
+ numpy==1.19.5
+ oauthlib==3.1.1
+ opt-einsum==3.3.0
+ optax==0.0.9
+ packaging==21.0
+ pandas==1.3.0
+ pathtools==0.1.2
+ pillow==8.3.1
+ pip==20.0.2
+ pkg-resources==0.0.0
+ promise==2.3
+ protobuf==3.17.3
+ psutil==5.8.0
+ pyarrow==4.0.1
+ pyasn1-modules==0.2.8
+ pyasn1==0.4.8
+ pyparsing==2.4.7
+ python-dateutil==2.8.1
+ pytz==2021.1
+ pyyaml==5.4.1
+ regex==2021.7.6
+ requests-oauthlib==1.3.0
+ requests==2.25.1
+ rsa==4.7.2
+ sacremoses==0.0.45
+ scipy==1.7.0
+ sentry-sdk==1.3.0
+ setuptools==44.0.0
+ shortuuid==1.0.1
+ six==1.15.0
+ smmap==4.0.0
+ subprocess32==3.5.4
+ tensorboard-data-server==0.6.1
+ tensorboard-plugin-wit==1.8.0
+ tensorboard==2.5.0
+ tensorflow-estimator==2.5.0
+ tensorflow==2.5.0
+ termcolor==1.1.0
+ tokenizers==0.10.3
+ toolz==0.11.1
+ tqdm==4.61.2
+ transformers==4.9.0.dev0
+ typing-extensions==3.7.4.3
+ urllib3==1.26.6
+ wandb==0.10.33
+ werkzeug==2.0.1
+ wheel==0.36.2
+ wrapt==1.12.1
+ xxhash==2.0.2
+ yarl==1.6.3
wandb/run-20210713_004910-3mu9pog5/files/wandb-metadata.json ADDED
@@ -0,0 +1,46 @@
+ {
+ "os": "Linux-5.4.0-1043-gcp-x86_64-with-glibc2.29",
+ "python": "3.8.10",
+ "heartbeatAt": "2021-07-13T00:49:12.868844",
+ "startedAt": "2021-07-13T00:49:10.806043",
+ "docker": null,
+ "cpu_count": 96,
+ "cuda": null,
+ "args": [
+ "--push_to_hub",
+ "--output_dir=./",
+ "--model_type=big_bird",
+ "--config_name=./",
+ "--tokenizer_name=./",
+ "--max_seq_length=4096",
+ "--weight_decay=0.0095",
+ "--warmup_steps=5000",
+ "--overwrite_output_dir",
+ "--adam_beta1=0.9",
+ "--adam_beta2=0.98",
+ "--logging_steps=500",
+ "--eval_steps=92768",
+ "--num_train_epochs=5",
+ "--preprocessing_num_workers=64",
+ "--save_steps=20000",
+ "--adafactor",
+ "--learning_rate=5e-5",
+ "--per_device_train_batch_size=4",
+ "--per_device_eval_batch_size=4",
+ "--save_total_limit=5",
+ "--dtype=bfloat16",
+ "--gradient_accumulation_steps=8"
+ ],
+ "state": "running",
+ "program": "./run_mlm_flax.py",
+ "codePath": "run_mlm_flax.py",
+ "git": {
+ "remote": "https://huggingface.co/flax-community/pino-roberta-base",
+ "commit": "4229c91b780cf07115cc6d04c16e393b0d2f508c"
+ },
+ "email": null,
+ "root": "/home/dat/pino-roberta-base",
+ "host": "t1v-n-f5c06ea1-w-0",
+ "username": "dat",
+ "executable": "/home/dat/pino/bin/python"
+ }
wandb/run-20210713_004910-3mu9pog5/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
+ {}
wandb/run-20210713_004910-3mu9pog5/logs/debug-internal.log ADDED
@@ -0,0 +1,166 @@
1
+ 2021-07-13 00:49:11,524 INFO MainThread:325318 [internal.py:wandb_internal():88] W&B internal server running at pid: 325318, started at: 2021-07-13 00:49:11.523864
2
+ 2021-07-13 00:49:11,526 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: check_version
3
+ 2021-07-13 00:49:11,526 INFO WriterThread:325318 [datastore.py:open_for_write():80] open: /home/dat/pino-roberta-base/wandb/run-20210713_004910-3mu9pog5/run-3mu9pog5.wandb
4
+ 2021-07-13 00:49:11,527 DEBUG SenderThread:325318 [sender.py:send():179] send: header
5
+ 2021-07-13 00:49:11,527 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: check_version
6
+ 2021-07-13 00:49:11,564 DEBUG SenderThread:325318 [sender.py:send():179] send: run
7
+ 2021-07-13 00:49:11,738 INFO SenderThread:325318 [dir_watcher.py:__init__():168] watching files in: /home/dat/pino-roberta-base/wandb/run-20210713_004910-3mu9pog5/files
8
+ 2021-07-13 00:49:11,739 INFO SenderThread:325318 [sender.py:_start_run_threads():716] run started: 3mu9pog5 with start time 1626137350
9
+ 2021-07-13 00:49:11,739 DEBUG SenderThread:325318 [sender.py:send():179] send: summary
10
+ 2021-07-13 00:49:11,739 INFO SenderThread:325318 [sender.py:_save_file():841] saving file wandb-summary.json with policy end
11
+ 2021-07-13 00:49:11,739 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: run_start
12
+ 2021-07-13 00:49:12,741 INFO Thread-8 :325318 [dir_watcher.py:_on_file_created():216] file/dir created: /home/dat/pino-roberta-base/wandb/run-20210713_004910-3mu9pog5/files/wandb-summary.json
13
+ 2021-07-13 00:49:12,868 DEBUG HandlerThread:325318 [meta.py:__init__():39] meta init
14
+ 2021-07-13 00:49:12,868 DEBUG HandlerThread:325318 [meta.py:__init__():53] meta init done
15
+ 2021-07-13 00:49:12,868 DEBUG HandlerThread:325318 [meta.py:probe():210] probe
16
+ 2021-07-13 00:49:12,870 DEBUG HandlerThread:325318 [meta.py:_setup_git():200] setup git
17
+ 2021-07-13 00:49:12,899 DEBUG HandlerThread:325318 [meta.py:_setup_git():207] setup git done
18
+ 2021-07-13 00:49:12,899 DEBUG HandlerThread:325318 [meta.py:_save_pip():57] save pip
19
+ 2021-07-13 00:49:12,899 DEBUG HandlerThread:325318 [meta.py:_save_pip():71] save pip done
20
+ 2021-07-13 00:49:12,899 DEBUG HandlerThread:325318 [meta.py:probe():252] probe done
21
+ 2021-07-13 00:49:12,903 DEBUG SenderThread:325318 [sender.py:send():179] send: files
22
+ 2021-07-13 00:49:12,903 INFO SenderThread:325318 [sender.py:_save_file():841] saving file wandb-metadata.json with policy now
23
+ 2021-07-13 00:49:12,910 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: stop_status
24
+ 2021-07-13 00:49:12,911 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: stop_status
25
+ 2021-07-13 00:49:13,042 DEBUG SenderThread:325318 [sender.py:send():179] send: config
26
+ 2021-07-13 00:49:13,043 DEBUG SenderThread:325318 [sender.py:send():179] send: config
27
+ 2021-07-13 00:49:13,043 DEBUG SenderThread:325318 [sender.py:send():179] send: config
28
+ 2021-07-13 00:49:13,348 INFO Thread-11 :325318 [upload_job.py:push():137] Uploaded file /tmp/tmpkvnk9e30wandb/65yetzns-wandb-metadata.json
29
+ 2021-07-13 00:49:13,741 INFO Thread-8 :325318 [dir_watcher.py:_on_file_created():216] file/dir created: /home/dat/pino-roberta-base/wandb/run-20210713_004910-3mu9pog5/files/output.log
30
+ 2021-07-13 00:49:13,741 INFO Thread-8 :325318 [dir_watcher.py:_on_file_created():216] file/dir created: /home/dat/pino-roberta-base/wandb/run-20210713_004910-3mu9pog5/files/requirements.txt
31
+ 2021-07-13 00:49:13,741 INFO Thread-8 :325318 [dir_watcher.py:_on_file_created():216] file/dir created: /home/dat/pino-roberta-base/wandb/run-20210713_004910-3mu9pog5/files/wandb-metadata.json
32
+ 2021-07-13 00:49:28,044 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: stop_status
33
+ 2021-07-13 00:49:28,044 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: stop_status
34
+ 2021-07-13 00:49:29,748 INFO Thread-8 :325318 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_004910-3mu9pog5/files/output.log
35
+ 2021-07-13 00:49:31,749 INFO Thread-8 :325318 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_004910-3mu9pog5/files/output.log
36
+ 2021-07-13 00:49:40,952 DEBUG SenderThread:325318 [sender.py:send():179] send: stats
37
+ 2021-07-13 00:49:42,754 INFO Thread-8 :325318 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_004910-3mu9pog5/files/config.yaml
38
+ 2021-07-13 00:49:43,176 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: stop_status
39
+ 2021-07-13 00:49:43,177 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: stop_status
40
+ 2021-07-13 00:49:58,307 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: stop_status
41
+ 2021-07-13 00:49:58,307 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: stop_status
42
+ 2021-07-13 00:50:11,029 DEBUG SenderThread:325318 [sender.py:send():179] send: stats
43
+ 2021-07-13 00:50:13,441 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: stop_status
44
+ 2021-07-13 00:50:13,442 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: stop_status
45
+ 2021-07-13 00:50:21,769 INFO Thread-8 :325318 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_004910-3mu9pog5/files/output.log
46
+ 2021-07-13 00:50:28,590 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: stop_status
47
+ 2021-07-13 00:50:28,590 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: stop_status
48
+ 2021-07-13 00:50:41,106 DEBUG SenderThread:325318 [sender.py:send():179] send: stats
49
+ 2021-07-13 00:50:43,758 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: stop_status
50
+ 2021-07-13 00:50:43,759 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: stop_status
51
+ 2021-07-13 00:50:58,908 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: stop_status
52
+ 2021-07-13 00:50:58,909 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: stop_status
53
+ 2021-07-13 00:51:11,187 DEBUG SenderThread:325318 [sender.py:send():179] send: stats
54
+ 2021-07-13 00:51:14,040 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: stop_status
55
+ 2021-07-13 00:51:14,041 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: stop_status
56
+ 2021-07-13 00:51:29,172 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: stop_status
57
+ 2021-07-13 00:51:29,173 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: stop_status
58
+ 2021-07-13 00:51:41,267 DEBUG SenderThread:325318 [sender.py:send():179] send: stats
59
+ 2021-07-13 00:51:44,303 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: stop_status
60
+ 2021-07-13 00:51:44,304 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: stop_status
61
+ 2021-07-13 00:51:53,809 INFO Thread-8 :325318 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_004910-3mu9pog5/files/output.log
62
+ 2021-07-13 00:51:54,323 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: poll_exit
63
+ 2021-07-13 00:51:54,323 DEBUG SenderThread:325318 [sender.py:send():179] send: telemetry
64
+ 2021-07-13 00:51:54,323 DEBUG SenderThread:325318 [sender.py:send():179] send: exit
65
+ 2021-07-13 00:51:54,323 INFO SenderThread:325318 [sender.py:send_exit():287] handling exit code: 1
66
+ 2021-07-13 00:51:54,323 INFO SenderThread:325318 [sender.py:send_exit():295] send defer
67
+ 2021-07-13 00:51:54,323 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: poll_exit
68
+ 2021-07-13 00:51:54,324 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: defer
69
+ 2021-07-13 00:51:54,324 INFO HandlerThread:325318 [handler.py:handle_request_defer():141] handle defer: 0
70
+ 2021-07-13 00:51:54,324 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: defer
71
+ 2021-07-13 00:51:54,324 INFO SenderThread:325318 [sender.py:send_request_defer():304] handle sender defer: 0
72
+ 2021-07-13 00:51:54,324 INFO SenderThread:325318 [sender.py:transition_state():308] send defer: 1
73
+ 2021-07-13 00:51:54,325 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: defer
74
+ 2021-07-13 00:51:54,325 INFO HandlerThread:325318 [handler.py:handle_request_defer():141] handle defer: 1
75
+ 2021-07-13 00:51:54,400 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: defer
76
+ 2021-07-13 00:51:54,400 INFO SenderThread:325318 [sender.py:send_request_defer():304] handle sender defer: 1
77
+ 2021-07-13 00:51:54,400 INFO SenderThread:325318 [sender.py:transition_state():308] send defer: 2
78
+ 2021-07-13 00:51:54,401 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: defer
79
+ 2021-07-13 00:51:54,401 INFO HandlerThread:325318 [handler.py:handle_request_defer():141] handle defer: 2
80
+ 2021-07-13 00:51:54,401 DEBUG SenderThread:325318 [sender.py:send():179] send: stats
81
+ 2021-07-13 00:51:54,401 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: defer
82
+ 2021-07-13 00:51:54,401 INFO SenderThread:325318 [sender.py:send_request_defer():304] handle sender defer: 2
83
+ 2021-07-13 00:51:54,401 INFO SenderThread:325318 [sender.py:transition_state():308] send defer: 3
84
+ 2021-07-13 00:51:54,402 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: defer
85
+ 2021-07-13 00:51:54,402 INFO HandlerThread:325318 [handler.py:handle_request_defer():141] handle defer: 3
86
+ 2021-07-13 00:51:54,402 DEBUG SenderThread:325318 [sender.py:send():179] send: summary
87
+ 2021-07-13 00:51:54,402 INFO SenderThread:325318 [sender.py:_save_file():841] saving file wandb-summary.json with policy end
88
+ 2021-07-13 00:51:54,403 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: defer
89
+ 2021-07-13 00:51:54,403 INFO SenderThread:325318 [sender.py:send_request_defer():304] handle sender defer: 3
90
+ 2021-07-13 00:51:54,403 INFO SenderThread:325318 [sender.py:transition_state():308] send defer: 4
91
+ 2021-07-13 00:51:54,403 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: defer
92
+ 2021-07-13 00:51:54,403 INFO HandlerThread:325318 [handler.py:handle_request_defer():141] handle defer: 4
93
+ 2021-07-13 00:51:54,403 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: defer
94
+ 2021-07-13 00:51:54,403 INFO SenderThread:325318 [sender.py:send_request_defer():304] handle sender defer: 4
95
+ 2021-07-13 00:51:54,426 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: poll_exit
96
+ 2021-07-13 00:51:54,590 INFO SenderThread:325318 [sender.py:transition_state():308] send defer: 5
97
+ 2021-07-13 00:51:54,590 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: poll_exit
98
+ 2021-07-13 00:51:54,591 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: defer
99
+ 2021-07-13 00:51:54,591 INFO HandlerThread:325318 [handler.py:handle_request_defer():141] handle defer: 5
100
+ 2021-07-13 00:51:54,591 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: defer
101
+ 2021-07-13 00:51:54,591 INFO SenderThread:325318 [sender.py:send_request_defer():304] handle sender defer: 5
102
+ 2021-07-13 00:51:54,591 INFO SenderThread:325318 [dir_watcher.py:finish():282] shutting down directory watcher
103
+ 2021-07-13 00:51:54,693 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: poll_exit
104
+ 2021-07-13 00:51:54,809 INFO Thread-8 :325318 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_004910-3mu9pog5/files/output.log
105
+ 2021-07-13 00:51:54,810 INFO SenderThread:325318 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_004910-3mu9pog5/files/config.yaml
106
+ 2021-07-13 00:51:54,810 INFO SenderThread:325318 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_004910-3mu9pog5/files/wandb-summary.json
107
+ 2021-07-13 00:51:54,810 INFO SenderThread:325318 [dir_watcher.py:finish():312] scan: /home/dat/pino-roberta-base/wandb/run-20210713_004910-3mu9pog5/files
108
+ 2021-07-13 00:51:54,810 INFO SenderThread:325318 [dir_watcher.py:finish():318] scan save: /home/dat/pino-roberta-base/wandb/run-20210713_004910-3mu9pog5/files/requirements.txt requirements.txt
109
+ 2021-07-13 00:51:54,810 INFO SenderThread:325318 [dir_watcher.py:finish():318] scan save: /home/dat/pino-roberta-base/wandb/run-20210713_004910-3mu9pog5/files/output.log output.log
110
+ 2021-07-13 00:51:54,811 INFO SenderThread:325318 [dir_watcher.py:finish():318] scan save: /home/dat/pino-roberta-base/wandb/run-20210713_004910-3mu9pog5/files/wandb-metadata.json wandb-metadata.json
111
+ 2021-07-13 00:51:54,811 INFO SenderThread:325318 [dir_watcher.py:finish():318] scan save: /home/dat/pino-roberta-base/wandb/run-20210713_004910-3mu9pog5/files/config.yaml config.yaml
112
+ 2021-07-13 00:51:54,811 INFO SenderThread:325318 [dir_watcher.py:finish():318] scan save: /home/dat/pino-roberta-base/wandb/run-20210713_004910-3mu9pog5/files/wandb-summary.json wandb-summary.json
113
+ 2021-07-13 00:51:54,811 INFO SenderThread:325318 [sender.py:transition_state():308] send defer: 6
114
+ 2021-07-13 00:51:54,811 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: poll_exit
115
+ 2021-07-13 00:51:54,812 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: defer
116
+ 2021-07-13 00:51:54,812 INFO HandlerThread:325318 [handler.py:handle_request_defer():141] handle defer: 6
117
+ 2021-07-13 00:51:54,812 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: defer
118
+ 2021-07-13 00:51:54,814 INFO SenderThread:325318 [sender.py:send_request_defer():304] handle sender defer: 6
119
+ 2021-07-13 00:51:54,814 INFO SenderThread:325318 [file_pusher.py:finish():177] shutting down file pusher
120
+ 2021-07-13 00:51:54,913 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: poll_exit
121
+ 2021-07-13 00:51:54,914 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: poll_exit
122
+ 2021-07-13 00:51:55,016 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: poll_exit
123
+ 2021-07-13 00:51:55,016 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: poll_exit
124
+ 2021-07-13 00:51:55,118 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: poll_exit
125
+ 2021-07-13 00:51:55,118 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: poll_exit
126
+ 2021-07-13 00:51:55,220 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: poll_exit
127
+ 2021-07-13 00:51:55,220 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: poll_exit
128
+ 2021-07-13 00:51:55,257 INFO Thread-14 :325318 [upload_job.py:push():137] Uploaded file /home/dat/pino-roberta-base/wandb/run-20210713_004910-3mu9pog5/files/config.yaml
129
+ 2021-07-13 00:51:55,266 INFO Thread-12 :325318 [upload_job.py:push():137] Uploaded file /home/dat/pino-roberta-base/wandb/run-20210713_004910-3mu9pog5/files/requirements.txt
130
+ 2021-07-13 00:51:55,277 INFO Thread-13 :325318 [upload_job.py:push():137] Uploaded file /home/dat/pino-roberta-base/wandb/run-20210713_004910-3mu9pog5/files/output.log
131
+ 2021-07-13 00:51:55,288 INFO Thread-15 :325318 [upload_job.py:push():137] Uploaded file /home/dat/pino-roberta-base/wandb/run-20210713_004910-3mu9pog5/files/wandb-summary.json
132
+ 2021-07-13 00:51:55,322 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: poll_exit
133
+ 2021-07-13 00:51:55,322 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: poll_exit
134
+ 2021-07-13 00:51:55,424 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: poll_exit
135
+ 2021-07-13 00:51:55,425 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: poll_exit
136
+ 2021-07-13 00:51:55,489 INFO Thread-7 :325318 [sender.py:transition_state():308] send defer: 7
137
+ 2021-07-13 00:51:55,489 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: defer
138
+ 2021-07-13 00:51:55,489 INFO HandlerThread:325318 [handler.py:handle_request_defer():141] handle defer: 7
139
+ 2021-07-13 00:51:55,489 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: defer
140
+ 2021-07-13 00:51:55,490 INFO SenderThread:325318 [sender.py:send_request_defer():304] handle sender defer: 7
141
+ 2021-07-13 00:51:55,526 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: poll_exit
142
+ 2021-07-13 00:51:55,771 INFO SenderThread:325318 [sender.py:transition_state():308] send defer: 8
143
+ 2021-07-13 00:51:55,772 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: poll_exit
144
+ 2021-07-13 00:51:55,772 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: defer
145
+ 2021-07-13 00:51:55,772 INFO HandlerThread:325318 [handler.py:handle_request_defer():141] handle defer: 8
146
+ 2021-07-13 00:51:55,772 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: defer
147
+ 2021-07-13 00:51:55,772 INFO SenderThread:325318 [sender.py:send_request_defer():304] handle sender defer: 8
148
+ 2021-07-13 00:51:55,773 INFO SenderThread:325318 [sender.py:transition_state():308] send defer: 9
149
+ 2021-07-13 00:51:55,773 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: defer
150
+ 2021-07-13 00:51:55,773 INFO HandlerThread:325318 [handler.py:handle_request_defer():141] handle defer: 9
151
+ 2021-07-13 00:51:55,773 DEBUG SenderThread:325318 [sender.py:send():179] send: final
152
+ 2021-07-13 00:51:55,773 DEBUG SenderThread:325318 [sender.py:send():179] send: footer
153
+ 2021-07-13 00:51:55,773 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: defer
154
+ 2021-07-13 00:51:55,773 INFO SenderThread:325318 [sender.py:send_request_defer():304] handle sender defer: 9
155
+ 2021-07-13 00:51:55,874 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: poll_exit
156
+ 2021-07-13 00:51:55,874 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: poll_exit
157
+ 2021-07-13 00:51:55,874 INFO SenderThread:325318 [file_pusher.py:join():182] waiting for file pusher
158
+ 2021-07-13 00:51:55,876 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: get_summary
159
+ 2021-07-13 00:51:55,877 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: sampled_history
160
+ 2021-07-13 00:51:55,877 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: shutdown
161
+ 2021-07-13 00:51:55,877 INFO HandlerThread:325318 [handler.py:finish():638] shutting down handler
162
+ 2021-07-13 00:51:56,774 INFO WriterThread:325318 [datastore.py:close():288] close: /home/dat/pino-roberta-base/wandb/run-20210713_004910-3mu9pog5/run-3mu9pog5.wandb
163
+ 2021-07-13 00:51:56,875 INFO SenderThread:325318 [sender.py:finish():945] shutting down sender
164
+ 2021-07-13 00:51:56,875 INFO SenderThread:325318 [file_pusher.py:finish():177] shutting down file pusher
165
+ 2021-07-13 00:51:56,875 INFO SenderThread:325318 [file_pusher.py:join():182] waiting for file pusher
166
+ 2021-07-13 00:51:56,877 INFO MainThread:325318 [internal.py:handle_exit():78] Internal process exited
wandb/run-20210713_004910-3mu9pog5/logs/debug.log ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2021-07-13 00:49:10,807 INFO MainThread:323744 [wandb_setup.py:_flush():69] setting env: {}
2
+ 2021-07-13 00:49:10,807 INFO MainThread:323744 [wandb_setup.py:_flush():69] setting login settings: {}
3
+ 2021-07-13 00:49:10,807 INFO MainThread:323744 [wandb_init.py:_log_setup():337] Logging user logs to /home/dat/pino-roberta-base/wandb/run-20210713_004910-3mu9pog5/logs/debug.log
4
+ 2021-07-13 00:49:10,807 INFO MainThread:323744 [wandb_init.py:_log_setup():338] Logging internal logs to /home/dat/pino-roberta-base/wandb/run-20210713_004910-3mu9pog5/logs/debug-internal.log
5
+ 2021-07-13 00:49:10,808 INFO MainThread:323744 [wandb_init.py:init():370] calling init triggers
6
+ 2021-07-13 00:49:10,808 INFO MainThread:323744 [wandb_init.py:init():375] wandb.init called with sweep_config: {}
7
+ config: {}
8
+ 2021-07-13 00:49:10,808 INFO MainThread:323744 [wandb_init.py:init():419] starting backend
9
+ 2021-07-13 00:49:10,808 INFO MainThread:323744 [backend.py:_multiprocessing_setup():70] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
10
+ 2021-07-13 00:49:10,863 INFO MainThread:323744 [backend.py:ensure_launched():135] starting backend process...
11
+ 2021-07-13 00:49:10,917 INFO MainThread:323744 [backend.py:ensure_launched():139] started backend process with pid: 325318
12
+ 2021-07-13 00:49:10,919 INFO MainThread:323744 [wandb_init.py:init():424] backend started and connected
13
+ 2021-07-13 00:49:10,923 INFO MainThread:323744 [wandb_init.py:init():472] updated telemetry
14
+ 2021-07-13 00:49:10,924 INFO MainThread:323744 [wandb_init.py:init():491] communicating current version
15
+ 2021-07-13 00:49:11,562 INFO MainThread:323744 [wandb_init.py:init():496] got version response
16
+ 2021-07-13 00:49:11,563 INFO MainThread:323744 [wandb_init.py:init():504] communicating run to backend with 30 second timeout
17
+ 2021-07-13 00:49:11,739 INFO MainThread:323744 [wandb_init.py:init():529] starting run threads in backend
18
+ 2021-07-13 00:49:12,907 INFO MainThread:323744 [wandb_run.py:_console_start():1623] atexit reg
19
+ 2021-07-13 00:49:12,907 INFO MainThread:323744 [wandb_run.py:_redirect():1497] redirect: SettingsConsole.REDIRECT
20
+ 2021-07-13 00:49:12,908 INFO MainThread:323744 [wandb_run.py:_redirect():1502] Redirecting console.
21
+ 2021-07-13 00:49:12,910 INFO MainThread:323744 [wandb_run.py:_redirect():1558] Redirects installed.
22
+ 2021-07-13 00:49:12,910 INFO MainThread:323744 [wandb_init.py:init():554] run started, returning control to user process
23
+ 2021-07-13 00:49:12,916 INFO MainThread:323744 [wandb_run.py:_config_callback():872] config_cb None None {'output_dir': './', 'overwrite_output_dir': True, 'do_train': False, 'do_eval': False, 'do_predict': False, 'evaluation_strategy': 'IntervalStrategy.NO', 'prediction_loss_only': False, 'per_device_train_batch_size': 4, 'per_device_eval_batch_size': 4, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 8, 'eval_accumulation_steps': None, 'learning_rate': 5e-05, 'weight_decay': 0.0095, 'adam_beta1': 0.9, 'adam_beta2': 0.98, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 5.0, 'max_steps': -1, 'lr_scheduler_type': 'SchedulerType.LINEAR', 'warmup_ratio': 0.0, 'warmup_steps': 5000, 'log_level': -1, 'log_level_replica': -1, 'log_on_each_node': True, 'logging_dir': './runs/Jul13_00-48-19_t1v-n-f5c06ea1-w-0', 'logging_strategy': 'IntervalStrategy.STEPS', 'logging_first_step': False, 'logging_steps': 500, 'save_strategy': 'IntervalStrategy.STEPS', 'save_steps': 20000, 'save_total_limit': 5, 'save_on_each_node': False, 'no_cuda': False, 'seed': 42, 'fp16': False, 'fp16_opt_level': 'O1', 'fp16_backend': 'auto', 'fp16_full_eval': False, 'local_rank': -1, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 92768, 'dataloader_num_workers': 0, 'past_index': -1, 'run_name': './', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'sharded_ddp': [], 'deepspeed': None, 'label_smoothing_factor': 0.0, 'adafactor': True, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['tensorboard', 'wandb'], 'ddp_find_unused_parameters': None, 'dataloader_pin_memory': True, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': True, 'resume_from_checkpoint': None, 'push_to_hub_model_id': '', 'push_to_hub_organization': None, 'push_to_hub_token': None, 'mp_parameters': ''}
24
+ 2021-07-13 00:49:12,917 INFO MainThread:323744 [wandb_run.py:_config_callback():872] config_cb None None {'model_name_or_path': None, 'model_type': 'big_bird', 'config_name': './', 'tokenizer_name': './', 'cache_dir': None, 'use_fast_tokenizer': True, 'dtype': 'bfloat16'}
25
+ 2021-07-13 00:49:12,919 INFO MainThread:323744 [wandb_run.py:_config_callback():872] config_cb None None {'dataset_name': None, 'dataset_config_name': None, 'train_file': None, 'validation_file': None, 'train_ref_file': None, 'validation_ref_file': None, 'overwrite_cache': False, 'validation_split_percentage': 5, 'max_seq_length': 4096, 'preprocessing_num_workers': 64, 'mlm_probability': 0.15, 'pad_to_max_length': False, 'line_by_line': False}
26
+ 2021-07-13 00:51:51,794 INFO MainThread:323744 [wandb_run.py:_atexit_cleanup():1593] got exitcode: 1
27
+ 2021-07-13 00:51:51,796 INFO MainThread:323744 [wandb_run.py:_restore():1565] restore
28
+ 2021-07-13 00:51:54,324 INFO MainThread:323744 [wandb_run.py:_wait_for_finish():1715] got exit ret: file_counts {
29
+ wandb_count: 1
30
+ }
31
+ pusher_stats {
32
+ uploaded_bytes: 1417
33
+ total_bytes: 1417
34
+ }
35
+
36
+ 2021-07-13 00:51:54,591 INFO MainThread:323744 [wandb_run.py:_wait_for_finish():1715] got exit ret: file_counts {
37
+ wandb_count: 1
38
+ }
39
+ pusher_stats {
40
+ uploaded_bytes: 1417
41
+ total_bytes: 1417
42
+ }
43
+
44
+ 2021-07-13 00:51:54,812 INFO MainThread:323744 [wandb_run.py:_wait_for_finish():1715] got exit ret: file_counts {
45
+ wandb_count: 4
46
+ }
47
+ pusher_stats {
48
+ uploaded_bytes: 1417
49
+ total_bytes: 40394
50
+ }
51
+
52
+ 2021-07-13 00:51:54,915 INFO MainThread:323744 [wandb_run.py:_wait_for_finish():1715] got exit ret: file_counts {
53
+ wandb_count: 5
54
+ }
55
+ pusher_stats {
56
+ uploaded_bytes: 1417
57
+ total_bytes: 40396
58
+ }
59
+
60
+ 2021-07-13 00:51:55,017 INFO MainThread:323744 [wandb_run.py:_wait_for_finish():1715] got exit ret: file_counts {
61
+ wandb_count: 5
62
+ }
63
+ pusher_stats {
64
+ uploaded_bytes: 40396
65
+ total_bytes: 40396
66
+ }
67
+
68
+ 2021-07-13 00:51:55,119 INFO MainThread:323744 [wandb_run.py:_wait_for_finish():1715] got exit ret: file_counts {
69
+ wandb_count: 5
70
+ }
71
+ pusher_stats {
72
+ uploaded_bytes: 40396
73
+ total_bytes: 40396
74
+ }
75
+
76
+ 2021-07-13 00:51:55,221 INFO MainThread:323744 [wandb_run.py:_wait_for_finish():1715] got exit ret: file_counts {
77
+ wandb_count: 5
78
+ }
79
+ pusher_stats {
80
+ uploaded_bytes: 40396
81
+ total_bytes: 40396
82
+ }
83
+
84
+ 2021-07-13 00:51:55,323 INFO MainThread:323744 [wandb_run.py:_wait_for_finish():1715] got exit ret: file_counts {
85
+ wandb_count: 5
86
+ }
87
+ pusher_stats {
88
+ uploaded_bytes: 40396
89
+ total_bytes: 40396
90
+ }
91
+
92
+ 2021-07-13 00:51:55,425 INFO MainThread:323744 [wandb_run.py:_wait_for_finish():1715] got exit ret: file_counts {
93
+ wandb_count: 5
94
+ }
95
+ pusher_stats {
96
+ uploaded_bytes: 40396
97
+ total_bytes: 40396
98
+ }
99
+
100
+ 2021-07-13 00:51:55,772 INFO MainThread:323744 [wandb_run.py:_wait_for_finish():1715] got exit ret: file_counts {
101
+ wandb_count: 5
102
+ }
103
+ pusher_stats {
104
+ uploaded_bytes: 40396
105
+ total_bytes: 40396
106
+ }
107
+
108
+ 2021-07-13 00:51:55,875 INFO MainThread:323744 [wandb_run.py:_wait_for_finish():1715] got exit ret: done: true
109
+ exit_result {
110
+ }
111
+ file_counts {
112
+ wandb_count: 5
113
+ }
114
+ pusher_stats {
115
+ uploaded_bytes: 40396
116
+ total_bytes: 40396
117
+ }
118
+
119
+ 2021-07-13 00:51:57,265 INFO MainThread:323744 [wandb_run.py:_show_files():1937] logging synced files
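The config_cb entries above record the effective TrainingArguments for this run (per_device_train_batch_size 4, gradient_accumulation_steps 8, max_seq_length 4096) before it exited with code 1. As a rough, hedged sketch of what those numbers imply per optimizer step (assuming the 8 cores of a single TPU v3-8 host, which the log does not state explicitly):

    # Hypothetical back-of-the-envelope check; not part of run_mlm_flax.py.
    per_device_batch = 4   # per_device_train_batch_size from the logged config
    grad_accum = 8         # gradient_accumulation_steps
    seq_len = 4096         # max_seq_length
    n_devices = 8          # assumption: one TPU v3-8 host

    tokens_per_microbatch = per_device_batch * seq_len                      # 16,384 per device
    tokens_per_step = tokens_per_microbatch * n_devices * grad_accum        # 1,048,576 per update
    print(tokens_per_microbatch, tokens_per_step)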
wandb/run-20210713_004910-3mu9pog5/run-3mu9pog5.wandb ADDED
Binary file (37.4 kB).
wandb/run-20210713_005301-2ilkub1o/files/config.yaml ADDED
@@ -0,0 +1,307 @@
1
+ wandb_version: 1
2
+
3
+ _wandb:
4
+ desc: null
5
+ value:
6
+ cli_version: 0.10.33
7
+ framework: huggingface
8
+ huggingface_version: 4.9.0.dev0
9
+ is_jupyter_run: false
10
+ is_kaggle_kernel: false
11
+ python_version: 3.8.10
12
+ t:
13
+ 1:
14
+ - 3
15
+ - 11
16
+ 2:
17
+ - 3
18
+ - 11
19
+ 4: 3.8.10
20
+ 5: 0.10.33
21
+ 6: 4.9.0.dev0
22
+ 8:
23
+ - 5
24
+ adafactor:
25
+ desc: null
26
+ value: true
27
+ adam_beta1:
28
+ desc: null
29
+ value: 0.9
30
+ adam_beta2:
31
+ desc: null
32
+ value: 0.98
33
+ adam_epsilon:
34
+ desc: null
35
+ value: 1.0e-08
36
+ cache_dir:
37
+ desc: null
38
+ value: null
39
+ config_name:
40
+ desc: null
41
+ value: ./
42
+ dataloader_drop_last:
43
+ desc: null
44
+ value: false
45
+ dataloader_num_workers:
46
+ desc: null
47
+ value: 0
48
+ dataloader_pin_memory:
49
+ desc: null
50
+ value: true
51
+ dataset_config_name:
52
+ desc: null
53
+ value: null
54
+ dataset_name:
55
+ desc: null
56
+ value: null
57
+ ddp_find_unused_parameters:
58
+ desc: null
59
+ value: null
60
+ debug:
61
+ desc: null
62
+ value: []
63
+ deepspeed:
64
+ desc: null
65
+ value: null
66
+ disable_tqdm:
67
+ desc: null
68
+ value: false
69
+ do_eval:
70
+ desc: null
71
+ value: false
72
+ do_predict:
73
+ desc: null
74
+ value: false
75
+ do_train:
76
+ desc: null
77
+ value: false
78
+ dtype:
79
+ desc: null
80
+ value: bfloat16
81
+ eval_accumulation_steps:
82
+ desc: null
83
+ value: null
84
+ eval_steps:
85
+ desc: null
86
+ value: 92768
87
+ evaluation_strategy:
88
+ desc: null
89
+ value: IntervalStrategy.NO
90
+ fp16:
91
+ desc: null
92
+ value: false
93
+ fp16_backend:
94
+ desc: null
95
+ value: auto
96
+ fp16_full_eval:
97
+ desc: null
98
+ value: false
99
+ fp16_opt_level:
100
+ desc: null
101
+ value: O1
102
+ gradient_accumulation_steps:
103
+ desc: null
104
+ value: 8
105
+ greater_is_better:
106
+ desc: null
107
+ value: null
108
+ group_by_length:
109
+ desc: null
110
+ value: false
111
+ ignore_data_skip:
112
+ desc: null
113
+ value: false
114
+ label_names:
115
+ desc: null
116
+ value: null
117
+ label_smoothing_factor:
118
+ desc: null
119
+ value: 0.0
120
+ learning_rate:
121
+ desc: null
122
+ value: 5.0e-05
123
+ length_column_name:
124
+ desc: null
125
+ value: length
126
+ line_by_line:
127
+ desc: null
128
+ value: false
129
+ load_best_model_at_end:
130
+ desc: null
131
+ value: false
132
+ local_rank:
133
+ desc: null
134
+ value: -1
135
+ log_level:
136
+ desc: null
137
+ value: -1
138
+ log_level_replica:
139
+ desc: null
140
+ value: -1
141
+ log_on_each_node:
142
+ desc: null
143
+ value: true
144
+ logging_dir:
145
+ desc: null
146
+ value: ./runs/Jul13_00-52-13_t1v-n-f5c06ea1-w-0
147
+ logging_first_step:
148
+ desc: null
149
+ value: false
150
+ logging_steps:
151
+ desc: null
152
+ value: 500
153
+ logging_strategy:
154
+ desc: null
155
+ value: IntervalStrategy.STEPS
156
+ lr_scheduler_type:
157
+ desc: null
158
+ value: SchedulerType.LINEAR
159
+ max_grad_norm:
160
+ desc: null
161
+ value: 1.0
162
+ max_seq_length:
163
+ desc: null
164
+ value: 4096
165
+ max_steps:
166
+ desc: null
167
+ value: -1
168
+ metric_for_best_model:
169
+ desc: null
170
+ value: null
171
+ mlm_probability:
172
+ desc: null
173
+ value: 0.15
174
+ model_name_or_path:
175
+ desc: null
176
+ value: null
177
+ model_type:
178
+ desc: null
179
+ value: big_bird
180
+ mp_parameters:
181
+ desc: null
182
+ value: ''
183
+ no_cuda:
184
+ desc: null
185
+ value: false
186
+ num_train_epochs:
187
+ desc: null
188
+ value: 5.0
189
+ output_dir:
190
+ desc: null
191
+ value: ./
192
+ overwrite_cache:
193
+ desc: null
194
+ value: false
195
+ overwrite_output_dir:
196
+ desc: null
197
+ value: true
198
+ pad_to_max_length:
199
+ desc: null
200
+ value: false
201
+ past_index:
202
+ desc: null
203
+ value: -1
204
+ per_device_eval_batch_size:
205
+ desc: null
206
+ value: 4
207
+ per_device_train_batch_size:
208
+ desc: null
209
+ value: 4
210
+ per_gpu_eval_batch_size:
211
+ desc: null
212
+ value: null
213
+ per_gpu_train_batch_size:
214
+ desc: null
215
+ value: null
216
+ prediction_loss_only:
217
+ desc: null
218
+ value: false
219
+ preprocessing_num_workers:
220
+ desc: null
221
+ value: 64
222
+ push_to_hub:
223
+ desc: null
224
+ value: true
225
+ push_to_hub_model_id:
226
+ desc: null
227
+ value: ''
228
+ push_to_hub_organization:
229
+ desc: null
230
+ value: null
231
+ push_to_hub_token:
232
+ desc: null
233
+ value: null
234
+ remove_unused_columns:
235
+ desc: null
236
+ value: true
237
+ report_to:
238
+ desc: null
239
+ value:
240
+ - tensorboard
241
+ - wandb
242
+ resume_from_checkpoint:
243
+ desc: null
244
+ value: null
245
+ run_name:
246
+ desc: null
247
+ value: ./
248
+ save_on_each_node:
249
+ desc: null
250
+ value: false
251
+ save_steps:
252
+ desc: null
253
+ value: 20000
254
+ save_strategy:
255
+ desc: null
256
+ value: IntervalStrategy.STEPS
257
+ save_total_limit:
258
+ desc: null
259
+ value: 5
260
+ seed:
261
+ desc: null
262
+ value: 42
263
+ sharded_ddp:
264
+ desc: null
265
+ value: []
266
+ skip_memory_metrics:
267
+ desc: null
268
+ value: true
269
+ tokenizer_name:
270
+ desc: null
271
+ value: ./
272
+ tpu_metrics_debug:
273
+ desc: null
274
+ value: false
275
+ tpu_num_cores:
276
+ desc: null
277
+ value: null
278
+ train_file:
279
+ desc: null
280
+ value: null
281
+ train_ref_file:
282
+ desc: null
283
+ value: null
284
+ use_fast_tokenizer:
285
+ desc: null
286
+ value: true
287
+ use_legacy_prediction_loop:
288
+ desc: null
289
+ value: false
290
+ validation_file:
291
+ desc: null
292
+ value: null
293
+ validation_ref_file:
294
+ desc: null
295
+ value: null
296
+ validation_split_percentage:
297
+ desc: null
298
+ value: 5
299
+ warmup_ratio:
300
+ desc: null
301
+ value: 0.0
302
+ warmup_steps:
303
+ desc: null
304
+ value: 5000
305
+ weight_decay:
306
+ desc: null
307
+ value: 0.0095
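The output.log added below records the failure behind that exit code: the pmapped train_step exhausts TPU HBM (20.61G used against 15.48G available). The allocation sizes it reports are consistent with the config above; a minimal sketch of the arithmetic, assuming 2 bytes per bf16 element (the 50,358 figure is the vocabulary size implied by the logits shape in the log, not a value from the config):

    # Hypothetical size check for the two largest allocations in the OOM report.
    bytes_bf16 = 2

    # 1. MLM logits bf16[4, 4096, 50358] -> ~1.54 GiB, matching "Size: 1.54G".
    logits_bytes = 4 * 4096 * 50358 * bytes_bf16
    print(logits_bytes / 2**30)   # ~1.54

    # 2. BigBird block-sparse attention intermediate bf16[4, 12, 60, 64, 512]
    #    -> ~180 MiB unpadded, doubled to 360M by padding in the report.
    attn_bytes = 4 * 12 * 60 * 64 * 512 * bytes_bf16
    print(attn_bytes / 2**20)     # ~180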
wandb/run-20210713_005301-2ilkub1o/files/output.log ADDED
@@ -0,0 +1,376 @@
1
+ /home/dat/pino/lib/python3.8/site-packages/jax/_src/numpy/lax_numpy.py:3114: UserWarning: Explicitly requested dtype <class 'jax._src.numpy.lax_numpy.int64'> requested in zeros is not available, and will be truncated to dtype int32. To enable more dtypes, set the jax_enable_x64 configuration option or the JAX_ENABLE_X64 shell environment variable. See https://github.com/google/jax#current-gotchas for more.
2
+ lax._check_user_dtype_supported(dtype, "zeros")
3
+ /home/dat/pino/lib/python3.8/site-packages/jax/lib/xla_bridge.py:382: UserWarning: jax.host_count has been renamed to jax.process_count. This alias will eventually be removed; please update your code.
4
+ warnings.warn(
5
+ /home/dat/pino/lib/python3.8/site-packages/jax/lib/xla_bridge.py:369: UserWarning: jax.host_id has been renamed to jax.process_index. This alias will eventually be removed; please update your code.
6
+ warnings.warn(
7
+ Epoch ... (1/5): 0%| | 0/5 [00:00<?, ?it/s]
8
+ Epoch ... (1/5): 0%| | 0/5 [02:23<?, ?it/s]
9
+ Traceback (most recent call last):
10
+ File "./run_mlm_flax.py", line 709, in <module>
11
+ state, train_metric, dropout_rngs = p_train_step(state, model_inputs, dropout_rngs)
12
+ File "/home/dat/pino/lib/python3.8/site-packages/jax/_src/traceback_util.py", line 183, in reraise_with_filtered_traceback
13
+ return fun(*args, **kwargs)
14
+ File "/home/dat/pino/lib/python3.8/site-packages/jax/_src/api.py", line 1647, in f_pmapped
15
+ out = pxla.xla_pmap(
16
+ File "/home/dat/pino/lib/python3.8/site-packages/jax/core.py", line 1620, in bind
17
+ return call_bind(self, fun, *args, **params)
18
+ File "/home/dat/pino/lib/python3.8/site-packages/jax/core.py", line 1551, in call_bind
19
+ outs = primitive.process(top_trace, fun, tracers, params)
20
+ File "/home/dat/pino/lib/python3.8/site-packages/jax/core.py", line 1623, in process
21
+ return trace.process_map(self, fun, tracers, params)
22
+ File "/home/dat/pino/lib/python3.8/site-packages/jax/core.py", line 606, in process_call
23
+ return primitive.impl(f, *tracers, **params)
24
+ File "/home/dat/pino/lib/python3.8/site-packages/jax/interpreters/pxla.py", line 624, in xla_pmap_impl
25
+ compiled_fun, fingerprint = parallel_callable(fun, backend, axis_name, axis_size,
26
+ File "/home/dat/pino/lib/python3.8/site-packages/jax/linear_util.py", line 262, in memoized_fun
27
+ ans = call(fun, *args)
28
+ File "/home/dat/pino/lib/python3.8/site-packages/jax/interpreters/pxla.py", line 899, in parallel_callable
29
+ compiled = xla.backend_compile(backend, built, compile_options)
30
+ File "/home/dat/pino/lib/python3.8/site-packages/jax/interpreters/xla.py", line 360, in backend_compile
31
+ return backend.compile(built_c, compile_options=options)
32
+ jax._src.traceback_util.UnfilteredStackTrace: RuntimeError: Resource exhausted: Ran out of memory in memory space hbm. Used 20.61G of 15.48G hbm. Exceeded hbm capacity by 5.13G.
33
+ Total hbm usage >= 21.13G:
34
+ reserved 530.00M
35
+ program 20.61G
36
+ arguments 0B
37
+ Output size 0B; shares 0B with arguments.
38
+ Program hbm requirement 20.61G:
39
+ global 900.0K
40
+ scoped 924.0K
41
+ HLO temp 20.61G (63.0% utilization: Unpadded (12.43G) Padded (19.71G), 4.4% fragmentation (918.84M))
42
+ Largest program allocations in hbm:
43
+ 1. Size: 1.54G
44
+ Operator: op_type="dot_general" op_name="pmap(train_step)/dot_general[ dimension_numbers=(((2,), (0,)), ((), ()))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/pino/lib/python3.8/site-packages/flax/linen/linear.py" source_line=175
45
+ Shape: bf16[4,4096,50358]{1,2,0:T(8,128)(2,1)}
46
+ Unpadded size: 1.54G
47
+ Extra memory due to padding: 64.0K (1.0x expansion)
48
+ XLA label: %fusion.3615.remat4 = bf16[4,4096,50358]{1,2,0:T(8,128)(2,1)} fusion(bf16[50358,768]{1,0:T(8,128)(2,1)} %get-tuple-element.22628, f32[768]{0:T(1024)} %fusion.10158, f32[768]{0:T(1024)} %fusion.10159, f32[4,4096]{1,0:T(4,128)} %get-tuple-element.20129, f32[...
49
+ Allocation type: HLO temp
50
+ ==========================
51
+ 2. Size: 360.00M
52
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
53
+ Unpadded size: 180.00M
54
+ Extra memory due to padding: 180.00M (2.0x expansion)
55
+ XLA label: %fusion.2444.remat_uncompressed = bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)} copy(bf16[4,12,60,64,512]{4,3,2,1,0:T(8,128)(2,1)} %fusion.2444.remat_compressed)
56
+ Allocation type: HLO temp
57
+ ==========================
58
+ 3. Size: 360.00M
59
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
60
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
61
+ Unpadded size: 180.00M
62
+ Extra memory due to padding: 180.00M (2.0x expansion)
63
+ XLA label: %fusion.2454.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2804, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7916, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
64
+ Allocation type: HLO temp
65
+ ==========================
66
+ 4. Size: 360.00M
67
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
68
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
69
+ Unpadded size: 180.00M
70
+ Extra memory due to padding: 180.00M (2.0x expansion)
71
+ XLA label: %fusion.2453.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2803, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7915, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
72
+ Allocation type: HLO temp
73
+ ==========================
74
+ 5. Size: 360.00M
75
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
76
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
77
+ Unpadded size: 180.00M
78
+ Extra memory due to padding: 180.00M (2.0x expansion)
79
+ XLA label: %fusion.2452.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2802, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7914, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
80
+ Allocation type: HLO temp
81
+ ==========================
82
+ 6. Size: 360.00M
83
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
84
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
85
+ Unpadded size: 180.00M
86
+ Extra memory due to padding: 180.00M (2.0x expansion)
87
+ XLA label: %fusion.2451.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2801, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7913, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
88
+ Allocation type: HLO temp
89
+ ==========================
90
+ 7. Size: 360.00M
91
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
92
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
93
+ Unpadded size: 180.00M
94
+ Extra memory due to padding: 180.00M (2.0x expansion)
95
+ XLA label: %fusion.2445 = bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)} fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2795, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7907, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)} %get-tuple-element.20342, f32[4,12,60,64,192]{3,4,2,1...
96
+ Allocation type: HLO temp
97
+ ==========================
98
+ 8. Size: 360.00M
99
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
100
+ Unpadded size: 180.00M
101
+ Extra memory due to padding: 180.00M (2.0x expansion)
102
+ XLA label: %fusion.2443.remat_uncompressed = bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)} copy(bf16[4,12,60,64,512]{4,3,2,1,0:T(8,128)(2,1)} %fusion.2443.remat_compressed)
103
+ Allocation type: HLO temp
104
+ ==========================
105
+ 9. Size: 360.00M
106
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
107
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
108
+ Unpadded size: 180.00M
109
+ Extra memory due to padding: 180.00M (2.0x expansion)
110
+ XLA label: %fusion.2450.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2800, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7912, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
111
+ Allocation type: HLO temp
112
+ ==========================
113
+ 10. Size: 360.00M
114
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
115
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
116
+ Unpadded size: 180.00M
117
+ Extra memory due to padding: 180.00M (2.0x expansion)
118
+ XLA label: %fusion.2449.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2799, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7911, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
119
+ Allocation type: HLO temp
120
+ ==========================
121
+ 11. Size: 360.00M
122
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
123
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
124
+ Unpadded size: 180.00M
125
+ Extra memory due to padding: 180.00M (2.0x expansion)
126
+ XLA label: %fusion.2448.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2798, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7910, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
127
+ Allocation type: HLO temp
128
+ ==========================
129
+ 12. Size: 360.00M
130
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
131
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
132
+ Unpadded size: 180.00M
133
+ Extra memory due to padding: 180.00M (2.0x expansion)
134
+ XLA label: %fusion.2447.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2797, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7909, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
135
+ Allocation type: HLO temp
136
+ ==========================
137
+ 13. Size: 360.00M
138
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
139
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
140
+ Unpadded size: 180.00M
141
+ Extra memory due to padding: 180.00M (2.0x expansion)
142
+ XLA label: %fusion.2446.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2796, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7908, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
143
+ Allocation type: HLO temp
144
+ ==========================
145
+ 14. Size: 270.00M
146
+ Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=584
147
+ Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
148
+ Unpadded size: 135.00M
149
+ Extra memory due to padding: 135.00M (2.0x expansion)
150
+ XLA label: %fusion.2689.remat3 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} fusion(f32[4,60,64,192]{2,3,1,0:T(8,128)} %get-tuple-element.20556, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.14362, bf16[4,12,60,192,64]{3,2,4,1,0:T(8,128)(2,1)} %fusion.2964), kind=kO...
151
+ Allocation type: HLO temp
152
+ ==========================
153
+ 15. Size: 270.00M
154
+ Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=584
155
+ Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
156
+ Unpadded size: 135.00M
157
+ Extra memory due to padding: 135.00M (2.0x expansion)
158
+ XLA label: %fusion.2690.remat3 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} fusion(f32[4,60,64,192]{2,3,1,0:T(8,128)} %get-tuple-element.20556, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.14296, bf16[4,12,60,192,64]{3,2,4,1,0:T(8,128)(2,1)} %fusion.2962), kind=kO...
159
+ Allocation type: HLO temp
160
+ ==========================
161
+ 16. Size: 270.00M
162
+ Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=584
163
+ Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
164
+ Unpadded size: 135.00M
165
+ Extra memory due to padding: 135.00M (2.0x expansion)
166
+ XLA label: %fusion.2688.remat3 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} fusion(f32[4,60,64,192]{2,3,1,0:T(8,128)} %get-tuple-element.20556, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.14428, bf16[4,12,60,192,64]{3,2,4,1,0:T(8,128)(2,1)} %fusion.2966), kind=kO...
167
+ Allocation type: HLO temp
168
+ ==========================
169
+ 17. Size: 270.00M
170
+ Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=584
171
+ Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
172
+ Unpadded size: 135.00M
173
+ Extra memory due to padding: 135.00M (2.0x expansion)
174
+ XLA label: %fusion.2691.remat3 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} fusion(f32[4,60,64,192]{2,3,1,0:T(8,128)} %get-tuple-element.20556, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.14230, bf16[4,12,60,192,64]{3,2,4,1,0:T(8,128)(2,1)} %fusion.2960), kind=kO...
175
+ Allocation type: HLO temp
176
+ ==========================
177
+ 18. Size: 270.00M
178
+ Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=584
179
+ Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
180
+ Unpadded size: 135.00M
181
+ Extra memory due to padding: 135.00M (2.0x expansion)
182
+ XLA label: %fusion.2692.remat3 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} fusion(f32[4,60,64,192]{2,3,1,0:T(8,128)} %get-tuple-element.20556, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.14164, bf16[4,12,60,192,64]{3,2,4,1,0:T(8,128)(2,1)} %fusion.2958), kind=kO...
183
+ Allocation type: HLO temp
184
+ ==========================
185
+ 19. Size: 270.00M
186
+ Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=584
187
+ Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
188
+ Unpadded size: 135.00M
189
+ Extra memory due to padding: 135.00M (2.0x expansion)
190
+ XLA label: %fusion.2693.remat3 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} fusion(f32[4,60,64,192]{2,3,1,0:T(8,128)} %get-tuple-element.20556, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.14098, bf16[4,12,60,192,64]{3,2,4,1,0:T(8,128)(2,1)} %fusion.2956), kind=kO...
191
+ Allocation type: HLO temp
192
+ ==========================
193
+ 20. Size: 270.00M
194
+ Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
195
+ Unpadded size: 135.00M
196
+ Extra memory due to padding: 135.00M (2.0x expansion)
197
+ XLA label: %fusion.2616.remat_uncompressed = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} copy(f32[4,12,60,64,192]{4,3,2,1,0:T(8,128)} %fusion.2616.remat_compressed)
198
+ Allocation type: HLO temp
199
+ ==========================
200
+ The stack trace below excludes JAX-internal frames.
201
+ The preceding is the original exception that occurred, unmodified.
202
+ --------------------
203
+ The above exception was the direct cause of the following exception:
204
+ Traceback (most recent call last):
205
+ File "./run_mlm_flax.py", line 709, in <module>
206
+ state, train_metric, dropout_rngs = p_train_step(state, model_inputs, dropout_rngs)
207
+ File "/home/dat/pino/lib/python3.8/site-packages/jax/interpreters/xla.py", line 360, in backend_compile
208
+ return backend.compile(built_c, compile_options=options)
209
+ RuntimeError: Resource exhausted: Ran out of memory in memory space hbm. Used 20.61G of 15.48G hbm. Exceeded hbm capacity by 5.13G.
210
+ Total hbm usage >= 21.13G:
211
+ reserved 530.00M
212
+ program 20.61G
213
+ arguments 0B
214
+ Output size 0B; shares 0B with arguments.
215
+ Program hbm requirement 20.61G:
216
+ global 900.0K
217
+ scoped 924.0K
218
+ HLO temp 20.61G (63.0% utilization: Unpadded (12.43G) Padded (19.71G), 4.4% fragmentation (918.84M))
219
+ Largest program allocations in hbm:
220
+ 1. Size: 1.54G
221
+ Operator: op_type="dot_general" op_name="pmap(train_step)/dot_general[ dimension_numbers=(((2,), (0,)), ((), ()))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/pino/lib/python3.8/site-packages/flax/linen/linear.py" source_line=175
222
+ Shape: bf16[4,4096,50358]{1,2,0:T(8,128)(2,1)}
223
+ Unpadded size: 1.54G
224
+ Extra memory due to padding: 64.0K (1.0x expansion)
225
+ XLA label: %fusion.3615.remat4 = bf16[4,4096,50358]{1,2,0:T(8,128)(2,1)} fusion(bf16[50358,768]{1,0:T(8,128)(2,1)} %get-tuple-element.22628, f32[768]{0:T(1024)} %fusion.10158, f32[768]{0:T(1024)} %fusion.10159, f32[4,4096]{1,0:T(4,128)} %get-tuple-element.20129, f32[...
226
+ Allocation type: HLO temp
227
+ ==========================
228
+ 2. Size: 360.00M
229
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
230
+ Unpadded size: 180.00M
231
+ Extra memory due to padding: 180.00M (2.0x expansion)
232
+ XLA label: %fusion.2444.remat_uncompressed = bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)} copy(bf16[4,12,60,64,512]{4,3,2,1,0:T(8,128)(2,1)} %fusion.2444.remat_compressed)
233
+ Allocation type: HLO temp
234
+ ==========================
235
+ 3. Size: 360.00M
236
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
237
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
238
+ Unpadded size: 180.00M
239
+ Extra memory due to padding: 180.00M (2.0x expansion)
240
+ XLA label: %fusion.2454.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2804, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7916, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
241
+ Allocation type: HLO temp
242
+ ==========================
243
+ 4. Size: 360.00M
244
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
245
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
246
+ Unpadded size: 180.00M
247
+ Extra memory due to padding: 180.00M (2.0x expansion)
248
+ XLA label: %fusion.2453.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2803, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7915, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
249
+ Allocation type: HLO temp
250
+ ==========================
251
+ 5. Size: 360.00M
252
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
253
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
254
+ Unpadded size: 180.00M
255
+ Extra memory due to padding: 180.00M (2.0x expansion)
256
+ XLA label: %fusion.2452.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2802, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7914, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
257
+ Allocation type: HLO temp
258
+ ==========================
259
+ 6. Size: 360.00M
260
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
261
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
262
+ Unpadded size: 180.00M
263
+ Extra memory due to padding: 180.00M (2.0x expansion)
264
+ XLA label: %fusion.2451.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2801, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7913, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
265
+ Allocation type: HLO temp
266
+ ==========================
267
+ 7. Size: 360.00M
268
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
269
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
270
+ Unpadded size: 180.00M
271
+ Extra memory due to padding: 180.00M (2.0x expansion)
272
+ XLA label: %fusion.2445 = bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)} fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2795, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7907, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)} %get-tuple-element.20342, f32[4,12,60,64,192]{3,4,2,1...
273
+ Allocation type: HLO temp
274
+ ==========================
275
+ 8. Size: 360.00M
276
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
277
+ Unpadded size: 180.00M
278
+ Extra memory due to padding: 180.00M (2.0x expansion)
279
+ XLA label: %fusion.2443.remat_uncompressed = bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)} copy(bf16[4,12,60,64,512]{4,3,2,1,0:T(8,128)(2,1)} %fusion.2443.remat_compressed)
280
+ Allocation type: HLO temp
281
+ ==========================
282
+ 9. Size: 360.00M
283
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
284
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
285
+ Unpadded size: 180.00M
286
+ Extra memory due to padding: 180.00M (2.0x expansion)
287
+ XLA label: %fusion.2450.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2800, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7912, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
288
+ Allocation type: HLO temp
289
+ ==========================
290
+ 10. Size: 360.00M
291
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
292
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
293
+ Unpadded size: 180.00M
294
+ Extra memory due to padding: 180.00M (2.0x expansion)
295
+ XLA label: %fusion.2449.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2799, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7911, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
296
+ Allocation type: HLO temp
297
+ ==========================
298
+ 11. Size: 360.00M
299
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
300
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
301
+ Unpadded size: 180.00M
302
+ Extra memory due to padding: 180.00M (2.0x expansion)
303
+ XLA label: %fusion.2448.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2798, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7910, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
304
+ Allocation type: HLO temp
305
+ ==========================
306
+ 12. Size: 360.00M
307
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
308
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
309
+ Unpadded size: 180.00M
310
+ Extra memory due to padding: 180.00M (2.0x expansion)
311
+ XLA label: %fusion.2447.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2797, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7909, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
312
+ Allocation type: HLO temp
313
+ ==========================
314
+ 13. Size: 360.00M
315
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
316
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
317
+ Unpadded size: 180.00M
318
+ Extra memory due to padding: 180.00M (2.0x expansion)
319
+ XLA label: %fusion.2446.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2796, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7908, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
320
+ Allocation type: HLO temp
321
+ ==========================
322
+ 14. Size: 270.00M
323
+ Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=584
324
+ Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
325
+ Unpadded size: 135.00M
326
+ Extra memory due to padding: 135.00M (2.0x expansion)
327
+ XLA label: %fusion.2689.remat3 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} fusion(f32[4,60,64,192]{2,3,1,0:T(8,128)} %get-tuple-element.20556, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.14362, bf16[4,12,60,192,64]{3,2,4,1,0:T(8,128)(2,1)} %fusion.2964), kind=kO...
328
+ Allocation type: HLO temp
329
+ ==========================
330
+ 15. Size: 270.00M
331
+ Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=584
332
+ Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
333
+ Unpadded size: 135.00M
334
+ Extra memory due to padding: 135.00M (2.0x expansion)
335
+ XLA label: %fusion.2690.remat3 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} fusion(f32[4,60,64,192]{2,3,1,0:T(8,128)} %get-tuple-element.20556, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.14296, bf16[4,12,60,192,64]{3,2,4,1,0:T(8,128)(2,1)} %fusion.2962), kind=kO...
336
+ Allocation type: HLO temp
337
+ ==========================
338
+ 16. Size: 270.00M
339
+ Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=584
340
+ Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
341
+ Unpadded size: 135.00M
342
+ Extra memory due to padding: 135.00M (2.0x expansion)
343
+ XLA label: %fusion.2688.remat3 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} fusion(f32[4,60,64,192]{2,3,1,0:T(8,128)} %get-tuple-element.20556, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.14428, bf16[4,12,60,192,64]{3,2,4,1,0:T(8,128)(2,1)} %fusion.2966), kind=kO...
344
+ Allocation type: HLO temp
345
+ ==========================
346
+ 17. Size: 270.00M
347
+ Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=584
348
+ Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
349
+ Unpadded size: 135.00M
350
+ Extra memory due to padding: 135.00M (2.0x expansion)
351
+ XLA label: %fusion.2691.remat3 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} fusion(f32[4,60,64,192]{2,3,1,0:T(8,128)} %get-tuple-element.20556, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.14230, bf16[4,12,60,192,64]{3,2,4,1,0:T(8,128)(2,1)} %fusion.2960), kind=kO...
352
+ Allocation type: HLO temp
353
+ ==========================
354
+ 18. Size: 270.00M
355
+ Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=584
356
+ Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
357
+ Unpadded size: 135.00M
358
+ Extra memory due to padding: 135.00M (2.0x expansion)
359
+ XLA label: %fusion.2692.remat3 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} fusion(f32[4,60,64,192]{2,3,1,0:T(8,128)} %get-tuple-element.20556, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.14164, bf16[4,12,60,192,64]{3,2,4,1,0:T(8,128)(2,1)} %fusion.2958), kind=kO...
360
+ Allocation type: HLO temp
361
+ ==========================
362
+ 19. Size: 270.00M
363
+ Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=584
364
+ Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
365
+ Unpadded size: 135.00M
366
+ Extra memory due to padding: 135.00M (2.0x expansion)
367
+ XLA label: %fusion.2693.remat3 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} fusion(f32[4,60,64,192]{2,3,1,0:T(8,128)} %get-tuple-element.20556, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.14098, bf16[4,12,60,192,64]{3,2,4,1,0:T(8,128)(2,1)} %fusion.2956), kind=kO...
368
+ Allocation type: HLO temp
369
+ ==========================
370
+ 20. Size: 270.00M
371
+ Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
372
+ Unpadded size: 135.00M
373
+ Extra memory due to padding: 135.00M (2.0x expansion)
374
+ XLA label: %fusion.2616.remat_uncompressed = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} copy(f32[4,12,60,64,192]{4,3,2,1,0:T(8,128)} %fusion.2616.remat_compressed)
375
+ Allocation type: HLO temp
376
+ ==========================
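A quick back-of-the-envelope check of the buffer sizes reported in the allocation dump above (a sketch, not part of the original log). The recurring bf16[4,12,60,64,512] and f32[4,12,60,64,192] buffers work out to exactly the unpadded 180 MiB and 135 MiB the log reports; the padded 360 MiB / 270 MiB figures come from XLA's 2.0x TPU tile padding and are taken from the log rather than re-derived here.

from math import prod

def unpadded_mib(shape, bytes_per_element):
    # Dense array size in MiB, ignoring XLA's tile padding.
    return prod(shape) * bytes_per_element / 2**20

# bf16 (2 bytes/elem) attention buffers -> 180.0 MiB unpadded, 360 MiB padded per the log
print(unpadded_mib((4, 12, 60, 64, 512), 2))
# f32 (4 bytes/elem) einsum outputs -> 135.0 MiB unpadded, 270 MiB padded per the log
print(unpadded_mib((4, 12, 60, 64, 192), 4))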
wandb/run-20210713_005301-2ilkub1o/files/requirements.txt ADDED
@@ -0,0 +1,92 @@
1
+ absl-py==0.13.0
2
+ aiohttp==3.7.4.post0
3
+ astunparse==1.6.3
4
+ async-timeout==3.0.1
5
+ attrs==21.2.0
6
+ cachetools==4.2.2
7
+ certifi==2021.5.30
8
+ chardet==4.0.0
9
+ chex==0.0.8
10
+ click==8.0.1
11
+ configparser==5.0.2
12
+ cycler==0.10.0
13
+ datasets==1.9.1.dev0
14
+ dill==0.3.4
15
+ dm-tree==0.1.6
16
+ docker-pycreds==0.4.0
17
+ filelock==3.0.12
18
+ flatbuffers==1.12
19
+ flax==0.3.4
20
+ fsspec==2021.6.1
21
+ gast==0.4.0
22
+ gitdb==4.0.7
23
+ gitpython==3.1.18
24
+ google-auth-oauthlib==0.4.4
25
+ google-auth==1.32.1
26
+ google-pasta==0.2.0
27
+ grpcio==1.34.1
28
+ h5py==3.1.0
29
+ huggingface-hub==0.0.12
30
+ idna==2.10
31
+ jax==0.2.16
32
+ jaxlib==0.1.68
33
+ joblib==1.0.1
34
+ keras-nightly==2.5.0.dev2021032900
35
+ keras-preprocessing==1.1.2
36
+ kiwisolver==1.3.1
37
+ libtpu-nightly==0.1.dev20210615
38
+ markdown==3.3.4
39
+ matplotlib==3.4.2
40
+ msgpack==1.0.2
41
+ multidict==5.1.0
42
+ multiprocess==0.70.12.2
43
+ numpy==1.19.5
44
+ oauthlib==3.1.1
45
+ opt-einsum==3.3.0
46
+ optax==0.0.9
47
+ packaging==21.0
48
+ pandas==1.3.0
49
+ pathtools==0.1.2
50
+ pillow==8.3.1
51
+ pip==20.0.2
52
+ pkg-resources==0.0.0
53
+ promise==2.3
54
+ protobuf==3.17.3
55
+ psutil==5.8.0
56
+ pyarrow==4.0.1
57
+ pyasn1-modules==0.2.8
58
+ pyasn1==0.4.8
59
+ pyparsing==2.4.7
60
+ python-dateutil==2.8.1
61
+ pytz==2021.1
62
+ pyyaml==5.4.1
63
+ regex==2021.7.6
64
+ requests-oauthlib==1.3.0
65
+ requests==2.25.1
66
+ rsa==4.7.2
67
+ sacremoses==0.0.45
68
+ scipy==1.7.0
69
+ sentry-sdk==1.3.0
70
+ setuptools==44.0.0
71
+ shortuuid==1.0.1
72
+ six==1.15.0
73
+ smmap==4.0.0
74
+ subprocess32==3.5.4
75
+ tensorboard-data-server==0.6.1
76
+ tensorboard-plugin-wit==1.8.0
77
+ tensorboard==2.5.0
78
+ tensorflow-estimator==2.5.0
79
+ tensorflow==2.5.0
80
+ termcolor==1.1.0
81
+ tokenizers==0.10.3
82
+ toolz==0.11.1
83
+ tqdm==4.61.2
84
+ transformers==4.9.0.dev0
85
+ typing-extensions==3.7.4.3
86
+ urllib3==1.26.6
87
+ wandb==0.10.33
88
+ werkzeug==2.0.1
89
+ wheel==0.36.2
90
+ wrapt==1.12.1
91
+ xxhash==2.0.2
92
+ yarl==1.6.3
wandb/run-20210713_005301-2ilkub1o/files/wandb-metadata.json ADDED
@@ -0,0 +1,46 @@
1
+ {
2
+ "os": "Linux-5.4.0-1043-gcp-x86_64-with-glibc2.29",
3
+ "python": "3.8.10",
4
+ "heartbeatAt": "2021-07-13T00:53:03.462705",
5
+ "startedAt": "2021-07-13T00:53:01.400550",
6
+ "docker": null,
7
+ "cpu_count": 96,
8
+ "cuda": null,
9
+ "args": [
10
+ "--push_to_hub",
11
+ "--output_dir=./",
12
+ "--model_type=big_bird",
13
+ "--config_name=./",
14
+ "--tokenizer_name=./",
15
+ "--max_seq_length=4096",
16
+ "--weight_decay=0.0095",
17
+ "--warmup_steps=5000",
18
+ "--overwrite_output_dir",
19
+ "--adam_beta1=0.9",
20
+ "--adam_beta2=0.98",
21
+ "--logging_steps=500",
22
+ "--eval_steps=92768",
23
+ "--num_train_epochs=5",
24
+ "--preprocessing_num_workers=64",
25
+ "--save_steps=20000",
26
+ "--adafactor",
27
+ "--learning_rate=5e-5",
28
+ "--per_device_train_batch_size=4",
29
+ "--per_device_eval_batch_size=4",
30
+ "--save_total_limit=5",
31
+ "--dtype=bfloat16",
32
+ "--gradient_accumulation_steps=8"
33
+ ],
34
+ "state": "running",
35
+ "program": "./run_mlm_flax.py",
36
+ "codePath": "run_mlm_flax.py",
37
+ "git": {
38
+ "remote": "https://huggingface.co/flax-community/pino-roberta-base",
39
+ "commit": "4229c91b780cf07115cc6d04c16e393b0d2f508c"
40
+ },
41
+ "email": null,
42
+ "root": "/home/dat/pino-roberta-base",
43
+ "host": "t1v-n-f5c06ea1-w-0",
44
+ "username": "dat",
45
+ "executable": "/home/dat/pino/bin/python"
46
+ }
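Taken together, the args logged above imply the following effective batch for this run (a rough sketch; the 8-device count is an assumption for a TPU v3-8 and is not recorded in the metadata).

# Effective batch implied by the args in wandb-metadata.json above (run 2ilkub1o).
per_device_batch = 4        # --per_device_train_batch_size
grad_accum_steps = 8        # --gradient_accumulation_steps
seq_len = 4096              # --max_seq_length
devices = 8                 # assumed TPU v3-8 core count (not logged above)

sequences_per_update = per_device_batch * devices * grad_accum_steps   # 256 sequences
tokens_per_update = sequences_per_update * seq_len                     # 1,048,576 tokens
print(sequences_per_update, tokens_per_update)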
wandb/run-20210713_005301-2ilkub1o/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
1
+ {}
wandb/run-20210713_005301-2ilkub1o/logs/debug-internal.log ADDED
@@ -0,0 +1,168 @@
1
+ 2021-07-13 00:53:02,112 INFO MainThread:327506 [internal.py:wandb_internal():88] W&B internal server running at pid: 327506, started at: 2021-07-13 00:53:02.112234
2
+ 2021-07-13 00:53:02,114 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: check_version
3
+ 2021-07-13 00:53:02,114 INFO WriterThread:327506 [datastore.py:open_for_write():80] open: /home/dat/pino-roberta-base/wandb/run-20210713_005301-2ilkub1o/run-2ilkub1o.wandb
4
+ 2021-07-13 00:53:02,115 DEBUG SenderThread:327506 [sender.py:send():179] send: header
5
+ 2021-07-13 00:53:02,116 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: check_version
6
+ 2021-07-13 00:53:02,154 DEBUG SenderThread:327506 [sender.py:send():179] send: run
7
+ 2021-07-13 00:53:02,328 INFO SenderThread:327506 [dir_watcher.py:__init__():168] watching files in: /home/dat/pino-roberta-base/wandb/run-20210713_005301-2ilkub1o/files
8
+ 2021-07-13 00:53:02,329 INFO SenderThread:327506 [sender.py:_start_run_threads():716] run started: 2ilkub1o with start time 1626137581
9
+ 2021-07-13 00:53:02,345 DEBUG SenderThread:327506 [sender.py:send():179] send: summary
10
+ 2021-07-13 00:53:02,345 INFO SenderThread:327506 [sender.py:_save_file():841] saving file wandb-summary.json with policy end
11
+ 2021-07-13 00:53:02,346 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: run_start
12
+ 2021-07-13 00:53:03,330 INFO Thread-8 :327506 [dir_watcher.py:_on_file_created():216] file/dir created: /home/dat/pino-roberta-base/wandb/run-20210713_005301-2ilkub1o/files/wandb-summary.json
13
+ 2021-07-13 00:53:03,462 DEBUG HandlerThread:327506 [meta.py:__init__():39] meta init
14
+ 2021-07-13 00:53:03,462 DEBUG HandlerThread:327506 [meta.py:__init__():53] meta init done
15
+ 2021-07-13 00:53:03,462 DEBUG HandlerThread:327506 [meta.py:probe():210] probe
16
+ 2021-07-13 00:53:03,463 DEBUG HandlerThread:327506 [meta.py:_setup_git():200] setup git
17
+ 2021-07-13 00:53:03,492 DEBUG HandlerThread:327506 [meta.py:_setup_git():207] setup git done
18
+ 2021-07-13 00:53:03,492 DEBUG HandlerThread:327506 [meta.py:_save_pip():57] save pip
19
+ 2021-07-13 00:53:03,493 DEBUG HandlerThread:327506 [meta.py:_save_pip():71] save pip done
20
+ 2021-07-13 00:53:03,493 DEBUG HandlerThread:327506 [meta.py:probe():252] probe done
21
+ 2021-07-13 00:53:03,496 DEBUG SenderThread:327506 [sender.py:send():179] send: files
22
+ 2021-07-13 00:53:03,496 INFO SenderThread:327506 [sender.py:_save_file():841] saving file wandb-metadata.json with policy now
23
+ 2021-07-13 00:53:03,504 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: stop_status
24
+ 2021-07-13 00:53:03,504 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: stop_status
25
+ 2021-07-13 00:53:03,635 DEBUG SenderThread:327506 [sender.py:send():179] send: config
26
+ 2021-07-13 00:53:03,636 DEBUG SenderThread:327506 [sender.py:send():179] send: config
27
+ 2021-07-13 00:53:03,636 DEBUG SenderThread:327506 [sender.py:send():179] send: config
28
+ 2021-07-13 00:53:03,952 INFO Thread-11 :327506 [upload_job.py:push():137] Uploaded file /tmp/tmpi8r4kiyhwandb/3l6ji67i-wandb-metadata.json
29
+ 2021-07-13 00:53:04,330 INFO Thread-8 :327506 [dir_watcher.py:_on_file_created():216] file/dir created: /home/dat/pino-roberta-base/wandb/run-20210713_005301-2ilkub1o/files/wandb-metadata.json
30
+ 2021-07-13 00:53:04,330 INFO Thread-8 :327506 [dir_watcher.py:_on_file_created():216] file/dir created: /home/dat/pino-roberta-base/wandb/run-20210713_005301-2ilkub1o/files/requirements.txt
31
+ 2021-07-13 00:53:04,330 INFO Thread-8 :327506 [dir_watcher.py:_on_file_created():216] file/dir created: /home/dat/pino-roberta-base/wandb/run-20210713_005301-2ilkub1o/files/output.log
32
+ 2021-07-13 00:53:18,637 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: stop_status
33
+ 2021-07-13 00:53:18,637 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: stop_status
34
+ 2021-07-13 00:53:20,336 INFO Thread-8 :327506 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_005301-2ilkub1o/files/output.log
35
+ 2021-07-13 00:53:22,336 INFO Thread-8 :327506 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_005301-2ilkub1o/files/output.log
36
+ 2021-07-13 00:53:31,548 DEBUG SenderThread:327506 [sender.py:send():179] send: stats
37
+ 2021-07-13 00:53:33,340 INFO Thread-8 :327506 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_005301-2ilkub1o/files/config.yaml
38
+ 2021-07-13 00:53:33,769 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: stop_status
39
+ 2021-07-13 00:53:33,769 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: stop_status
40
+ 2021-07-13 00:53:48,899 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: stop_status
41
+ 2021-07-13 00:53:48,899 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: stop_status
42
+ 2021-07-13 00:54:01,629 DEBUG SenderThread:327506 [sender.py:send():179] send: stats
43
+ 2021-07-13 00:54:04,032 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: stop_status
44
+ 2021-07-13 00:54:04,032 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: stop_status
45
+ 2021-07-13 00:54:12,355 INFO Thread-8 :327506 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_005301-2ilkub1o/files/output.log
46
+ 2021-07-13 00:54:19,178 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: stop_status
47
+ 2021-07-13 00:54:19,179 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: stop_status
48
+ 2021-07-13 00:54:31,708 DEBUG SenderThread:327506 [sender.py:send():179] send: stats
49
+ 2021-07-13 00:54:34,599 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: stop_status
50
+ 2021-07-13 00:54:34,599 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: stop_status
51
+ 2021-07-13 00:54:49,798 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: stop_status
52
+ 2021-07-13 00:54:49,798 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: stop_status
53
+ 2021-07-13 00:55:01,792 DEBUG SenderThread:327506 [sender.py:send():179] send: stats
54
+ 2021-07-13 00:55:04,931 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: stop_status
55
+ 2021-07-13 00:55:04,931 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: stop_status
56
+ 2021-07-13 00:55:20,062 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: stop_status
57
+ 2021-07-13 00:55:20,062 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: stop_status
58
+ 2021-07-13 00:55:31,868 DEBUG SenderThread:327506 [sender.py:send():179] send: stats
59
+ 2021-07-13 00:55:35,203 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: stop_status
60
+ 2021-07-13 00:55:35,204 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: stop_status
61
+ 2021-07-13 00:55:44,391 INFO Thread-8 :327506 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_005301-2ilkub1o/files/output.log
62
+ 2021-07-13 00:55:45,566 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: poll_exit
63
+ 2021-07-13 00:55:45,567 DEBUG SenderThread:327506 [sender.py:send():179] send: telemetry
64
+ 2021-07-13 00:55:45,567 DEBUG SenderThread:327506 [sender.py:send():179] send: exit
65
+ 2021-07-13 00:55:45,567 INFO SenderThread:327506 [sender.py:send_exit():287] handling exit code: 1
66
+ 2021-07-13 00:55:45,567 INFO SenderThread:327506 [sender.py:send_exit():295] send defer
67
+ 2021-07-13 00:55:45,567 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: poll_exit
68
+ 2021-07-13 00:55:45,568 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: defer
69
+ 2021-07-13 00:55:45,568 INFO HandlerThread:327506 [handler.py:handle_request_defer():141] handle defer: 0
70
+ 2021-07-13 00:55:45,568 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: defer
71
+ 2021-07-13 00:55:45,568 INFO SenderThread:327506 [sender.py:send_request_defer():304] handle sender defer: 0
72
+ 2021-07-13 00:55:45,569 INFO SenderThread:327506 [sender.py:transition_state():308] send defer: 1
73
+ 2021-07-13 00:55:45,569 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: defer
74
+ 2021-07-13 00:55:45,569 INFO HandlerThread:327506 [handler.py:handle_request_defer():141] handle defer: 1
75
+ 2021-07-13 00:55:45,601 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: defer
76
+ 2021-07-13 00:55:45,601 INFO SenderThread:327506 [sender.py:send_request_defer():304] handle sender defer: 1
77
+ 2021-07-13 00:55:45,601 INFO SenderThread:327506 [sender.py:transition_state():308] send defer: 2
78
+ 2021-07-13 00:55:45,602 DEBUG SenderThread:327506 [sender.py:send():179] send: stats
79
+ 2021-07-13 00:55:45,602 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: defer
80
+ 2021-07-13 00:55:45,602 INFO HandlerThread:327506 [handler.py:handle_request_defer():141] handle defer: 2
81
+ 2021-07-13 00:55:45,602 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: defer
82
+ 2021-07-13 00:55:45,602 INFO SenderThread:327506 [sender.py:send_request_defer():304] handle sender defer: 2
83
+ 2021-07-13 00:55:45,602 INFO SenderThread:327506 [sender.py:transition_state():308] send defer: 3
84
+ 2021-07-13 00:55:45,603 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: defer
85
+ 2021-07-13 00:55:45,603 INFO HandlerThread:327506 [handler.py:handle_request_defer():141] handle defer: 3
86
+ 2021-07-13 00:55:45,603 DEBUG SenderThread:327506 [sender.py:send():179] send: summary
87
+ 2021-07-13 00:55:45,603 INFO SenderThread:327506 [sender.py:_save_file():841] saving file wandb-summary.json with policy end
88
+ 2021-07-13 00:55:45,603 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: defer
89
+ 2021-07-13 00:55:45,604 INFO SenderThread:327506 [sender.py:send_request_defer():304] handle sender defer: 3
90
+ 2021-07-13 00:55:45,604 INFO SenderThread:327506 [sender.py:transition_state():308] send defer: 4
91
+ 2021-07-13 00:55:45,604 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: defer
92
+ 2021-07-13 00:55:45,604 INFO HandlerThread:327506 [handler.py:handle_request_defer():141] handle defer: 4
93
+ 2021-07-13 00:55:45,604 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: defer
94
+ 2021-07-13 00:55:45,604 INFO SenderThread:327506 [sender.py:send_request_defer():304] handle sender defer: 4
95
+ 2021-07-13 00:55:45,670 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: poll_exit
96
+ 2021-07-13 00:55:45,784 INFO SenderThread:327506 [sender.py:transition_state():308] send defer: 5
97
+ 2021-07-13 00:55:45,784 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: poll_exit
98
+ 2021-07-13 00:55:45,785 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: defer
99
+ 2021-07-13 00:55:45,785 INFO HandlerThread:327506 [handler.py:handle_request_defer():141] handle defer: 5
100
+ 2021-07-13 00:55:45,785 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: defer
101
+ 2021-07-13 00:55:45,785 INFO SenderThread:327506 [sender.py:send_request_defer():304] handle sender defer: 5
102
+ 2021-07-13 00:55:45,786 INFO SenderThread:327506 [dir_watcher.py:finish():282] shutting down directory watcher
103
+ 2021-07-13 00:55:45,887 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: poll_exit
104
+ 2021-07-13 00:55:46,391 INFO Thread-8 :327506 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_005301-2ilkub1o/files/wandb-summary.json
105
+ 2021-07-13 00:55:46,392 INFO SenderThread:327506 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_005301-2ilkub1o/files/config.yaml
106
+ 2021-07-13 00:55:46,392 INFO SenderThread:327506 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_005301-2ilkub1o/files/output.log
107
+ 2021-07-13 00:55:46,392 INFO SenderThread:327506 [dir_watcher.py:finish():312] scan: /home/dat/pino-roberta-base/wandb/run-20210713_005301-2ilkub1o/files
108
+ 2021-07-13 00:55:46,392 INFO SenderThread:327506 [dir_watcher.py:finish():318] scan save: /home/dat/pino-roberta-base/wandb/run-20210713_005301-2ilkub1o/files/requirements.txt requirements.txt
109
+ 2021-07-13 00:55:46,392 INFO SenderThread:327506 [dir_watcher.py:finish():318] scan save: /home/dat/pino-roberta-base/wandb/run-20210713_005301-2ilkub1o/files/output.log output.log
110
+ 2021-07-13 00:55:46,392 INFO SenderThread:327506 [dir_watcher.py:finish():318] scan save: /home/dat/pino-roberta-base/wandb/run-20210713_005301-2ilkub1o/files/wandb-metadata.json wandb-metadata.json
111
+ 2021-07-13 00:55:46,392 INFO SenderThread:327506 [dir_watcher.py:finish():318] scan save: /home/dat/pino-roberta-base/wandb/run-20210713_005301-2ilkub1o/files/config.yaml config.yaml
112
+ 2021-07-13 00:55:46,392 INFO SenderThread:327506 [dir_watcher.py:finish():318] scan save: /home/dat/pino-roberta-base/wandb/run-20210713_005301-2ilkub1o/files/wandb-summary.json wandb-summary.json
113
+ 2021-07-13 00:55:46,393 INFO SenderThread:327506 [sender.py:transition_state():308] send defer: 6
114
+ 2021-07-13 00:55:46,393 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: poll_exit
115
+ 2021-07-13 00:55:46,403 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: defer
116
+ 2021-07-13 00:55:46,403 INFO HandlerThread:327506 [handler.py:handle_request_defer():141] handle defer: 6
117
+ 2021-07-13 00:55:46,405 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: defer
118
+ 2021-07-13 00:55:46,405 INFO SenderThread:327506 [sender.py:send_request_defer():304] handle sender defer: 6
119
+ 2021-07-13 00:55:46,405 INFO SenderThread:327506 [file_pusher.py:finish():177] shutting down file pusher
120
+ 2021-07-13 00:55:46,495 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: poll_exit
121
+ 2021-07-13 00:55:46,496 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: poll_exit
122
+ 2021-07-13 00:55:46,598 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: poll_exit
123
+ 2021-07-13 00:55:46,598 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: poll_exit
124
+ 2021-07-13 00:55:46,700 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: poll_exit
125
+ 2021-07-13 00:55:46,700 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: poll_exit
126
+ 2021-07-13 00:55:46,802 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: poll_exit
127
+ 2021-07-13 00:55:46,802 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: poll_exit
128
+ 2021-07-13 00:55:46,867 INFO Thread-14 :327506 [upload_job.py:push():137] Uploaded file /home/dat/pino-roberta-base/wandb/run-20210713_005301-2ilkub1o/files/config.yaml
129
+ 2021-07-13 00:55:46,874 INFO Thread-15 :327506 [upload_job.py:push():137] Uploaded file /home/dat/pino-roberta-base/wandb/run-20210713_005301-2ilkub1o/files/wandb-summary.json
130
+ 2021-07-13 00:55:46,876 INFO Thread-13 :327506 [upload_job.py:push():137] Uploaded file /home/dat/pino-roberta-base/wandb/run-20210713_005301-2ilkub1o/files/output.log
131
+ 2021-07-13 00:55:46,904 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: poll_exit
132
+ 2021-07-13 00:55:46,905 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: poll_exit
133
+ 2021-07-13 00:55:46,935 INFO Thread-12 :327506 [upload_job.py:push():137] Uploaded file /home/dat/pino-roberta-base/wandb/run-20210713_005301-2ilkub1o/files/requirements.txt
134
+ 2021-07-13 00:55:47,007 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: poll_exit
135
+ 2021-07-13 00:55:47,007 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: poll_exit
136
+ 2021-07-13 00:55:47,109 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: poll_exit
137
+ 2021-07-13 00:55:47,109 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: poll_exit
138
+ 2021-07-13 00:55:47,135 INFO Thread-7 :327506 [sender.py:transition_state():308] send defer: 7
139
+ 2021-07-13 00:55:47,136 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: defer
140
+ 2021-07-13 00:55:47,136 INFO HandlerThread:327506 [handler.py:handle_request_defer():141] handle defer: 7
141
+ 2021-07-13 00:55:47,136 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: defer
142
+ 2021-07-13 00:55:47,136 INFO SenderThread:327506 [sender.py:send_request_defer():304] handle sender defer: 7
143
+ 2021-07-13 00:55:47,211 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: poll_exit
144
+ 2021-07-13 00:55:47,415 INFO SenderThread:327506 [sender.py:transition_state():308] send defer: 8
145
+ 2021-07-13 00:55:47,416 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: poll_exit
146
+ 2021-07-13 00:55:47,416 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: defer
147
+ 2021-07-13 00:55:47,416 INFO HandlerThread:327506 [handler.py:handle_request_defer():141] handle defer: 8
148
+ 2021-07-13 00:55:47,416 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: defer
149
+ 2021-07-13 00:55:47,417 INFO SenderThread:327506 [sender.py:send_request_defer():304] handle sender defer: 8
150
+ 2021-07-13 00:55:47,417 INFO SenderThread:327506 [sender.py:transition_state():308] send defer: 9
151
+ 2021-07-13 00:55:47,417 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: defer
152
+ 2021-07-13 00:55:47,417 INFO HandlerThread:327506 [handler.py:handle_request_defer():141] handle defer: 9
153
+ 2021-07-13 00:55:47,417 DEBUG SenderThread:327506 [sender.py:send():179] send: final
154
+ 2021-07-13 00:55:47,417 DEBUG SenderThread:327506 [sender.py:send():179] send: footer
155
+ 2021-07-13 00:55:47,417 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: defer
156
+ 2021-07-13 00:55:47,418 INFO SenderThread:327506 [sender.py:send_request_defer():304] handle sender defer: 9
157
+ 2021-07-13 00:55:47,518 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: poll_exit
158
+ 2021-07-13 00:55:47,518 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: poll_exit
159
+ 2021-07-13 00:55:47,518 INFO SenderThread:327506 [file_pusher.py:join():182] waiting for file pusher
160
+ 2021-07-13 00:55:47,520 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: get_summary
161
+ 2021-07-13 00:55:47,521 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: sampled_history
162
+ 2021-07-13 00:55:47,521 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: shutdown
163
+ 2021-07-13 00:55:47,521 INFO HandlerThread:327506 [handler.py:finish():638] shutting down handler
164
+ 2021-07-13 00:55:48,418 INFO WriterThread:327506 [datastore.py:close():288] close: /home/dat/pino-roberta-base/wandb/run-20210713_005301-2ilkub1o/run-2ilkub1o.wandb
165
+ 2021-07-13 00:55:48,518 INFO SenderThread:327506 [sender.py:finish():945] shutting down sender
166
+ 2021-07-13 00:55:48,519 INFO SenderThread:327506 [file_pusher.py:finish():177] shutting down file pusher
167
+ 2021-07-13 00:55:48,519 INFO SenderThread:327506 [file_pusher.py:join():182] waiting for file pusher
168
+ 2021-07-13 00:55:48,521 INFO MainThread:327506 [internal.py:handle_exit():78] Internal process exited
wandb/run-20210713_005301-2ilkub1o/logs/debug.log ADDED
@@ -0,0 +1,127 @@
1
+ 2021-07-13 00:53:01,402 INFO MainThread:325900 [wandb_setup.py:_flush():69] setting env: {}
2
+ 2021-07-13 00:53:01,402 INFO MainThread:325900 [wandb_setup.py:_flush():69] setting login settings: {}
3
+ 2021-07-13 00:53:01,402 INFO MainThread:325900 [wandb_init.py:_log_setup():337] Logging user logs to /home/dat/pino-roberta-base/wandb/run-20210713_005301-2ilkub1o/logs/debug.log
4
+ 2021-07-13 00:53:01,402 INFO MainThread:325900 [wandb_init.py:_log_setup():338] Logging internal logs to /home/dat/pino-roberta-base/wandb/run-20210713_005301-2ilkub1o/logs/debug-internal.log
5
+ 2021-07-13 00:53:01,402 INFO MainThread:325900 [wandb_init.py:init():370] calling init triggers
6
+ 2021-07-13 00:53:01,402 INFO MainThread:325900 [wandb_init.py:init():375] wandb.init called with sweep_config: {}
7
+ config: {}
8
+ 2021-07-13 00:53:01,402 INFO MainThread:325900 [wandb_init.py:init():419] starting backend
9
+ 2021-07-13 00:53:01,402 INFO MainThread:325900 [backend.py:_multiprocessing_setup():70] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
10
+ 2021-07-13 00:53:01,457 INFO MainThread:325900 [backend.py:ensure_launched():135] starting backend process...
11
+ 2021-07-13 00:53:01,509 INFO MainThread:325900 [backend.py:ensure_launched():139] started backend process with pid: 327506
12
+ 2021-07-13 00:53:01,511 INFO MainThread:325900 [wandb_init.py:init():424] backend started and connected
13
+ 2021-07-13 00:53:01,514 INFO MainThread:325900 [wandb_init.py:init():472] updated telemetry
14
+ 2021-07-13 00:53:01,515 INFO MainThread:325900 [wandb_init.py:init():491] communicating current version
15
+ 2021-07-13 00:53:02,153 INFO MainThread:325900 [wandb_init.py:init():496] got version response
16
+ 2021-07-13 00:53:02,153 INFO MainThread:325900 [wandb_init.py:init():504] communicating run to backend with 30 second timeout
17
+ 2021-07-13 00:53:02,345 INFO MainThread:325900 [wandb_init.py:init():529] starting run threads in backend
18
+ 2021-07-13 00:53:03,501 INFO MainThread:325900 [wandb_run.py:_console_start():1623] atexit reg
19
+ 2021-07-13 00:53:03,501 INFO MainThread:325900 [wandb_run.py:_redirect():1497] redirect: SettingsConsole.REDIRECT
20
+ 2021-07-13 00:53:03,502 INFO MainThread:325900 [wandb_run.py:_redirect():1502] Redirecting console.
21
+ 2021-07-13 00:53:03,504 INFO MainThread:325900 [wandb_run.py:_redirect():1558] Redirects installed.
22
+ 2021-07-13 00:53:03,504 INFO MainThread:325900 [wandb_init.py:init():554] run started, returning control to user process
23
+ 2021-07-13 00:53:03,510 INFO MainThread:325900 [wandb_run.py:_config_callback():872] config_cb None None {'output_dir': './', 'overwrite_output_dir': True, 'do_train': False, 'do_eval': False, 'do_predict': False, 'evaluation_strategy': 'IntervalStrategy.NO', 'prediction_loss_only': False, 'per_device_train_batch_size': 4, 'per_device_eval_batch_size': 4, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 8, 'eval_accumulation_steps': None, 'learning_rate': 5e-05, 'weight_decay': 0.0095, 'adam_beta1': 0.9, 'adam_beta2': 0.98, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 5.0, 'max_steps': -1, 'lr_scheduler_type': 'SchedulerType.LINEAR', 'warmup_ratio': 0.0, 'warmup_steps': 5000, 'log_level': -1, 'log_level_replica': -1, 'log_on_each_node': True, 'logging_dir': './runs/Jul13_00-52-13_t1v-n-f5c06ea1-w-0', 'logging_strategy': 'IntervalStrategy.STEPS', 'logging_first_step': False, 'logging_steps': 500, 'save_strategy': 'IntervalStrategy.STEPS', 'save_steps': 20000, 'save_total_limit': 5, 'save_on_each_node': False, 'no_cuda': False, 'seed': 42, 'fp16': False, 'fp16_opt_level': 'O1', 'fp16_backend': 'auto', 'fp16_full_eval': False, 'local_rank': -1, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 92768, 'dataloader_num_workers': 0, 'past_index': -1, 'run_name': './', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'sharded_ddp': [], 'deepspeed': None, 'label_smoothing_factor': 0.0, 'adafactor': True, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['tensorboard', 'wandb'], 'ddp_find_unused_parameters': None, 'dataloader_pin_memory': True, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': True, 'resume_from_checkpoint': None, 'push_to_hub_model_id': '', 'push_to_hub_organization': None, 'push_to_hub_token': None, 'mp_parameters': ''}
24
+ 2021-07-13 00:53:03,512 INFO MainThread:325900 [wandb_run.py:_config_callback():872] config_cb None None {'model_name_or_path': None, 'model_type': 'big_bird', 'config_name': './', 'tokenizer_name': './', 'cache_dir': None, 'use_fast_tokenizer': True, 'dtype': 'bfloat16'}
25
+ 2021-07-13 00:53:03,513 INFO MainThread:325900 [wandb_run.py:_config_callback():872] config_cb None None {'dataset_name': None, 'dataset_config_name': None, 'train_file': None, 'validation_file': None, 'train_ref_file': None, 'validation_ref_file': None, 'overwrite_cache': False, 'validation_split_percentage': 5, 'max_seq_length': 4096, 'preprocessing_num_workers': 64, 'mlm_probability': 0.15, 'pad_to_max_length': False, 'line_by_line': False}
26
+ 2021-07-13 00:55:43,384 INFO MainThread:325900 [wandb_run.py:_atexit_cleanup():1593] got exitcode: 1
27
+ 2021-07-13 00:55:43,385 INFO MainThread:325900 [wandb_run.py:_restore():1565] restore
28
+ 2021-07-13 00:55:45,569 INFO MainThread:325900 [wandb_run.py:_wait_for_finish():1715] got exit ret: file_counts {
29
+ wandb_count: 1
30
+ }
31
+ pusher_stats {
32
+ uploaded_bytes: 1417
33
+ total_bytes: 1417
34
+ }
35
+
36
+ 2021-07-13 00:55:45,785 INFO MainThread:325900 [wandb_run.py:_wait_for_finish():1715] got exit ret: file_counts {
37
+ wandb_count: 1
38
+ }
39
+ pusher_stats {
40
+ uploaded_bytes: 1417
41
+ total_bytes: 1417
42
+ }
43
+
44
+ 2021-07-13 00:55:46,394 INFO MainThread:325900 [wandb_run.py:_wait_for_finish():1715] got exit ret: file_counts {
45
+ wandb_count: 4
46
+ }
47
+ pusher_stats {
48
+ uploaded_bytes: 1417
49
+ total_bytes: 40394
50
+ }
51
+
52
+ 2021-07-13 00:55:46,496 INFO MainThread:325900 [wandb_run.py:_wait_for_finish():1715] got exit ret: file_counts {
53
+ wandb_count: 5
54
+ }
55
+ pusher_stats {
56
+ uploaded_bytes: 1417
57
+ total_bytes: 40396
58
+ }
59
+
60
+ 2021-07-13 00:55:46,598 INFO MainThread:325900 [wandb_run.py:_wait_for_finish():1715] got exit ret: file_counts {
61
+ wandb_count: 5
62
+ }
63
+ pusher_stats {
64
+ uploaded_bytes: 40396
65
+ total_bytes: 40396
66
+ }
67
+
68
+ 2021-07-13 00:55:46,701 INFO MainThread:325900 [wandb_run.py:_wait_for_finish():1715] got exit ret: file_counts {
69
+ wandb_count: 5
70
+ }
71
+ pusher_stats {
72
+ uploaded_bytes: 40396
73
+ total_bytes: 40396
74
+ }
75
+
76
+ 2021-07-13 00:55:46,803 INFO MainThread:325900 [wandb_run.py:_wait_for_finish():1715] got exit ret: file_counts {
77
+ wandb_count: 5
78
+ }
79
+ pusher_stats {
80
+ uploaded_bytes: 40396
81
+ total_bytes: 40396
82
+ }
83
+
84
+ 2021-07-13 00:55:46,905 INFO MainThread:325900 [wandb_run.py:_wait_for_finish():1715] got exit ret: file_counts {
85
+ wandb_count: 5
86
+ }
87
+ pusher_stats {
88
+ uploaded_bytes: 40396
89
+ total_bytes: 40396
90
+ }
91
+
92
+ 2021-07-13 00:55:47,008 INFO MainThread:325900 [wandb_run.py:_wait_for_finish():1715] got exit ret: file_counts {
93
+ wandb_count: 5
94
+ }
95
+ pusher_stats {
96
+ uploaded_bytes: 40396
97
+ total_bytes: 40396
98
+ }
99
+
100
+ 2021-07-13 00:55:47,109 INFO MainThread:325900 [wandb_run.py:_wait_for_finish():1715] got exit ret: file_counts {
101
+ wandb_count: 5
102
+ }
103
+ pusher_stats {
104
+ uploaded_bytes: 40396
105
+ total_bytes: 40396
106
+ }
107
+
108
+ 2021-07-13 00:55:47,416 INFO MainThread:325900 [wandb_run.py:_wait_for_finish():1715] got exit ret: file_counts {
109
+ wandb_count: 5
110
+ }
111
+ pusher_stats {
112
+ uploaded_bytes: 40396
113
+ total_bytes: 40396
114
+ }
115
+
116
+ 2021-07-13 00:55:47,519 INFO MainThread:325900 [wandb_run.py:_wait_for_finish():1715] got exit ret: done: true
117
+ exit_result {
118
+ }
119
+ file_counts {
120
+ wandb_count: 5
121
+ }
122
+ pusher_stats {
123
+ uploaded_bytes: 40396
124
+ total_bytes: 40396
125
+ }
126
+
127
+ 2021-07-13 00:55:48,779 INFO MainThread:325900 [wandb_run.py:_show_files():1937] logging synced files
wandb/run-20210713_005301-2ilkub1o/run-2ilkub1o.wandb ADDED
Binary file (37.4 kB).
wandb/run-20210713_005751-1wnn0lyf/files/config.yaml ADDED
@@ -0,0 +1,304 @@
1
+ wandb_version: 1
2
+
3
+ _wandb:
4
+ desc: null
5
+ value:
6
+ cli_version: 0.10.33
7
+ framework: huggingface
8
+ huggingface_version: 4.9.0.dev0
9
+ is_jupyter_run: false
10
+ is_kaggle_kernel: false
11
+ python_version: 3.8.10
12
+ t:
13
+ 1:
14
+ - 3
15
+ - 11
16
+ 4: 3.8.10
17
+ 5: 0.10.33
18
+ 6: 4.9.0.dev0
19
+ 8:
20
+ - 5
21
+ adafactor:
22
+ desc: null
23
+ value: false
24
+ adam_beta1:
25
+ desc: null
26
+ value: 0.9
27
+ adam_beta2:
28
+ desc: null
29
+ value: 0.98
30
+ adam_epsilon:
31
+ desc: null
32
+ value: 1.0e-08
33
+ cache_dir:
34
+ desc: null
35
+ value: null
36
+ config_name:
37
+ desc: null
38
+ value: ./
39
+ dataloader_drop_last:
40
+ desc: null
41
+ value: false
42
+ dataloader_num_workers:
43
+ desc: null
44
+ value: 0
45
+ dataloader_pin_memory:
46
+ desc: null
47
+ value: true
48
+ dataset_config_name:
49
+ desc: null
50
+ value: null
51
+ dataset_name:
52
+ desc: null
53
+ value: null
54
+ ddp_find_unused_parameters:
55
+ desc: null
56
+ value: null
57
+ debug:
58
+ desc: null
59
+ value: []
60
+ deepspeed:
61
+ desc: null
62
+ value: null
63
+ disable_tqdm:
64
+ desc: null
65
+ value: false
66
+ do_eval:
67
+ desc: null
68
+ value: false
69
+ do_predict:
70
+ desc: null
71
+ value: false
72
+ do_train:
73
+ desc: null
74
+ value: false
75
+ dtype:
76
+ desc: null
77
+ value: bfloat16
78
+ eval_accumulation_steps:
79
+ desc: null
80
+ value: null
81
+ eval_steps:
82
+ desc: null
83
+ value: 92768
84
+ evaluation_strategy:
85
+ desc: null
86
+ value: IntervalStrategy.NO
87
+ fp16:
88
+ desc: null
89
+ value: false
90
+ fp16_backend:
91
+ desc: null
92
+ value: auto
93
+ fp16_full_eval:
94
+ desc: null
95
+ value: false
96
+ fp16_opt_level:
97
+ desc: null
98
+ value: O1
99
+ gradient_accumulation_steps:
100
+ desc: null
101
+ value: 1
102
+ greater_is_better:
103
+ desc: null
104
+ value: null
105
+ group_by_length:
106
+ desc: null
107
+ value: false
108
+ ignore_data_skip:
109
+ desc: null
110
+ value: false
111
+ label_names:
112
+ desc: null
113
+ value: null
114
+ label_smoothing_factor:
115
+ desc: null
116
+ value: 0.0
117
+ learning_rate:
118
+ desc: null
119
+ value: 5.0e-05
120
+ length_column_name:
121
+ desc: null
122
+ value: length
123
+ line_by_line:
124
+ desc: null
125
+ value: false
126
+ load_best_model_at_end:
127
+ desc: null
128
+ value: false
129
+ local_rank:
130
+ desc: null
131
+ value: -1
132
+ log_level:
133
+ desc: null
134
+ value: -1
135
+ log_level_replica:
136
+ desc: null
137
+ value: -1
138
+ log_on_each_node:
139
+ desc: null
140
+ value: true
141
+ logging_dir:
142
+ desc: null
143
+ value: ./runs/Jul13_00-57-01_t1v-n-f5c06ea1-w-0
144
+ logging_first_step:
145
+ desc: null
146
+ value: false
147
+ logging_steps:
148
+ desc: null
149
+ value: 500
150
+ logging_strategy:
151
+ desc: null
152
+ value: IntervalStrategy.STEPS
153
+ lr_scheduler_type:
154
+ desc: null
155
+ value: SchedulerType.LINEAR
156
+ max_grad_norm:
157
+ desc: null
158
+ value: 1.0
159
+ max_seq_length:
160
+ desc: null
161
+ value: 4096
162
+ max_steps:
163
+ desc: null
164
+ value: -1
165
+ metric_for_best_model:
166
+ desc: null
167
+ value: null
168
+ mlm_probability:
169
+ desc: null
170
+ value: 0.15
171
+ model_name_or_path:
172
+ desc: null
173
+ value: null
174
+ model_type:
175
+ desc: null
176
+ value: big_bird
177
+ mp_parameters:
178
+ desc: null
179
+ value: ''
180
+ no_cuda:
181
+ desc: null
182
+ value: false
183
+ num_train_epochs:
184
+ desc: null
185
+ value: 5.0
186
+ output_dir:
187
+ desc: null
188
+ value: ./
189
+ overwrite_cache:
190
+ desc: null
191
+ value: false
192
+ overwrite_output_dir:
193
+ desc: null
194
+ value: true
195
+ pad_to_max_length:
196
+ desc: null
197
+ value: false
198
+ past_index:
199
+ desc: null
200
+ value: -1
201
+ per_device_eval_batch_size:
202
+ desc: null
203
+ value: 4
204
+ per_device_train_batch_size:
205
+ desc: null
206
+ value: 4
207
+ per_gpu_eval_batch_size:
208
+ desc: null
209
+ value: null
210
+ per_gpu_train_batch_size:
211
+ desc: null
212
+ value: null
213
+ prediction_loss_only:
214
+ desc: null
215
+ value: false
216
+ preprocessing_num_workers:
217
+ desc: null
218
+ value: 64
219
+ push_to_hub:
220
+ desc: null
221
+ value: true
222
+ push_to_hub_model_id:
223
+ desc: null
224
+ value: ''
225
+ push_to_hub_organization:
226
+ desc: null
227
+ value: null
228
+ push_to_hub_token:
229
+ desc: null
230
+ value: null
231
+ remove_unused_columns:
232
+ desc: null
233
+ value: true
234
+ report_to:
235
+ desc: null
236
+ value:
237
+ - tensorboard
238
+ - wandb
239
+ resume_from_checkpoint:
240
+ desc: null
241
+ value: null
242
+ run_name:
243
+ desc: null
244
+ value: ./
245
+ save_on_each_node:
246
+ desc: null
247
+ value: false
248
+ save_steps:
249
+ desc: null
250
+ value: 20000
251
+ save_strategy:
252
+ desc: null
253
+ value: IntervalStrategy.STEPS
254
+ save_total_limit:
255
+ desc: null
256
+ value: 5
257
+ seed:
258
+ desc: null
259
+ value: 42
260
+ sharded_ddp:
261
+ desc: null
262
+ value: []
263
+ skip_memory_metrics:
264
+ desc: null
265
+ value: true
266
+ tokenizer_name:
267
+ desc: null
268
+ value: ./
269
+ tpu_metrics_debug:
270
+ desc: null
271
+ value: false
272
+ tpu_num_cores:
273
+ desc: null
274
+ value: null
275
+ train_file:
276
+ desc: null
277
+ value: null
278
+ train_ref_file:
279
+ desc: null
280
+ value: null
281
+ use_fast_tokenizer:
282
+ desc: null
283
+ value: true
284
+ use_legacy_prediction_loop:
285
+ desc: null
286
+ value: false
287
+ validation_file:
288
+ desc: null
289
+ value: null
290
+ validation_ref_file:
291
+ desc: null
292
+ value: null
293
+ validation_split_percentage:
294
+ desc: null
295
+ value: 5
296
+ warmup_ratio:
297
+ desc: null
298
+ value: 0.0
299
+ warmup_steps:
300
+ desc: null
301
+ value: 5000
302
+ weight_decay:
303
+ desc: null
304
+ value: 0.0095
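For reference, the largest allocation in the OOM dump that follows (output.log for run 1wnn0lyf) is a single bf16[4,4096,50358] buffer, presumably the per-device MLM logits (batch 4 × 4,096 tokens × ~50k vocab). A quick size check, not part of the original log:

# Size of the 1.54G allocation reported in the OOM dump below.
# bf16[4, 4096, 50358]: presumably (per-device batch, seq_len, vocab_size) logits.
elements = 4 * 4096 * 50358
gib = elements * 2 / 2**30          # bf16 = 2 bytes per element
print(round(gib, 2))                # ~1.54, matching the log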
wandb/run-20210713_005751-1wnn0lyf/files/output.log ADDED
@@ -0,0 +1,216 @@
1
+ /home/dat/pino/lib/python3.8/site-packages/jax/lib/xla_bridge.py:382: UserWarning: jax.host_count has been renamed to jax.process_count. This alias will eventually be removed; please update your code.
2
+ warnings.warn(
3
+ /home/dat/pino/lib/python3.8/site-packages/jax/lib/xla_bridge.py:369: UserWarning: jax.host_id has been renamed to jax.process_index. This alias will eventually be removed; please update your code.
4
+ warnings.warn(
5
+ Epoch ... (1/5): 0%| | 0/5 [00:00<?, ?it/s]
6
+ Traceback (most recent call last): | 0/46383 [00:00<?, ?it/s]
7
+ File "/usr/lib/python3.8/threading.py", line 932, in _bootstrap_inner
8
+ self.run()
9
+ File "/usr/lib/python3.8/threading.py", line 870, in run
10
+ self._target(*self._args, **self._kwargs)
11
+ File "/home/dat/pino/lib/python3.8/site-packages/wandb/sdk/wandb_run.py", line 183, in check_network_status
12
+ status_response = self._interface.communicate_network_status()
13
+ File "/home/dat/pino/lib/python3.8/site-packages/wandb/sdk/interface/interface.py", line 755, in communicate_network_status
14
+ resp = self._communicate(req, timeout=timeout, local=True)
15
+ File "/home/dat/pino/lib/python3.8/site-packages/wandb/sdk/interface/interface.py", line 545, in _communicate
16
+ return self._communicate_async(rec, local=local).get(timeout=timeout)
17
+ File "/home/dat/pino/lib/python3.8/site-packages/wandb/sdk/interface/interface.py", line 550, in _communicate_async
18
+ raise Exception("The wandb backend process has shutdown")
19
+ Exception: The wandb backend process has shutdown
20
+ Training...: 0%| | 0/46383 [01:25<?, ?it/s]
21
+ Epoch ... (1/5): 0%| | 0/5 [02:13<?, ?it/s]
22
+ Traceback (most recent call last):
23
+ File "/home/dat/pino/lib/python3.8/site-packages/jax/_src/traceback_util.py", line 183, in reraise_with_filtered_traceback
24
+ return fun(*args, **kwargs)
25
+ File "/home/dat/pino/lib/python3.8/site-packages/jax/_src/api.py", line 1647, in f_pmapped
26
+ out = pxla.xla_pmap(
27
+ File "/home/dat/pino/lib/python3.8/site-packages/jax/core.py", line 1620, in bind
28
+ return call_bind(self, fun, *args, **params)
29
+ File "/home/dat/pino/lib/python3.8/site-packages/jax/core.py", line 1551, in call_bind
30
+ outs = primitive.process(top_trace, fun, tracers, params)
31
+ File "/home/dat/pino/lib/python3.8/site-packages/jax/core.py", line 1623, in process
32
+ return trace.process_map(self, fun, tracers, params)
33
+ File "/home/dat/pino/lib/python3.8/site-packages/jax/core.py", line 606, in process_call
34
+ return primitive.impl(f, *tracers, **params)
35
+ File "/home/dat/pino/lib/python3.8/site-packages/jax/interpreters/pxla.py", line 624, in xla_pmap_impl
36
+ compiled_fun, fingerprint = parallel_callable(fun, backend, axis_name, axis_size,
37
+ File "/home/dat/pino/lib/python3.8/site-packages/jax/linear_util.py", line 262, in memoized_fun
38
+ ans = call(fun, *args)
39
+ File "/home/dat/pino/lib/python3.8/site-packages/jax/interpreters/pxla.py", line 899, in parallel_callable
40
+ compiled = xla.backend_compile(backend, built, compile_options)
41
+ File "/home/dat/pino/lib/python3.8/site-packages/jax/interpreters/xla.py", line 360, in backend_compile
42
+ return backend.compile(built_c, compile_options=options)
43
+ RuntimeError: Resource exhausted: Ran out of memory in memory space hbm. Used 20.30G of 15.48G hbm. Exceeded hbm capacity by 4.82G.
44
+ Total hbm usage >= 20.82G:
45
+ reserved 530.00M
46
+ program 20.30G
47
+ arguments 0B
48
+ Output size 0B; shares 0B with arguments.
49
+ Program hbm requirement 20.30G:
50
+ global 660.0K
51
+ scoped 125.0K
52
+ HLO temp 20.30G (63.5% utilization: Unpadded (12.44G) Padded (19.60G), 3.5% fragmentation (717.54M))
53
+ Largest program allocations in hbm:
54
+ 1. Size: 1.54G
55
+ Operator: op_type="dot_general" op_name="pmap(train_step)/dot_general[ dimension_numbers=(((2,), (0,)), ((), ()))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/pino/lib/python3.8/site-packages/flax/linen/linear.py" source_line=175
56
+ Shape: bf16[4,4096,50358]{1,2,0:T(8,128)(2,1)}
57
+ Unpadded size: 1.54G
58
+ Extra memory due to padding: 64.0K (1.0x expansion)
59
+ XLA label: %fusion.1304.remat4 = bf16[4,4096,50358]{1,2,0:T(8,128)(2,1)} fusion(bf16[50358,768]{1,0:T(8,128)(2,1)} %copy.16213, f32[768]{0:T(1024)} %fusion.8859, f32[768]{0:T(1024)} %fusion.8860, f32[4,4096]{1,0:T(4,128)} %get-tuple-element.16597, f32[4,4096]{1,0:T(4...
60
+ Allocation type: HLO temp
61
+ ==========================
62
+ 2. Size: 360.00M
63
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
64
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
65
+ Unpadded size: 180.00M
66
+ Extra memory due to padding: 180.00M (2.0x expansion)
67
+ XLA label: %fusion.135 = bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)} fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.485, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.5710, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)} %get-tuple-element.16812, f32[4,12,60,64,192]{3,4,2,1,0...
68
+ Allocation type: HLO temp
69
+ ==========================
70
+ 3. Size: 360.00M
71
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
72
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
73
+ Unpadded size: 180.00M
74
+ Extra memory due to padding: 180.00M (2.0x expansion)
75
+ XLA label: %fusion.144.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.494, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.5719, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)} ...
76
+ Allocation type: HLO temp
77
+ ==========================
78
+ 4. Size: 360.00M
79
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
80
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
81
+ Unpadded size: 180.00M
82
+ Extra memory due to padding: 180.00M (2.0x expansion)
83
+ XLA label: %fusion.143.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.493, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.5718, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)} ...
84
+ Allocation type: HLO temp
85
+ ==========================
86
+ 5. Size: 360.00M
87
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
88
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
89
+ Unpadded size: 180.00M
90
+ Extra memory due to padding: 180.00M (2.0x expansion)
91
+ XLA label: %fusion.142.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.492, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.5717, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)} ...
92
+ Allocation type: HLO temp
93
+ ==========================
94
+ 6. Size: 360.00M
95
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
96
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
97
+ Unpadded size: 180.00M
98
+ Extra memory due to padding: 180.00M (2.0x expansion)
99
+ XLA label: %fusion.141.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.491, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.5716, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)} ...
100
+ Allocation type: HLO temp
101
+ ==========================
102
+ 7. Size: 360.00M
103
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
104
+ Unpadded size: 180.00M
105
+ Extra memory due to padding: 180.00M (2.0x expansion)
106
+ XLA label: %fusion.134.remat_uncompressed = bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)} copy(bf16[4,12,60,64,512]{4,3,2,1,0:T(8,128)(2,1)} %fusion.134.remat_compressed)
107
+ Allocation type: HLO temp
108
+ ==========================
109
+ 8. Size: 360.00M
110
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
111
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
112
+ Unpadded size: 180.00M
113
+ Extra memory due to padding: 180.00M (2.0x expansion)
114
+ XLA label: %fusion.140.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.490, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.5715, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)} ...
115
+ Allocation type: HLO temp
116
+ ==========================
117
+ 9. Size: 360.00M
118
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
119
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
120
+ Unpadded size: 180.00M
121
+ Extra memory due to padding: 180.00M (2.0x expansion)
122
+ XLA label: %fusion.139.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.489, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.5714, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)} ...
123
+ Allocation type: HLO temp
124
+ ==========================
125
+ 10. Size: 360.00M
126
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
127
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
128
+ Unpadded size: 180.00M
129
+ Extra memory due to padding: 180.00M (2.0x expansion)
130
+ XLA label: %fusion.138.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.488, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.5713, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)} ...
131
+ Allocation type: HLO temp
132
+ ==========================
133
+ 11. Size: 360.00M
134
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
135
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
136
+ Unpadded size: 180.00M
137
+ Extra memory due to padding: 180.00M (2.0x expansion)
138
+ XLA label: %fusion.137.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.487, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.5712, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)} ...
139
+ Allocation type: HLO temp
140
+ ==========================
141
+ 12. Size: 360.00M
142
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
143
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
144
+ Unpadded size: 180.00M
145
+ Extra memory due to padding: 180.00M (2.0x expansion)
146
+ XLA label: %fusion.136.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.486, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.5711, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)} ...
147
+ Allocation type: HLO temp
148
+ ==========================
149
+ 13. Size: 360.00M
150
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
151
+ Unpadded size: 180.00M
152
+ Extra memory due to padding: 180.00M (2.0x expansion)
153
+ XLA label: %fusion.133.remat_uncompressed = bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)} copy(bf16[4,12,60,64,512]{4,3,2,1,0:T(8,128)(2,1)} %fusion.133.remat_compressed)
154
+ Allocation type: HLO temp
155
+ ==========================
156
+ 14. Size: 270.00M
157
+ Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=584
158
+ Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
159
+ Unpadded size: 135.00M
160
+ Extra memory due to padding: 135.00M (2.0x expansion)
161
+ XLA label: %fusion.378.remat5 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} fusion(f32[4,60,64,192]{2,3,1,0:T(8,128)} %get-tuple-element.17038, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.14428, bf16[4,12,60,192,64]{3,2,4,1,0:T(8,128)(2,1)} %fusion.655), kind=kOut...
162
+ Allocation type: HLO temp
163
+ ==========================
164
+ 15. Size: 270.00M
165
+ Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
166
+ Unpadded size: 135.00M
167
+ Extra memory due to padding: 135.00M (2.0x expansion)
168
+ XLA label: %fusion.310.remat_uncompressed = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} copy(f32[4,12,60,64,192]{4,3,2,1,0:T(8,128)} %fusion.310.remat_compressed)
169
+ Allocation type: HLO temp
170
+ ==========================
171
+ 16. Size: 270.00M
172
+ Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=584
173
+ Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
174
+ Unpadded size: 135.00M
175
+ Extra memory due to padding: 135.00M (2.0x expansion)
176
+ XLA label: %fusion.386.remat6 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} fusion(f32[4,60,64,192]{2,3,1,0:T(8,128)} %get-tuple-element.17038, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.13900, bf16[4,12,60,192,64]{3,2,4,1,0:T(8,128)(2,1)} %fusion.639), kind=kOut...
177
+ Allocation type: HLO temp
178
+ ==========================
179
+ 17. Size: 270.00M
180
+ Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
181
+ Unpadded size: 135.00M
182
+ Extra memory due to padding: 135.00M (2.0x expansion)
183
+ XLA label: %fusion.326.remat_uncompressed.remat2 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} copy(f32[4,12,60,64,192]{4,3,2,1,0:T(8,128)} %fusion.326.remat_compressed)
184
+ Allocation type: HLO temp
185
+ ==========================
186
+ 18. Size: 270.00M
187
+ Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=591
188
+ Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
189
+ Unpadded size: 135.00M
190
+ Extra memory due to padding: 135.00M (2.0x expansion)
191
+ XLA label: %fusion.10361 = (f32[4,12,60,64]{3,2,1,0:T(8,128)}, f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}) fusion(s32[4,12,62,64,192]{3,4,2,1,0:T(8,128)} %get-tuple-element.18295, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.14494, bf16[4,12,60,192,64]{3,2,1,0,4:T...
192
+ Allocation type: HLO temp
193
+ ==========================
194
+ 19. Size: 270.00M
195
+ Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=584
196
+ Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
197
+ Unpadded size: 135.00M
198
+ Extra memory due to padding: 135.00M (2.0x expansion)
199
+ XLA label: %fusion.380.remat5 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} fusion(f32[4,60,64,192]{2,3,1,0:T(8,128)} %get-tuple-element.17038, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.14296, bf16[4,12,60,192,64]{3,2,4,1,0:T(8,128)(2,1)} %fusion.651), kind=kOut...
200
+ Allocation type: HLO temp
201
+ ==========================
202
+ 20. Size: 270.00M
203
+ Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=584
204
+ Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
205
+ Unpadded size: 135.00M
206
+ Extra memory due to padding: 135.00M (2.0x expansion)
207
+ XLA label: %fusion.379.remat3 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} fusion(f32[4,60,64,192]{2,3,1,0:T(8,128)} %get-tuple-element.17038, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.14362, bf16[4,12,60,192,64]{3,2,4,1,0:T(8,128)(2,1)} %fusion.653), kind=kOut...
208
+ Allocation type: HLO temp
209
+ ==========================
210
+ During handling of the above exception, another exception occurred:
211
+ Traceback (most recent call last):
212
+ File "./run_mlm_flax.py", line 709, in <module>
213
+ state, train_metric, dropout_rngs = p_train_step(state, model_inputs, dropout_rngs)
214
+ File "/home/dat/pino/lib/python3.8/site-packages/jax/_src/traceback_util.py", line 183, in reraise_with_filtered_traceback
215
+ return fun(*args, **kwargs)
216
+ KeyboardInterrupt
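
Note: the allocation dump above is XLA's report of the largest temporary buffers alive when pmap(train_step) ran out of TPU memory; the KeyboardInterrupt is simply the run being stopped afterwards. The reported sizes can be reproduced from the logged shapes and dtypes alone. The small check below does that; it is a sanity check, not code from run_mlm_flax.py, and the reading of the leading dimensions (4 = per-device batch, 12 = attention heads) is an assumption rather than something the log states.

# Reproduce the buffer sizes reported in the XLA OOM dump above
# from the logged shapes and dtypes alone.
import numpy as np

def mib(shape, bytes_per_elem):
    # Size of a dense array with this shape, in MiB.
    return np.prod(shape) * bytes_per_elem / 2**20

# bf16 attention buffers from modeling_flax_big_bird.py:619
print(mib((4, 12, 60, 64, 512), 2))      # 180.0 -> "Unpadded size: 180.00M"
print(2 * mib((4, 12, 60, 64, 512), 2))  # 360.0 -> "Size: 360.00M" after 2.0x padding
# f32 einsum outputs from modeling_flax_big_bird.py:584/591
print(mib((4, 12, 60, 64, 192), 4))      # 135.0 -> "Unpadded size: 135.00M"

Summed over the twenty buffers listed, these padded temporaries alone come to several GB, which is consistent with the step not fitting in per-core TPU memory at per-device batch size 4 and sequence length 4096.
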
wandb/run-20210713_005751-1wnn0lyf/files/requirements.txt ADDED
@@ -0,0 +1,92 @@
1
+ absl-py==0.13.0
2
+ aiohttp==3.7.4.post0
3
+ astunparse==1.6.3
4
+ async-timeout==3.0.1
5
+ attrs==21.2.0
6
+ cachetools==4.2.2
7
+ certifi==2021.5.30
8
+ chardet==4.0.0
9
+ chex==0.0.8
10
+ click==8.0.1
11
+ configparser==5.0.2
12
+ cycler==0.10.0
13
+ datasets==1.9.1.dev0
14
+ dill==0.3.4
15
+ dm-tree==0.1.6
16
+ docker-pycreds==0.4.0
17
+ filelock==3.0.12
18
+ flatbuffers==1.12
19
+ flax==0.3.4
20
+ fsspec==2021.6.1
21
+ gast==0.4.0
22
+ gitdb==4.0.7
23
+ gitpython==3.1.18
24
+ google-auth-oauthlib==0.4.4
25
+ google-auth==1.32.1
26
+ google-pasta==0.2.0
27
+ grpcio==1.34.1
28
+ h5py==3.1.0
29
+ huggingface-hub==0.0.12
30
+ idna==2.10
31
+ jax==0.2.16
32
+ jaxlib==0.1.68
33
+ joblib==1.0.1
34
+ keras-nightly==2.5.0.dev2021032900
35
+ keras-preprocessing==1.1.2
36
+ kiwisolver==1.3.1
37
+ libtpu-nightly==0.1.dev20210615
38
+ markdown==3.3.4
39
+ matplotlib==3.4.2
40
+ msgpack==1.0.2
41
+ multidict==5.1.0
42
+ multiprocess==0.70.12.2
43
+ numpy==1.19.5
44
+ oauthlib==3.1.1
45
+ opt-einsum==3.3.0
46
+ optax==0.0.9
47
+ packaging==21.0
48
+ pandas==1.3.0
49
+ pathtools==0.1.2
50
+ pillow==8.3.1
51
+ pip==20.0.2
52
+ pkg-resources==0.0.0
53
+ promise==2.3
54
+ protobuf==3.17.3
55
+ psutil==5.8.0
56
+ pyarrow==4.0.1
57
+ pyasn1-modules==0.2.8
58
+ pyasn1==0.4.8
59
+ pyparsing==2.4.7
60
+ python-dateutil==2.8.1
61
+ pytz==2021.1
62
+ pyyaml==5.4.1
63
+ regex==2021.7.6
64
+ requests-oauthlib==1.3.0
65
+ requests==2.25.1
66
+ rsa==4.7.2
67
+ sacremoses==0.0.45
68
+ scipy==1.7.0
69
+ sentry-sdk==1.3.0
70
+ setuptools==44.0.0
71
+ shortuuid==1.0.1
72
+ six==1.15.0
73
+ smmap==4.0.0
74
+ subprocess32==3.5.4
75
+ tensorboard-data-server==0.6.1
76
+ tensorboard-plugin-wit==1.8.0
77
+ tensorboard==2.5.0
78
+ tensorflow-estimator==2.5.0
79
+ tensorflow==2.5.0
80
+ termcolor==1.1.0
81
+ tokenizers==0.10.3
82
+ toolz==0.11.1
83
+ tqdm==4.61.2
84
+ transformers==4.9.0.dev0
85
+ typing-extensions==3.7.4.3
86
+ urllib3==1.26.6
87
+ wandb==0.10.33
88
+ werkzeug==2.0.1
89
+ wheel==0.36.2
90
+ wrapt==1.12.1
91
+ xxhash==2.0.2
92
+ yarl==1.6.3
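
For reference, a quick way to confirm that a local virtualenv matches the pins captured above is to compare against importlib.metadata. The snippet below is not part of this repository; it assumes Python 3.8+ and only spot-checks the packages most relevant to this run (versions taken verbatim from the file above).

# Spot-check installed versions against the pins in this requirements.txt.
from importlib.metadata import version

pins = {
    "jax": "0.2.16",
    "jaxlib": "0.1.68",
    "flax": "0.3.4",
    "optax": "0.0.9",
    "transformers": "4.9.0.dev0",
    "wandb": "0.10.33",
}
for pkg, pinned in pins.items():
    installed = version(pkg)
    status = "ok" if installed == pinned else f"MISMATCH (pinned {pinned})"
    print(f"{pkg}=={installed} {status}")
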
wandb/run-20210713_005751-1wnn0lyf/files/wandb-metadata.json ADDED
@@ -0,0 +1,44 @@
1
+ {
2
+ "os": "Linux-5.4.0-1043-gcp-x86_64-with-glibc2.29",
3
+ "python": "3.8.10",
4
+ "heartbeatAt": "2021-07-13T00:57:53.965536",
5
+ "startedAt": "2021-07-13T00:57:51.918634",
6
+ "docker": null,
7
+ "cpu_count": 96,
8
+ "cuda": null,
9
+ "args": [
10
+ "--push_to_hub",
11
+ "--output_dir=./",
12
+ "--model_type=big_bird",
13
+ "--config_name=./",
14
+ "--tokenizer_name=./",
15
+ "--max_seq_length=4096",
16
+ "--weight_decay=0.0095",
17
+ "--warmup_steps=5000",
18
+ "--overwrite_output_dir",
19
+ "--adam_beta1=0.9",
20
+ "--adam_beta2=0.98",
21
+ "--logging_steps=500",
22
+ "--eval_steps=92768",
23
+ "--num_train_epochs=5",
24
+ "--preprocessing_num_workers=64",
25
+ "--save_steps=20000",
26
+ "--learning_rate=5e-5",
27
+ "--per_device_train_batch_size=4",
28
+ "--per_device_eval_batch_size=4",
29
+ "--save_total_limit=5",
30
+ "--dtype=bfloat16"
31
+ ],
32
+ "state": "running",
33
+ "program": "./run_mlm_flax.py",
34
+ "codePath": "run_mlm_flax.py",
35
+ "git": {
36
+ "remote": "https://huggingface.co/flax-community/pino-roberta-base",
37
+ "commit": "4229c91b780cf07115cc6d04c16e393b0d2f508c"
38
+ },
39
+ "email": null,
40
+ "root": "/home/dat/pino-roberta-base",
41
+ "host": "t1v-n-f5c06ea1-w-0",
42
+ "username": "dat",
43
+ "executable": "/home/dat/pino/bin/python"
44
+ }
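
The "args" array above is the exact command line wandb recorded for this run. Below is a hedged sketch of how those flags map back onto HuggingFace TrainingArguments; it is not the code in run_mlm_flax.py, which parses the same argv together with its own ModelArguments/DataTrainingArguments dataclasses, so here the model- and data-specific flags are simply left in `remaining`.

# Sketch: parse the argv recorded in wandb-metadata.json back into
# TrainingArguments; flags such as --model_type stay in `remaining`.
import json
from transformers import HfArgumentParser, TrainingArguments

meta = json.load(open(
    "wandb/run-20210713_005751-1wnn0lyf/files/wandb-metadata.json"))
parser = HfArgumentParser(TrainingArguments)
training_args, remaining = parser.parse_args_into_dataclasses(
    args=meta["args"], return_remaining_strings=True)
print(training_args.per_device_train_batch_size)  # 4, as in the config_cb entries below
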
wandb/run-20210713_005751-1wnn0lyf/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
1
+ {}
wandb/run-20210713_005751-1wnn0lyf/logs/debug-internal.log ADDED
@@ -0,0 +1,61 @@
1
+ 2021-07-13 00:57:52,645 INFO MainThread:329334 [internal.py:wandb_internal():88] W&B internal server running at pid: 329334, started at: 2021-07-13 00:57:52.644860
2
+ 2021-07-13 00:57:52,647 DEBUG HandlerThread:329334 [handler.py:handle_request():124] handle_request: check_version
3
+ 2021-07-13 00:57:52,647 INFO WriterThread:329334 [datastore.py:open_for_write():80] open: /home/dat/pino-roberta-base/wandb/run-20210713_005751-1wnn0lyf/run-1wnn0lyf.wandb
4
+ 2021-07-13 00:57:52,648 DEBUG SenderThread:329334 [sender.py:send():179] send: header
5
+ 2021-07-13 00:57:52,648 DEBUG SenderThread:329334 [sender.py:send_request():193] send_request: check_version
6
+ 2021-07-13 00:57:52,687 DEBUG SenderThread:329334 [sender.py:send():179] send: run
7
+ 2021-07-13 00:57:52,862 INFO SenderThread:329334 [dir_watcher.py:__init__():168] watching files in: /home/dat/pino-roberta-base/wandb/run-20210713_005751-1wnn0lyf/files
8
+ 2021-07-13 00:57:52,862 INFO SenderThread:329334 [sender.py:_start_run_threads():716] run started: 1wnn0lyf with start time 1626137872
9
+ 2021-07-13 00:57:52,862 DEBUG SenderThread:329334 [sender.py:send():179] send: summary
10
+ 2021-07-13 00:57:52,862 DEBUG HandlerThread:329334 [handler.py:handle_request():124] handle_request: run_start
11
+ 2021-07-13 00:57:52,863 INFO SenderThread:329334 [sender.py:_save_file():841] saving file wandb-summary.json with policy end
12
+ 2021-07-13 00:57:53,865 INFO Thread-8 :329334 [dir_watcher.py:_on_file_created():216] file/dir created: /home/dat/pino-roberta-base/wandb/run-20210713_005751-1wnn0lyf/files/wandb-summary.json
13
+ 2021-07-13 00:57:53,965 DEBUG HandlerThread:329334 [meta.py:__init__():39] meta init
14
+ 2021-07-13 00:57:53,965 DEBUG HandlerThread:329334 [meta.py:__init__():53] meta init done
15
+ 2021-07-13 00:57:53,965 DEBUG HandlerThread:329334 [meta.py:probe():210] probe
16
+ 2021-07-13 00:57:53,966 DEBUG HandlerThread:329334 [meta.py:_setup_git():200] setup git
17
+ 2021-07-13 00:57:53,996 DEBUG HandlerThread:329334 [meta.py:_setup_git():207] setup git done
18
+ 2021-07-13 00:57:53,996 DEBUG HandlerThread:329334 [meta.py:_save_pip():57] save pip
19
+ 2021-07-13 00:57:53,996 DEBUG HandlerThread:329334 [meta.py:_save_pip():71] save pip done
20
+ 2021-07-13 00:57:53,996 DEBUG HandlerThread:329334 [meta.py:probe():252] probe done
21
+ 2021-07-13 00:57:53,999 DEBUG SenderThread:329334 [sender.py:send():179] send: files
22
+ 2021-07-13 00:57:53,999 INFO SenderThread:329334 [sender.py:_save_file():841] saving file wandb-metadata.json with policy now
23
+ 2021-07-13 00:57:54,007 DEBUG HandlerThread:329334 [handler.py:handle_request():124] handle_request: stop_status
24
+ 2021-07-13 00:57:54,007 DEBUG SenderThread:329334 [sender.py:send_request():193] send_request: stop_status
25
+ 2021-07-13 00:57:54,134 DEBUG SenderThread:329334 [sender.py:send():179] send: config
26
+ 2021-07-13 00:57:54,135 DEBUG SenderThread:329334 [sender.py:send():179] send: config
27
+ 2021-07-13 00:57:54,135 DEBUG SenderThread:329334 [sender.py:send():179] send: config
28
+ 2021-07-13 00:57:54,460 INFO Thread-11 :329334 [upload_job.py:push():137] Uploaded file /tmp/tmpbiuftyldwandb/b3fet9y4-wandb-metadata.json
29
+ 2021-07-13 00:57:54,864 INFO Thread-8 :329334 [dir_watcher.py:_on_file_created():216] file/dir created: /home/dat/pino-roberta-base/wandb/run-20210713_005751-1wnn0lyf/files/output.log
30
+ 2021-07-13 00:57:54,864 INFO Thread-8 :329334 [dir_watcher.py:_on_file_created():216] file/dir created: /home/dat/pino-roberta-base/wandb/run-20210713_005751-1wnn0lyf/files/wandb-metadata.json
31
+ 2021-07-13 00:57:54,864 INFO Thread-8 :329334 [dir_watcher.py:_on_file_created():216] file/dir created: /home/dat/pino-roberta-base/wandb/run-20210713_005751-1wnn0lyf/files/requirements.txt
32
+ 2021-07-13 00:58:09,136 DEBUG HandlerThread:329334 [handler.py:handle_request():124] handle_request: stop_status
33
+ 2021-07-13 00:58:09,136 DEBUG SenderThread:329334 [sender.py:send_request():193] send_request: stop_status
34
+ 2021-07-13 00:58:10,870 INFO Thread-8 :329334 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_005751-1wnn0lyf/files/output.log
35
+ 2021-07-13 00:58:22,050 DEBUG SenderThread:329334 [sender.py:send():179] send: stats
36
+ 2021-07-13 00:58:23,875 INFO Thread-8 :329334 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_005751-1wnn0lyf/files/config.yaml
37
+ 2021-07-13 00:58:24,269 DEBUG HandlerThread:329334 [handler.py:handle_request():124] handle_request: stop_status
38
+ 2021-07-13 00:58:24,269 DEBUG SenderThread:329334 [sender.py:send_request():193] send_request: stop_status
39
+ 2021-07-13 00:58:39,402 DEBUG HandlerThread:329334 [handler.py:handle_request():124] handle_request: stop_status
40
+ 2021-07-13 00:58:39,403 DEBUG SenderThread:329334 [sender.py:send_request():193] send_request: stop_status
41
+ 2021-07-13 00:58:52,130 DEBUG SenderThread:329334 [sender.py:send():179] send: stats
42
+ 2021-07-13 00:58:54,537 DEBUG HandlerThread:329334 [handler.py:handle_request():124] handle_request: stop_status
43
+ 2021-07-13 00:58:54,537 DEBUG SenderThread:329334 [sender.py:send_request():193] send_request: stop_status
44
+ 2021-07-13 00:59:00,888 INFO Thread-8 :329334 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_005751-1wnn0lyf/files/output.log
45
+ 2021-07-13 00:59:09,682 DEBUG HandlerThread:329334 [handler.py:handle_request():124] handle_request: stop_status
46
+ 2021-07-13 00:59:09,683 DEBUG SenderThread:329334 [sender.py:send_request():193] send_request: stop_status
47
+ 2021-07-13 00:59:22,209 DEBUG SenderThread:329334 [sender.py:send():179] send: stats
48
+ 2021-07-13 00:59:24,837 DEBUG HandlerThread:329334 [handler.py:handle_request():124] handle_request: stop_status
49
+ 2021-07-13 00:59:24,837 DEBUG SenderThread:329334 [sender.py:send_request():193] send_request: stop_status
50
+ 2021-07-13 00:59:39,971 DEBUG HandlerThread:329334 [handler.py:handle_request():124] handle_request: stop_status
51
+ 2021-07-13 00:59:39,971 DEBUG SenderThread:329334 [sender.py:send_request():193] send_request: stop_status
52
+ 2021-07-13 00:59:52,289 DEBUG SenderThread:329334 [sender.py:send():179] send: stats
53
+ 2021-07-13 00:59:55,105 DEBUG HandlerThread:329334 [handler.py:handle_request():124] handle_request: stop_status
54
+ 2021-07-13 00:59:55,105 DEBUG SenderThread:329334 [sender.py:send_request():193] send_request: stop_status
55
+ 2021-07-13 01:00:10,158 WARNING MainThread:329334 [internal.py:wandb_internal():147] Internal process interrupt: 1
56
+ 2021-07-13 01:00:10,246 DEBUG HandlerThread:329334 [handler.py:handle_request():124] handle_request: stop_status
57
+ 2021-07-13 01:00:10,246 DEBUG SenderThread:329334 [sender.py:send_request():193] send_request: stop_status
58
+ 2021-07-13 01:00:11,893 WARNING MainThread:329334 [internal.py:wandb_internal():147] Internal process interrupt: 2
59
+ 2021-07-13 01:00:11,893 ERROR MainThread:329334 [internal.py:wandb_internal():150] Internal process interrupted.
60
+ 2021-07-13 01:00:12,253 INFO HandlerThread:329334 [handler.py:finish():638] shutting down handler
61
+ 2021-07-13 01:00:12,281 INFO MainThread:329334 [internal.py:handle_exit():78] Internal process exited
wandb/run-20210713_005751-1wnn0lyf/logs/debug.log ADDED
@@ -0,0 +1,28 @@
1
+ 2021-07-13 00:57:51,920 INFO MainThread:327810 [wandb_setup.py:_flush():69] setting env: {}
2
+ 2021-07-13 00:57:51,920 INFO MainThread:327810 [wandb_setup.py:_flush():69] setting login settings: {}
3
+ 2021-07-13 00:57:51,920 INFO MainThread:327810 [wandb_init.py:_log_setup():337] Logging user logs to /home/dat/pino-roberta-base/wandb/run-20210713_005751-1wnn0lyf/logs/debug.log
4
+ 2021-07-13 00:57:51,920 INFO MainThread:327810 [wandb_init.py:_log_setup():338] Logging internal logs to /home/dat/pino-roberta-base/wandb/run-20210713_005751-1wnn0lyf/logs/debug-internal.log
5
+ 2021-07-13 00:57:51,920 INFO MainThread:327810 [wandb_init.py:init():370] calling init triggers
6
+ 2021-07-13 00:57:51,920 INFO MainThread:327810 [wandb_init.py:init():375] wandb.init called with sweep_config: {}
7
+ config: {}
8
+ 2021-07-13 00:57:51,920 INFO MainThread:327810 [wandb_init.py:init():419] starting backend
9
+ 2021-07-13 00:57:51,920 INFO MainThread:327810 [backend.py:_multiprocessing_setup():70] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
10
+ 2021-07-13 00:57:51,997 INFO MainThread:327810 [backend.py:ensure_launched():135] starting backend process...
11
+ 2021-07-13 00:57:52,047 INFO MainThread:327810 [backend.py:ensure_launched():139] started backend process with pid: 329334
12
+ 2021-07-13 00:57:52,050 INFO MainThread:327810 [wandb_init.py:init():424] backend started and connected
13
+ 2021-07-13 00:57:52,053 INFO MainThread:327810 [wandb_init.py:init():472] updated telemetry
14
+ 2021-07-13 00:57:52,054 INFO MainThread:327810 [wandb_init.py:init():491] communicating current version
15
+ 2021-07-13 00:57:52,686 INFO MainThread:327810 [wandb_init.py:init():496] got version response
16
+ 2021-07-13 00:57:52,686 INFO MainThread:327810 [wandb_init.py:init():504] communicating run to backend with 30 second timeout
17
+ 2021-07-13 00:57:52,861 INFO MainThread:327810 [wandb_init.py:init():529] starting run threads in backend
18
+ 2021-07-13 00:57:54,003 INFO MainThread:327810 [wandb_run.py:_console_start():1623] atexit reg
19
+ 2021-07-13 00:57:54,004 INFO MainThread:327810 [wandb_run.py:_redirect():1497] redirect: SettingsConsole.REDIRECT
20
+ 2021-07-13 00:57:54,004 INFO MainThread:327810 [wandb_run.py:_redirect():1502] Redirecting console.
21
+ 2021-07-13 00:57:54,006 INFO MainThread:327810 [wandb_run.py:_redirect():1558] Redirects installed.
22
+ 2021-07-13 00:57:54,006 INFO MainThread:327810 [wandb_init.py:init():554] run started, returning control to user process
23
+ 2021-07-13 00:57:54,012 INFO MainThread:327810 [wandb_run.py:_config_callback():872] config_cb None None {'output_dir': './', 'overwrite_output_dir': True, 'do_train': False, 'do_eval': False, 'do_predict': False, 'evaluation_strategy': 'IntervalStrategy.NO', 'prediction_loss_only': False, 'per_device_train_batch_size': 4, 'per_device_eval_batch_size': 4, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'learning_rate': 5e-05, 'weight_decay': 0.0095, 'adam_beta1': 0.9, 'adam_beta2': 0.98, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 5.0, 'max_steps': -1, 'lr_scheduler_type': 'SchedulerType.LINEAR', 'warmup_ratio': 0.0, 'warmup_steps': 5000, 'log_level': -1, 'log_level_replica': -1, 'log_on_each_node': True, 'logging_dir': './runs/Jul13_00-57-01_t1v-n-f5c06ea1-w-0', 'logging_strategy': 'IntervalStrategy.STEPS', 'logging_first_step': False, 'logging_steps': 500, 'save_strategy': 'IntervalStrategy.STEPS', 'save_steps': 20000, 'save_total_limit': 5, 'save_on_each_node': False, 'no_cuda': False, 'seed': 42, 'fp16': False, 'fp16_opt_level': 'O1', 'fp16_backend': 'auto', 'fp16_full_eval': False, 'local_rank': -1, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 92768, 'dataloader_num_workers': 0, 'past_index': -1, 'run_name': './', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'sharded_ddp': [], 'deepspeed': None, 'label_smoothing_factor': 0.0, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['tensorboard', 'wandb'], 'ddp_find_unused_parameters': None, 'dataloader_pin_memory': True, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': True, 'resume_from_checkpoint': None, 'push_to_hub_model_id': '', 'push_to_hub_organization': None, 'push_to_hub_token': None, 'mp_parameters': ''}
24
+ 2021-07-13 00:57:54,014 INFO MainThread:327810 [wandb_run.py:_config_callback():872] config_cb None None {'model_name_or_path': None, 'model_type': 'big_bird', 'config_name': './', 'tokenizer_name': './', 'cache_dir': None, 'use_fast_tokenizer': True, 'dtype': 'bfloat16'}
25
+ 2021-07-13 00:57:54,016 INFO MainThread:327810 [wandb_run.py:_config_callback():872] config_cb None None {'dataset_name': None, 'dataset_config_name': None, 'train_file': None, 'validation_file': None, 'train_ref_file': None, 'validation_ref_file': None, 'overwrite_cache': False, 'validation_split_percentage': 5, 'max_seq_length': 4096, 'preprocessing_num_workers': 64, 'mlm_probability': 0.15, 'pad_to_max_length': False, 'line_by_line': False}
26
+ 2021-07-13 01:00:22,944 INFO MainThread:327810 [wandb_run.py:_atexit_cleanup():1593] got exitcode: 255
27
+ 2021-07-13 01:00:22,945 INFO MainThread:327810 [wandb_run.py:_restore():1565] restore
28
+ 2021-07-13 01:00:25,397 INFO MainThread:327810 [wandb_run.py:_restore():1565] restore
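
The config_cb entries above record the effective optimizer hyperparameters for this run (learning_rate 5e-5, warmup_steps 5000, weight_decay 0.0095, adam_beta1 0.9, adam_beta2 0.98, adafactor disabled). A minimal optax sketch of how such a linear warmup/decay AdamW setup is usually assembled is shown below; the exact wiring inside run_mlm_flax.py may differ, and num_train_steps here is only illustrative.

# Minimal optax sketch matching the hyperparameters logged by config_cb above.
import optax

num_train_steps = 5 * 92_768   # illustrative; the real value depends on dataset size
warmup_steps = 5_000

warmup = optax.linear_schedule(init_value=0.0, end_value=5e-5,
                               transition_steps=warmup_steps)
decay = optax.linear_schedule(init_value=5e-5, end_value=0.0,
                              transition_steps=num_train_steps - warmup_steps)
schedule = optax.join_schedules([warmup, decay], boundaries=[warmup_steps])

optimizer = optax.adamw(learning_rate=schedule,
                        b1=0.9, b2=0.98, eps=1e-8,
                        weight_decay=0.0095)
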
wandb/run-20210713_005751-1wnn0lyf/run-1wnn0lyf.wandb ADDED
Binary file (3.98 kB)