dat committed on
Commit
9bd1fec
1 Parent(s): 4229c91
Files changed (33)
  1. config.json +1 -1
  2. events.out.tfevents.1626137349.t1v-n-f5c06ea1-w-0.323744.3.v2 +3 -0
  3. events.out.tfevents.1626137580.t1v-n-f5c06ea1-w-0.325900.3.v2 +3 -0
  4. events.out.tfevents.1626137871.t1v-n-f5c06ea1-w-0.327810.3.v2 +3 -0
  5. run.sh +3 -2
  6. run_mlm_flax.py +10 -10
  7. wandb/debug-internal.log +1 -1
  8. wandb/debug.log +1 -1
  9. wandb/latest-run +1 -1
  10. wandb/run-20210713_004910-3mu9pog5/files/config.yaml +307 -0
  11. wandb/run-20210713_004910-3mu9pog5/files/output.log +376 -0
  12. wandb/run-20210713_004910-3mu9pog5/files/requirements.txt +92 -0
  13. wandb/run-20210713_004910-3mu9pog5/files/wandb-metadata.json +46 -0
  14. wandb/run-20210713_004910-3mu9pog5/files/wandb-summary.json +1 -0
  15. wandb/run-20210713_004910-3mu9pog5/logs/debug-internal.log +166 -0
  16. wandb/run-20210713_004910-3mu9pog5/logs/debug.log +119 -0
  17. wandb/run-20210713_004910-3mu9pog5/run-3mu9pog5.wandb +0 -0
  18. wandb/run-20210713_005301-2ilkub1o/files/config.yaml +307 -0
  19. wandb/run-20210713_005301-2ilkub1o/files/output.log +376 -0
  20. wandb/run-20210713_005301-2ilkub1o/files/requirements.txt +92 -0
  21. wandb/run-20210713_005301-2ilkub1o/files/wandb-metadata.json +46 -0
  22. wandb/run-20210713_005301-2ilkub1o/files/wandb-summary.json +1 -0
  23. wandb/run-20210713_005301-2ilkub1o/logs/debug-internal.log +168 -0
  24. wandb/run-20210713_005301-2ilkub1o/logs/debug.log +127 -0
  25. wandb/run-20210713_005301-2ilkub1o/run-2ilkub1o.wandb +0 -0
  26. wandb/run-20210713_005751-1wnn0lyf/files/config.yaml +304 -0
  27. wandb/run-20210713_005751-1wnn0lyf/files/output.log +216 -0
  28. wandb/run-20210713_005751-1wnn0lyf/files/requirements.txt +92 -0
  29. wandb/run-20210713_005751-1wnn0lyf/files/wandb-metadata.json +44 -0
  30. wandb/run-20210713_005751-1wnn0lyf/files/wandb-summary.json +1 -0
  31. wandb/run-20210713_005751-1wnn0lyf/logs/debug-internal.log +61 -0
  32. wandb/run-20210713_005751-1wnn0lyf/logs/debug.log +28 -0
  33. wandb/run-20210713_005751-1wnn0lyf/run-1wnn0lyf.wandb +0 -0
config.json CHANGED
@@ -4,7 +4,7 @@
  ],
  "attention_probs_dropout_prob": 0.1,
  "attention_type": "block_sparse",
- "block_size": 64,
+ "block_size": 128,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
events.out.tfevents.1626137349.t1v-n-f5c06ea1-w-0.323744.3.v2 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f159e4108512bc68b8363ca06b6026ff0844d045b08ba76516f2764b90277292
+ size 40
events.out.tfevents.1626137580.t1v-n-f5c06ea1-w-0.325900.3.v2 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:72867ca0c2d013977242562e1efa683ba957c1b4c3352c0547c72dcd0e611de8
+ size 40
events.out.tfevents.1626137871.t1v-n-f5c06ea1-w-0.327810.3.v2 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f4e6248f4d9c467b9b97ff9829c4847d2f568eaf3b4c6b79865519f1e98780a9
+ size 40
run.sh CHANGED
@@ -19,12 +19,13 @@ python ./run_mlm_flax.py \
  --num_train_epochs="5" \
  --preprocessing_num_workers="64" \
  --save_steps="20000" \
- --adafactor \
  --learning_rate="5e-5" \
  --per_device_train_batch_size="2" \
  --per_device_eval_batch_size="2" \
  --save_total_limit="5"\
  --dtype="bfloat16" \
+ #--adafactor \
+ #--gradient_accumulation_steps="8" \
  #--resume_from_checkpoint="./"\
- #--gradient_accumulation_steps="4" \
+
 
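
run.sh now drops --adafactor (presumably falling back to the script's AdamW optimizer) and keeps gradient accumulation commented out while running with a per-device batch size of 2. A rough, hedged check of the resulting effective batch size; the 8-device count (a single TPU v3-8 host) is an assumption, not something stated in the script:

# Back-of-the-envelope effective batch size under the new flags (assumed 8 devices).
per_device_train_batch_size = 2
device_count = 8                   # assumed value of jax.device_count() on a v3-8
gradient_accumulation_steps = 1    # the flag is commented out, so no accumulation
print(per_device_train_batch_size * device_count * gradient_accumulation_steps)  # 16 sequences per optimizer step
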
run_mlm_flax.py CHANGED
@@ -563,7 +563,7 @@ if __name__ == "__main__":
 
  # Store some constant
  num_epochs = int(training_args.num_train_epochs)
- train_batch_size = int(training_args.per_device_train_batch_size) * jax.device_count() * training_args.gradient_accumulation_steps
+ train_batch_size = int(training_args.per_device_train_batch_size) * jax.device_count() #* training_args.gradient_accumulation_steps
  eval_batch_size = int(training_args.per_device_eval_batch_size) * jax.device_count()
 
  num_train_steps = len(train_dataset) // train_batch_size * num_epochs
@@ -610,9 +610,9 @@ if __name__ == "__main__":
  mask=decay_mask_fn,
  )
 
- if training_args.gradient_accumulation_steps > 1:
- optimizer = optax.MultiSteps(optimizer, training_args.gradient_accumulation_steps)
- grad_accum_steps = training_args.gradient_accumulation_steps
+ #if training_args.gradient_accumulation_steps > 1:
+ # optimizer = optax.MultiSteps(optimizer, training_args.gradient_accumulation_steps)
+ #grad_accum_steps = training_args.gradient_accumulation_steps
 
  # Setup train state
 
@@ -650,7 +650,7 @@ if __name__ == "__main__":
  new_state = state.apply_gradients(grads=grad)
 
  metrics = jax.lax.pmean(
- {"loss": loss, "learning_rate": linear_decay_lr_schedule_fn(state.step // grad_accum_steps)}, axis_name="batch"
+ {"loss": loss, "learning_rate": linear_decay_lr_schedule_fn(state.step )}, axis_name="batch" #// grad_accum_steps
  )
 
  return new_state, metrics, new_dropout_rng
@@ -696,10 +696,10 @@ if __name__ == "__main__":
  # Generate an epoch by shuffling sampling indices from the train dataset
  num_train_samples = len(train_dataset)
  train_samples_idx = jax.random.permutation(input_rng, jnp.arange(num_train_samples))
- train_batch_idx = generate_batch_splits(train_samples_idx, train_batch_size // grad_accum_steps)
+ train_batch_idx = generate_batch_splits(train_samples_idx, train_batch_size) #// grad_accum_steps
 
  # Gather the indexes for creating the batch and do a training step
- for step, batch_idx in enumerate(tqdm(train_batch_idx, desc="Training...", position=1,initial=resume_step // grad_accum_steps )):
+ for step, batch_idx in enumerate(tqdm(train_batch_idx, desc="Training...", position=1,initial=resume_step)): #// grad_accum_steps
  samples = [train_dataset[int(idx)] for idx in batch_idx]
  model_inputs = data_collator(samples, pad_to_multiple_of=16)
 
@@ -713,7 +713,7 @@ if __name__ == "__main__":
  if cur_step < resume_step:
  continue
 
- if (cur_step % training_args.logging_steps * grad_accum_steps) == 0 and cur_step > 0:
+ if (cur_step % training_args.logging_steps) == 0 and cur_step > 0: #* grad_accum_steps
  # Save metrics
  train_metric = jax_utils.unreplicate(train_metric)
  train_time += time.time() - train_start
@@ -730,7 +730,7 @@ if __name__ == "__main__":
 
  train_metrics = []
 
- if cur_step % (training_args.eval_steps * grad_accum_steps) == 0 and cur_step > 0:
+ if cur_step % (training_args.eval_steps) == 0 and cur_step > 0: #* grad_accum_steps
  # ======================== Evaluating ==============================
  num_eval_samples = len(eval_dataset)
  eval_samples_idx = jnp.arange(num_eval_samples)
@@ -763,7 +763,7 @@ if __name__ == "__main__":
  _metrics = {f"eval_{k}":mb_item(v) for k, v in eval_metrics.items()}
  wandb.log({"eval_step":cur_step, **_metrics})
 
- if (cur_step % training_args.save_steps == 0 * grad_accum_steps) and cur_step > 0:
+ if (cur_step % training_args.save_steps == 0) and cur_step > 0: #* grad_accum_steps
  # save checkpoint after each epoch and push checkpoint to the hub
  if jax.process_index() == 0:
  params = jax.device_get(jax.tree_map(lambda x: x[0], state.params))
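
This is the script-side counterpart of the run.sh change: the optax.MultiSteps wrapper and every "// grad_accum_steps" / "* grad_accum_steps" correction (learning-rate schedule, logging, eval and save intervals) are commented out, so each micro-batch now becomes a full optimizer step. A hedged sketch of the mechanism being disabled (toy shapes and loss, not this repo's code):

# optax.MultiSteps accumulates gradients over k calls and only applies a real
# update on every k-th call; in between, the returned updates are zeros.
import jax
import jax.numpy as jnp
import optax

k = 8                                               # gradient_accumulation_steps
base = optax.adamw(learning_rate=5e-5, weight_decay=0.0095)
opt = optax.MultiSteps(base, every_k_schedule=k)

params = {"w": jnp.zeros((3,))}
opt_state = opt.init(params)

def loss_fn(p, x):
    return jnp.sum((p["w"] - x) ** 2)

for step in range(k):
    grads = jax.grad(loss_fn)(params, jnp.ones((3,)))
    updates, opt_state = opt.update(grads, opt_state, params)
    params = optax.apply_updates(params, updates)   # effectively a no-op until the k-th micro-step
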
wandb/debug-internal.log CHANGED
@@ -1 +1 @@
- run-20210713_002031-11bfig2u/logs/debug-internal.log
+ run-20210713_005751-1wnn0lyf/logs/debug-internal.log
wandb/debug.log CHANGED
@@ -1 +1 @@
- run-20210713_002031-11bfig2u/logs/debug.log
+ run-20210713_005751-1wnn0lyf/logs/debug.log
wandb/latest-run CHANGED
@@ -1 +1 @@
- run-20210713_002031-11bfig2u
+ run-20210713_005751-1wnn0lyf
wandb/run-20210713_004910-3mu9pog5/files/config.yaml ADDED
@@ -0,0 +1,307 @@
1
+ wandb_version: 1
2
+
3
+ _wandb:
4
+ desc: null
5
+ value:
6
+ cli_version: 0.10.33
7
+ framework: huggingface
8
+ huggingface_version: 4.9.0.dev0
9
+ is_jupyter_run: false
10
+ is_kaggle_kernel: false
11
+ python_version: 3.8.10
12
+ t:
13
+ 1:
14
+ - 3
15
+ - 11
16
+ 2:
17
+ - 3
18
+ - 11
19
+ 4: 3.8.10
20
+ 5: 0.10.33
21
+ 6: 4.9.0.dev0
22
+ 8:
23
+ - 5
24
+ adafactor:
25
+ desc: null
26
+ value: true
27
+ adam_beta1:
28
+ desc: null
29
+ value: 0.9
30
+ adam_beta2:
31
+ desc: null
32
+ value: 0.98
33
+ adam_epsilon:
34
+ desc: null
35
+ value: 1.0e-08
36
+ cache_dir:
37
+ desc: null
38
+ value: null
39
+ config_name:
40
+ desc: null
41
+ value: ./
42
+ dataloader_drop_last:
43
+ desc: null
44
+ value: false
45
+ dataloader_num_workers:
46
+ desc: null
47
+ value: 0
48
+ dataloader_pin_memory:
49
+ desc: null
50
+ value: true
51
+ dataset_config_name:
52
+ desc: null
53
+ value: null
54
+ dataset_name:
55
+ desc: null
56
+ value: null
57
+ ddp_find_unused_parameters:
58
+ desc: null
59
+ value: null
60
+ debug:
61
+ desc: null
62
+ value: []
63
+ deepspeed:
64
+ desc: null
65
+ value: null
66
+ disable_tqdm:
67
+ desc: null
68
+ value: false
69
+ do_eval:
70
+ desc: null
71
+ value: false
72
+ do_predict:
73
+ desc: null
74
+ value: false
75
+ do_train:
76
+ desc: null
77
+ value: false
78
+ dtype:
79
+ desc: null
80
+ value: bfloat16
81
+ eval_accumulation_steps:
82
+ desc: null
83
+ value: null
84
+ eval_steps:
85
+ desc: null
86
+ value: 92768
87
+ evaluation_strategy:
88
+ desc: null
89
+ value: IntervalStrategy.NO
90
+ fp16:
91
+ desc: null
92
+ value: false
93
+ fp16_backend:
94
+ desc: null
95
+ value: auto
96
+ fp16_full_eval:
97
+ desc: null
98
+ value: false
99
+ fp16_opt_level:
100
+ desc: null
101
+ value: O1
102
+ gradient_accumulation_steps:
103
+ desc: null
104
+ value: 8
105
+ greater_is_better:
106
+ desc: null
107
+ value: null
108
+ group_by_length:
109
+ desc: null
110
+ value: false
111
+ ignore_data_skip:
112
+ desc: null
113
+ value: false
114
+ label_names:
115
+ desc: null
116
+ value: null
117
+ label_smoothing_factor:
118
+ desc: null
119
+ value: 0.0
120
+ learning_rate:
121
+ desc: null
122
+ value: 5.0e-05
123
+ length_column_name:
124
+ desc: null
125
+ value: length
126
+ line_by_line:
127
+ desc: null
128
+ value: false
129
+ load_best_model_at_end:
130
+ desc: null
131
+ value: false
132
+ local_rank:
133
+ desc: null
134
+ value: -1
135
+ log_level:
136
+ desc: null
137
+ value: -1
138
+ log_level_replica:
139
+ desc: null
140
+ value: -1
141
+ log_on_each_node:
142
+ desc: null
143
+ value: true
144
+ logging_dir:
145
+ desc: null
146
+ value: ./runs/Jul13_00-48-19_t1v-n-f5c06ea1-w-0
147
+ logging_first_step:
148
+ desc: null
149
+ value: false
150
+ logging_steps:
151
+ desc: null
152
+ value: 500
153
+ logging_strategy:
154
+ desc: null
155
+ value: IntervalStrategy.STEPS
156
+ lr_scheduler_type:
157
+ desc: null
158
+ value: SchedulerType.LINEAR
159
+ max_grad_norm:
160
+ desc: null
161
+ value: 1.0
162
+ max_seq_length:
163
+ desc: null
164
+ value: 4096
165
+ max_steps:
166
+ desc: null
167
+ value: -1
168
+ metric_for_best_model:
169
+ desc: null
170
+ value: null
171
+ mlm_probability:
172
+ desc: null
173
+ value: 0.15
174
+ model_name_or_path:
175
+ desc: null
176
+ value: null
177
+ model_type:
178
+ desc: null
179
+ value: big_bird
180
+ mp_parameters:
181
+ desc: null
182
+ value: ''
183
+ no_cuda:
184
+ desc: null
185
+ value: false
186
+ num_train_epochs:
187
+ desc: null
188
+ value: 5.0
189
+ output_dir:
190
+ desc: null
191
+ value: ./
192
+ overwrite_cache:
193
+ desc: null
194
+ value: false
195
+ overwrite_output_dir:
196
+ desc: null
197
+ value: true
198
+ pad_to_max_length:
199
+ desc: null
200
+ value: false
201
+ past_index:
202
+ desc: null
203
+ value: -1
204
+ per_device_eval_batch_size:
205
+ desc: null
206
+ value: 4
207
+ per_device_train_batch_size:
208
+ desc: null
209
+ value: 4
210
+ per_gpu_eval_batch_size:
211
+ desc: null
212
+ value: null
213
+ per_gpu_train_batch_size:
214
+ desc: null
215
+ value: null
216
+ prediction_loss_only:
217
+ desc: null
218
+ value: false
219
+ preprocessing_num_workers:
220
+ desc: null
221
+ value: 64
222
+ push_to_hub:
223
+ desc: null
224
+ value: true
225
+ push_to_hub_model_id:
226
+ desc: null
227
+ value: ''
228
+ push_to_hub_organization:
229
+ desc: null
230
+ value: null
231
+ push_to_hub_token:
232
+ desc: null
233
+ value: null
234
+ remove_unused_columns:
235
+ desc: null
236
+ value: true
237
+ report_to:
238
+ desc: null
239
+ value:
240
+ - tensorboard
241
+ - wandb
242
+ resume_from_checkpoint:
243
+ desc: null
244
+ value: null
245
+ run_name:
246
+ desc: null
247
+ value: ./
248
+ save_on_each_node:
249
+ desc: null
250
+ value: false
251
+ save_steps:
252
+ desc: null
253
+ value: 20000
254
+ save_strategy:
255
+ desc: null
256
+ value: IntervalStrategy.STEPS
257
+ save_total_limit:
258
+ desc: null
259
+ value: 5
260
+ seed:
261
+ desc: null
262
+ value: 42
263
+ sharded_ddp:
264
+ desc: null
265
+ value: []
266
+ skip_memory_metrics:
267
+ desc: null
268
+ value: true
269
+ tokenizer_name:
270
+ desc: null
271
+ value: ./
272
+ tpu_metrics_debug:
273
+ desc: null
274
+ value: false
275
+ tpu_num_cores:
276
+ desc: null
277
+ value: null
278
+ train_file:
279
+ desc: null
280
+ value: null
281
+ train_ref_file:
282
+ desc: null
283
+ value: null
284
+ use_fast_tokenizer:
285
+ desc: null
286
+ value: true
287
+ use_legacy_prediction_loop:
288
+ desc: null
289
+ value: false
290
+ validation_file:
291
+ desc: null
292
+ value: null
293
+ validation_ref_file:
294
+ desc: null
295
+ value: null
296
+ validation_split_percentage:
297
+ desc: null
298
+ value: 5
299
+ warmup_ratio:
300
+ desc: null
301
+ value: 0.0
302
+ warmup_steps:
303
+ desc: null
304
+ value: 5000
305
+ weight_decay:
306
+ desc: null
307
+ value: 0.0095
wandb/run-20210713_004910-3mu9pog5/files/output.log ADDED
@@ -0,0 +1,376 @@
1
+ /home/dat/pino/lib/python3.8/site-packages/jax/_src/numpy/lax_numpy.py:3114: UserWarning: Explicitly requested dtype <class 'jax._src.numpy.lax_numpy.int64'> requested in zeros is not available, and will be truncated to dtype int32. To enable more dtypes, set the jax_enable_x64 configuration option or the JAX_ENABLE_X64 shell environment variable. See https://github.com/google/jax#current-gotchas for more.
2
+ lax._check_user_dtype_supported(dtype, "zeros")
3
+ /home/dat/pino/lib/python3.8/site-packages/jax/lib/xla_bridge.py:382: UserWarning: jax.host_count has been renamed to jax.process_count. This alias will eventually be removed; please update your code.
4
+ warnings.warn(
5
+ /home/dat/pino/lib/python3.8/site-packages/jax/lib/xla_bridge.py:369: UserWarning: jax.host_id has been renamed to jax.process_index. This alias will eventually be removed; please update your code.
6
+ warnings.warn(
7
+ Epoch ... (1/5): 0%| | 0/5 [00:00<?, ?it/s]
8
+ Epoch ... (1/5): 0%| | 0/5 [02:22<?, ?it/s]
9
+ Traceback (most recent call last):
10
+ File "./run_mlm_flax.py", line 709, in <module>
11
+ state, train_metric, dropout_rngs = p_train_step(state, model_inputs, dropout_rngs)
12
+ File "/home/dat/pino/lib/python3.8/site-packages/jax/_src/traceback_util.py", line 183, in reraise_with_filtered_traceback
13
+ return fun(*args, **kwargs)
14
+ File "/home/dat/pino/lib/python3.8/site-packages/jax/_src/api.py", line 1647, in f_pmapped
15
+ out = pxla.xla_pmap(
16
+ File "/home/dat/pino/lib/python3.8/site-packages/jax/core.py", line 1620, in bind
17
+ return call_bind(self, fun, *args, **params)
18
+ File "/home/dat/pino/lib/python3.8/site-packages/jax/core.py", line 1551, in call_bind
19
+ outs = primitive.process(top_trace, fun, tracers, params)
20
+ File "/home/dat/pino/lib/python3.8/site-packages/jax/core.py", line 1623, in process
21
+ return trace.process_map(self, fun, tracers, params)
22
+ File "/home/dat/pino/lib/python3.8/site-packages/jax/core.py", line 606, in process_call
23
+ return primitive.impl(f, *tracers, **params)
24
+ File "/home/dat/pino/lib/python3.8/site-packages/jax/interpreters/pxla.py", line 624, in xla_pmap_impl
25
+ compiled_fun, fingerprint = parallel_callable(fun, backend, axis_name, axis_size,
26
+ File "/home/dat/pino/lib/python3.8/site-packages/jax/linear_util.py", line 262, in memoized_fun
27
+ ans = call(fun, *args)
28
+ File "/home/dat/pino/lib/python3.8/site-packages/jax/interpreters/pxla.py", line 899, in parallel_callable
29
+ compiled = xla.backend_compile(backend, built, compile_options)
30
+ File "/home/dat/pino/lib/python3.8/site-packages/jax/interpreters/xla.py", line 360, in backend_compile
31
+ return backend.compile(built_c, compile_options=options)
32
+ jax._src.traceback_util.UnfilteredStackTrace: RuntimeError: Resource exhausted: Ran out of memory in memory space hbm. Used 20.61G of 15.48G hbm. Exceeded hbm capacity by 5.13G.
33
+ Total hbm usage >= 21.13G:
34
+ reserved 530.00M
35
+ program 20.61G
36
+ arguments 0B
37
+ Output size 0B; shares 0B with arguments.
38
+ Program hbm requirement 20.61G:
39
+ global 900.0K
40
+ scoped 924.0K
41
+ HLO temp 20.61G (63.0% utilization: Unpadded (12.43G) Padded (19.71G), 4.4% fragmentation (918.84M))
42
+ Largest program allocations in hbm:
43
+ 1. Size: 1.54G
44
+ Operator: op_type="dot_general" op_name="pmap(train_step)/dot_general[ dimension_numbers=(((2,), (0,)), ((), ()))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/pino/lib/python3.8/site-packages/flax/linen/linear.py" source_line=175
45
+ Shape: bf16[4,4096,50358]{1,2,0:T(8,128)(2,1)}
46
+ Unpadded size: 1.54G
47
+ Extra memory due to padding: 64.0K (1.0x expansion)
48
+ XLA label: %fusion.3615.remat4 = bf16[4,4096,50358]{1,2,0:T(8,128)(2,1)} fusion(bf16[50358,768]{1,0:T(8,128)(2,1)} %get-tuple-element.22628, f32[768]{0:T(1024)} %fusion.10158, f32[768]{0:T(1024)} %fusion.10159, f32[4,4096]{1,0:T(4,128)} %get-tuple-element.20129, f32[...
49
+ Allocation type: HLO temp
50
+ ==========================
51
+ 2. Size: 360.00M
52
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
53
+ Unpadded size: 180.00M
54
+ Extra memory due to padding: 180.00M (2.0x expansion)
55
+ XLA label: %fusion.2444.remat_uncompressed = bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)} copy(bf16[4,12,60,64,512]{4,3,2,1,0:T(8,128)(2,1)} %fusion.2444.remat_compressed)
56
+ Allocation type: HLO temp
57
+ ==========================
58
+ 3. Size: 360.00M
59
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
60
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
61
+ Unpadded size: 180.00M
62
+ Extra memory due to padding: 180.00M (2.0x expansion)
63
+ XLA label: %fusion.2454.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2804, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7916, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
64
+ Allocation type: HLO temp
65
+ ==========================
66
+ 4. Size: 360.00M
67
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
68
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
69
+ Unpadded size: 180.00M
70
+ Extra memory due to padding: 180.00M (2.0x expansion)
71
+ XLA label: %fusion.2453.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2803, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7915, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
72
+ Allocation type: HLO temp
73
+ ==========================
74
+ 5. Size: 360.00M
75
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
76
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
77
+ Unpadded size: 180.00M
78
+ Extra memory due to padding: 180.00M (2.0x expansion)
79
+ XLA label: %fusion.2452.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2802, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7914, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
80
+ Allocation type: HLO temp
81
+ ==========================
82
+ 6. Size: 360.00M
83
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
84
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
85
+ Unpadded size: 180.00M
86
+ Extra memory due to padding: 180.00M (2.0x expansion)
87
+ XLA label: %fusion.2451.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2801, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7913, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
88
+ Allocation type: HLO temp
89
+ ==========================
90
+ 7. Size: 360.00M
91
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
92
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
93
+ Unpadded size: 180.00M
94
+ Extra memory due to padding: 180.00M (2.0x expansion)
95
+ XLA label: %fusion.2445 = bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)} fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2795, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7907, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)} %get-tuple-element.20342, f32[4,12,60,64,192]{3,4,2,1...
96
+ Allocation type: HLO temp
97
+ ==========================
98
+ 8. Size: 360.00M
99
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
100
+ Unpadded size: 180.00M
101
+ Extra memory due to padding: 180.00M (2.0x expansion)
102
+ XLA label: %fusion.2443.remat_uncompressed = bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)} copy(bf16[4,12,60,64,512]{4,3,2,1,0:T(8,128)(2,1)} %fusion.2443.remat_compressed)
103
+ Allocation type: HLO temp
104
+ ==========================
105
+ 9. Size: 360.00M
106
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
107
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
108
+ Unpadded size: 180.00M
109
+ Extra memory due to padding: 180.00M (2.0x expansion)
110
+ XLA label: %fusion.2450.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2800, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7912, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
111
+ Allocation type: HLO temp
112
+ ==========================
113
+ 10. Size: 360.00M
114
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
115
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
116
+ Unpadded size: 180.00M
117
+ Extra memory due to padding: 180.00M (2.0x expansion)
118
+ XLA label: %fusion.2449.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2799, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7911, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
119
+ Allocation type: HLO temp
120
+ ==========================
121
+ 11. Size: 360.00M
122
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
123
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
124
+ Unpadded size: 180.00M
125
+ Extra memory due to padding: 180.00M (2.0x expansion)
126
+ XLA label: %fusion.2448.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2798, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7910, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
127
+ Allocation type: HLO temp
128
+ ==========================
129
+ 12. Size: 360.00M
130
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
131
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
132
+ Unpadded size: 180.00M
133
+ Extra memory due to padding: 180.00M (2.0x expansion)
134
+ XLA label: %fusion.2447.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2797, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7909, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
135
+ Allocation type: HLO temp
136
+ ==========================
137
+ 13. Size: 360.00M
138
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
139
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
140
+ Unpadded size: 180.00M
141
+ Extra memory due to padding: 180.00M (2.0x expansion)
142
+ XLA label: %fusion.2446.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2796, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7908, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
143
+ Allocation type: HLO temp
144
+ ==========================
145
+ 14. Size: 270.00M
146
+ Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=584
147
+ Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
148
+ Unpadded size: 135.00M
149
+ Extra memory due to padding: 135.00M (2.0x expansion)
150
+ XLA label: %fusion.2689.remat3 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} fusion(f32[4,60,64,192]{2,3,1,0:T(8,128)} %get-tuple-element.20556, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.14362, bf16[4,12,60,192,64]{3,2,4,1,0:T(8,128)(2,1)} %fusion.2964), kind=kO...
151
+ Allocation type: HLO temp
152
+ ==========================
153
+ 15. Size: 270.00M
154
+ Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=584
155
+ Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
156
+ Unpadded size: 135.00M
157
+ Extra memory due to padding: 135.00M (2.0x expansion)
158
+ XLA label: %fusion.2690.remat3 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} fusion(f32[4,60,64,192]{2,3,1,0:T(8,128)} %get-tuple-element.20556, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.14296, bf16[4,12,60,192,64]{3,2,4,1,0:T(8,128)(2,1)} %fusion.2962), kind=kO...
159
+ Allocation type: HLO temp
160
+ ==========================
161
+ 16. Size: 270.00M
162
+ Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=584
163
+ Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
164
+ Unpadded size: 135.00M
165
+ Extra memory due to padding: 135.00M (2.0x expansion)
166
+ XLA label: %fusion.2688.remat3 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} fusion(f32[4,60,64,192]{2,3,1,0:T(8,128)} %get-tuple-element.20556, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.14428, bf16[4,12,60,192,64]{3,2,4,1,0:T(8,128)(2,1)} %fusion.2966), kind=kO...
167
+ Allocation type: HLO temp
168
+ ==========================
169
+ 17. Size: 270.00M
170
+ Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=584
171
+ Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
172
+ Unpadded size: 135.00M
173
+ Extra memory due to padding: 135.00M (2.0x expansion)
174
+ XLA label: %fusion.2691.remat3 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} fusion(f32[4,60,64,192]{2,3,1,0:T(8,128)} %get-tuple-element.20556, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.14230, bf16[4,12,60,192,64]{3,2,4,1,0:T(8,128)(2,1)} %fusion.2960), kind=kO...
175
+ Allocation type: HLO temp
176
+ ==========================
177
+ 18. Size: 270.00M
178
+ Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=584
179
+ Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
180
+ Unpadded size: 135.00M
181
+ Extra memory due to padding: 135.00M (2.0x expansion)
182
+ XLA label: %fusion.2692.remat3 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} fusion(f32[4,60,64,192]{2,3,1,0:T(8,128)} %get-tuple-element.20556, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.14164, bf16[4,12,60,192,64]{3,2,4,1,0:T(8,128)(2,1)} %fusion.2958), kind=kO...
183
+ Allocation type: HLO temp
184
+ ==========================
185
+ 19. Size: 270.00M
186
+ Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=584
187
+ Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
188
+ Unpadded size: 135.00M
189
+ Extra memory due to padding: 135.00M (2.0x expansion)
190
+ XLA label: %fusion.2693.remat3 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} fusion(f32[4,60,64,192]{2,3,1,0:T(8,128)} %get-tuple-element.20556, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.14098, bf16[4,12,60,192,64]{3,2,4,1,0:T(8,128)(2,1)} %fusion.2956), kind=kO...
191
+ Allocation type: HLO temp
192
+ ==========================
193
+ 20. Size: 270.00M
194
+ Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
195
+ Unpadded size: 135.00M
196
+ Extra memory due to padding: 135.00M (2.0x expansion)
197
+ XLA label: %fusion.2616.remat_uncompressed = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} copy(f32[4,12,60,64,192]{4,3,2,1,0:T(8,128)} %fusion.2616.remat_compressed)
198
+ Allocation type: HLO temp
199
+ ==========================
200
+ The stack trace below excludes JAX-internal frames.
201
+ The preceding is the original exception that occurred, unmodified.
202
+ --------------------
203
+ The above exception was the direct cause of the following exception:
204
+ Traceback (most recent call last):
205
+ File "./run_mlm_flax.py", line 709, in <module>
206
+ state, train_metric, dropout_rngs = p_train_step(state, model_inputs, dropout_rngs)
207
+ File "/home/dat/pino/lib/python3.8/site-packages/jax/interpreters/xla.py", line 360, in backend_compile
208
+ return backend.compile(built_c, compile_options=options)
209
+ RuntimeError: Resource exhausted: Ran out of memory in memory space hbm. Used 20.61G of 15.48G hbm. Exceeded hbm capacity by 5.13G.
210
+ Total hbm usage >= 21.13G:
211
+ reserved 530.00M
212
+ program 20.61G
213
+ arguments 0B
214
+ Output size 0B; shares 0B with arguments.
215
+ Program hbm requirement 20.61G:
216
+ global 900.0K
217
+ scoped 924.0K
218
+ HLO temp 20.61G (63.0% utilization: Unpadded (12.43G) Padded (19.71G), 4.4% fragmentation (918.84M))
219
+ Largest program allocations in hbm:
220
+ 1. Size: 1.54G
221
+ Operator: op_type="dot_general" op_name="pmap(train_step)/dot_general[ dimension_numbers=(((2,), (0,)), ((), ()))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/pino/lib/python3.8/site-packages/flax/linen/linear.py" source_line=175
222
+ Shape: bf16[4,4096,50358]{1,2,0:T(8,128)(2,1)}
223
+ Unpadded size: 1.54G
224
+ Extra memory due to padding: 64.0K (1.0x expansion)
225
+ XLA label: %fusion.3615.remat4 = bf16[4,4096,50358]{1,2,0:T(8,128)(2,1)} fusion(bf16[50358,768]{1,0:T(8,128)(2,1)} %get-tuple-element.22628, f32[768]{0:T(1024)} %fusion.10158, f32[768]{0:T(1024)} %fusion.10159, f32[4,4096]{1,0:T(4,128)} %get-tuple-element.20129, f32[...
226
+ Allocation type: HLO temp
227
+ ==========================
228
+ 2. Size: 360.00M
229
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
230
+ Unpadded size: 180.00M
231
+ Extra memory due to padding: 180.00M (2.0x expansion)
232
+ XLA label: %fusion.2444.remat_uncompressed = bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)} copy(bf16[4,12,60,64,512]{4,3,2,1,0:T(8,128)(2,1)} %fusion.2444.remat_compressed)
233
+ Allocation type: HLO temp
234
+ ==========================
235
+ 3. Size: 360.00M
236
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
237
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
238
+ Unpadded size: 180.00M
239
+ Extra memory due to padding: 180.00M (2.0x expansion)
240
+ XLA label: %fusion.2454.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2804, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7916, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
241
+ Allocation type: HLO temp
242
+ ==========================
243
+ 4. Size: 360.00M
244
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
245
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
246
+ Unpadded size: 180.00M
247
+ Extra memory due to padding: 180.00M (2.0x expansion)
248
+ XLA label: %fusion.2453.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2803, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7915, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
249
+ Allocation type: HLO temp
250
+ ==========================
251
+ 5. Size: 360.00M
252
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
253
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
254
+ Unpadded size: 180.00M
255
+ Extra memory due to padding: 180.00M (2.0x expansion)
256
+ XLA label: %fusion.2452.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2802, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7914, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
257
+ Allocation type: HLO temp
258
+ ==========================
259
+ 6. Size: 360.00M
260
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
261
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
262
+ Unpadded size: 180.00M
263
+ Extra memory due to padding: 180.00M (2.0x expansion)
264
+ XLA label: %fusion.2451.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2801, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7913, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
265
+ Allocation type: HLO temp
266
+ ==========================
267
+ 7. Size: 360.00M
268
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
269
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
270
+ Unpadded size: 180.00M
271
+ Extra memory due to padding: 180.00M (2.0x expansion)
272
+ XLA label: %fusion.2445 = bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)} fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2795, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7907, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)} %get-tuple-element.20342, f32[4,12,60,64,192]{3,4,2,1...
273
+ Allocation type: HLO temp
274
+ ==========================
275
+ 8. Size: 360.00M
276
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
277
+ Unpadded size: 180.00M
278
+ Extra memory due to padding: 180.00M (2.0x expansion)
279
+ XLA label: %fusion.2443.remat_uncompressed = bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)} copy(bf16[4,12,60,64,512]{4,3,2,1,0:T(8,128)(2,1)} %fusion.2443.remat_compressed)
280
+ Allocation type: HLO temp
281
+ ==========================
282
+ 9. Size: 360.00M
283
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
284
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
285
+ Unpadded size: 180.00M
286
+ Extra memory due to padding: 180.00M (2.0x expansion)
287
+ XLA label: %fusion.2450.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2800, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7912, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
288
+ Allocation type: HLO temp
289
+ ==========================
290
+ 10. Size: 360.00M
291
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
292
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
293
+ Unpadded size: 180.00M
294
+ Extra memory due to padding: 180.00M (2.0x expansion)
295
+ XLA label: %fusion.2449.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2799, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7911, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
296
+ Allocation type: HLO temp
297
+ ==========================
298
+ 11. Size: 360.00M
299
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
300
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
301
+ Unpadded size: 180.00M
302
+ Extra memory due to padding: 180.00M (2.0x expansion)
303
+ XLA label: %fusion.2448.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2798, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7910, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
304
+ Allocation type: HLO temp
305
+ ==========================
306
+ 12. Size: 360.00M
307
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
308
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
309
+ Unpadded size: 180.00M
310
+ Extra memory due to padding: 180.00M (2.0x expansion)
311
+ XLA label: %fusion.2447.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2797, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7909, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
312
+ Allocation type: HLO temp
313
+ ==========================
314
+ 13. Size: 360.00M
315
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
316
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
317
+ Unpadded size: 180.00M
318
+ Extra memory due to padding: 180.00M (2.0x expansion)
319
+ XLA label: %fusion.2446.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2796, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7908, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
320
+ Allocation type: HLO temp
321
+ ==========================
322
+ 14. Size: 270.00M
323
+ Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=584
324
+ Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
325
+ Unpadded size: 135.00M
326
+ Extra memory due to padding: 135.00M (2.0x expansion)
327
+ XLA label: %fusion.2689.remat3 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} fusion(f32[4,60,64,192]{2,3,1,0:T(8,128)} %get-tuple-element.20556, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.14362, bf16[4,12,60,192,64]{3,2,4,1,0:T(8,128)(2,1)} %fusion.2964), kind=kO...
328
+ Allocation type: HLO temp
329
+ ==========================
330
+ 15. Size: 270.00M
331
+ Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=584
332
+ Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
333
+ Unpadded size: 135.00M
334
+ Extra memory due to padding: 135.00M (2.0x expansion)
335
+ XLA label: %fusion.2690.remat3 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} fusion(f32[4,60,64,192]{2,3,1,0:T(8,128)} %get-tuple-element.20556, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.14296, bf16[4,12,60,192,64]{3,2,4,1,0:T(8,128)(2,1)} %fusion.2962), kind=kO...
336
+ Allocation type: HLO temp
337
+ ==========================
338
+ 16. Size: 270.00M
339
+ Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=584
340
+ Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
341
+ Unpadded size: 135.00M
342
+ Extra memory due to padding: 135.00M (2.0x expansion)
343
+ XLA label: %fusion.2688.remat3 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} fusion(f32[4,60,64,192]{2,3,1,0:T(8,128)} %get-tuple-element.20556, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.14428, bf16[4,12,60,192,64]{3,2,4,1,0:T(8,128)(2,1)} %fusion.2966), kind=kO...
344
+ Allocation type: HLO temp
345
+ ==========================
346
+ 17. Size: 270.00M
347
+ Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=584
348
+ Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
349
+ Unpadded size: 135.00M
350
+ Extra memory due to padding: 135.00M (2.0x expansion)
351
+ XLA label: %fusion.2691.remat3 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} fusion(f32[4,60,64,192]{2,3,1,0:T(8,128)} %get-tuple-element.20556, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.14230, bf16[4,12,60,192,64]{3,2,4,1,0:T(8,128)(2,1)} %fusion.2960), kind=kO...
352
+ Allocation type: HLO temp
353
+ ==========================
354
+ 18. Size: 270.00M
355
+ Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=584
356
+ Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
357
+ Unpadded size: 135.00M
358
+ Extra memory due to padding: 135.00M (2.0x expansion)
359
+ XLA label: %fusion.2692.remat3 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} fusion(f32[4,60,64,192]{2,3,1,0:T(8,128)} %get-tuple-element.20556, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.14164, bf16[4,12,60,192,64]{3,2,4,1,0:T(8,128)(2,1)} %fusion.2958), kind=kO...
360
+ Allocation type: HLO temp
361
+ ==========================
362
+ 19. Size: 270.00M
363
+ Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=584
364
+ Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
365
+ Unpadded size: 135.00M
366
+ Extra memory due to padding: 135.00M (2.0x expansion)
367
+ XLA label: %fusion.2693.remat3 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} fusion(f32[4,60,64,192]{2,3,1,0:T(8,128)} %get-tuple-element.20556, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.14098, bf16[4,12,60,192,64]{3,2,4,1,0:T(8,128)(2,1)} %fusion.2956), kind=kO...
368
+ Allocation type: HLO temp
369
+ ==========================
370
+ 20. Size: 270.00M
371
+ Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
372
+ Unpadded size: 135.00M
373
+ Extra memory due to padding: 135.00M (2.0x expansion)
374
+ XLA label: %fusion.2616.remat_uncompressed = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} copy(f32[4,12,60,64,192]{4,3,2,1,0:T(8,128)} %fusion.2616.remat_compressed)
375
+ Allocation type: HLO temp
376
+ ==========================
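
The largest allocation in the OOM report above is a bf16 tensor of shape [4, 4096, 50358], which looks like the masked-LM logits (per-device batch x sequence length x vocabulary). A hedged sanity check of the reported 1.54G, and why reducing the per-device batch size (run.sh uses 2, while this failed run used 4) shrinks it proportionally:

# Size of the bf16[4,4096,50358] allocation flagged as entry 1 in the log above.
batch, seq_len, vocab, bytes_per_bf16 = 4, 4096, 50358, 2
print(batch * seq_len * vocab * bytes_per_bf16 / 2**30)  # ~1.54 GiB; scales linearly with batch
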
wandb/run-20210713_004910-3mu9pog5/files/requirements.txt ADDED
@@ -0,0 +1,92 @@
+ absl-py==0.13.0
+ aiohttp==3.7.4.post0
+ astunparse==1.6.3
+ async-timeout==3.0.1
+ attrs==21.2.0
+ cachetools==4.2.2
+ certifi==2021.5.30
+ chardet==4.0.0
+ chex==0.0.8
+ click==8.0.1
+ configparser==5.0.2
+ cycler==0.10.0
+ datasets==1.9.1.dev0
+ dill==0.3.4
+ dm-tree==0.1.6
+ docker-pycreds==0.4.0
+ filelock==3.0.12
+ flatbuffers==1.12
+ flax==0.3.4
+ fsspec==2021.6.1
+ gast==0.4.0
+ gitdb==4.0.7
+ gitpython==3.1.18
+ google-auth-oauthlib==0.4.4
+ google-auth==1.32.1
+ google-pasta==0.2.0
+ grpcio==1.34.1
+ h5py==3.1.0
+ huggingface-hub==0.0.12
+ idna==2.10
+ jax==0.2.16
+ jaxlib==0.1.68
+ joblib==1.0.1
+ keras-nightly==2.5.0.dev2021032900
+ keras-preprocessing==1.1.2
+ kiwisolver==1.3.1
+ libtpu-nightly==0.1.dev20210615
+ markdown==3.3.4
+ matplotlib==3.4.2
+ msgpack==1.0.2
+ multidict==5.1.0
+ multiprocess==0.70.12.2
+ numpy==1.19.5
+ oauthlib==3.1.1
+ opt-einsum==3.3.0
+ optax==0.0.9
+ packaging==21.0
+ pandas==1.3.0
+ pathtools==0.1.2
+ pillow==8.3.1
+ pip==20.0.2
+ pkg-resources==0.0.0
+ promise==2.3
+ protobuf==3.17.3
+ psutil==5.8.0
+ pyarrow==4.0.1
+ pyasn1-modules==0.2.8
+ pyasn1==0.4.8
+ pyparsing==2.4.7
+ python-dateutil==2.8.1
+ pytz==2021.1
+ pyyaml==5.4.1
+ regex==2021.7.6
+ requests-oauthlib==1.3.0
+ requests==2.25.1
+ rsa==4.7.2
+ sacremoses==0.0.45
+ scipy==1.7.0
+ sentry-sdk==1.3.0
+ setuptools==44.0.0
+ shortuuid==1.0.1
+ six==1.15.0
+ smmap==4.0.0
+ subprocess32==3.5.4
+ tensorboard-data-server==0.6.1
+ tensorboard-plugin-wit==1.8.0
+ tensorboard==2.5.0
+ tensorflow-estimator==2.5.0
+ tensorflow==2.5.0
+ termcolor==1.1.0
+ tokenizers==0.10.3
+ toolz==0.11.1
+ tqdm==4.61.2
+ transformers==4.9.0.dev0
+ typing-extensions==3.7.4.3
+ urllib3==1.26.6
+ wandb==0.10.33
+ werkzeug==2.0.1
+ wheel==0.36.2
+ wrapt==1.12.1
+ xxhash==2.0.2
+ yarl==1.6.3
wandb/run-20210713_004910-3mu9pog5/files/wandb-metadata.json ADDED
@@ -0,0 +1,46 @@
+ {
+ "os": "Linux-5.4.0-1043-gcp-x86_64-with-glibc2.29",
+ "python": "3.8.10",
+ "heartbeatAt": "2021-07-13T00:49:12.868844",
+ "startedAt": "2021-07-13T00:49:10.806043",
+ "docker": null,
+ "cpu_count": 96,
+ "cuda": null,
+ "args": [
+ "--push_to_hub",
+ "--output_dir=./",
+ "--model_type=big_bird",
+ "--config_name=./",
+ "--tokenizer_name=./",
+ "--max_seq_length=4096",
+ "--weight_decay=0.0095",
+ "--warmup_steps=5000",
+ "--overwrite_output_dir",
+ "--adam_beta1=0.9",
+ "--adam_beta2=0.98",
+ "--logging_steps=500",
+ "--eval_steps=92768",
+ "--num_train_epochs=5",
+ "--preprocessing_num_workers=64",
+ "--save_steps=20000",
+ "--adafactor",
+ "--learning_rate=5e-5",
+ "--per_device_train_batch_size=4",
+ "--per_device_eval_batch_size=4",
+ "--save_total_limit=5",
+ "--dtype=bfloat16",
+ "--gradient_accumulation_steps=8"
+ ],
+ "state": "running",
+ "program": "./run_mlm_flax.py",
+ "codePath": "run_mlm_flax.py",
+ "git": {
+ "remote": "https://huggingface.co/flax-community/pino-roberta-base",
+ "commit": "4229c91b780cf07115cc6d04c16e393b0d2f508c"
+ },
+ "email": null,
+ "root": "/home/dat/pino-roberta-base",
+ "host": "t1v-n-f5c06ea1-w-0",
+ "username": "dat",
+ "executable": "/home/dat/pino/bin/python"
+ }
wandb/run-20210713_004910-3mu9pog5/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
+ {}
wandb/run-20210713_004910-3mu9pog5/logs/debug-internal.log ADDED
@@ -0,0 +1,166 @@
1
+ 2021-07-13 00:49:11,524 INFO MainThread:325318 [internal.py:wandb_internal():88] W&B internal server running at pid: 325318, started at: 2021-07-13 00:49:11.523864
2
+ 2021-07-13 00:49:11,526 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: check_version
3
+ 2021-07-13 00:49:11,526 INFO WriterThread:325318 [datastore.py:open_for_write():80] open: /home/dat/pino-roberta-base/wandb/run-20210713_004910-3mu9pog5/run-3mu9pog5.wandb
4
+ 2021-07-13 00:49:11,527 DEBUG SenderThread:325318 [sender.py:send():179] send: header
5
+ 2021-07-13 00:49:11,527 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: check_version
6
+ 2021-07-13 00:49:11,564 DEBUG SenderThread:325318 [sender.py:send():179] send: run
7
+ 2021-07-13 00:49:11,738 INFO SenderThread:325318 [dir_watcher.py:__init__():168] watching files in: /home/dat/pino-roberta-base/wandb/run-20210713_004910-3mu9pog5/files
8
+ 2021-07-13 00:49:11,739 INFO SenderThread:325318 [sender.py:_start_run_threads():716] run started: 3mu9pog5 with start time 1626137350
9
+ 2021-07-13 00:49:11,739 DEBUG SenderThread:325318 [sender.py:send():179] send: summary
10
+ 2021-07-13 00:49:11,739 INFO SenderThread:325318 [sender.py:_save_file():841] saving file wandb-summary.json with policy end
11
+ 2021-07-13 00:49:11,739 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: run_start
12
+ 2021-07-13 00:49:12,741 INFO Thread-8 :325318 [dir_watcher.py:_on_file_created():216] file/dir created: /home/dat/pino-roberta-base/wandb/run-20210713_004910-3mu9pog5/files/wandb-summary.json
13
+ 2021-07-13 00:49:12,868 DEBUG HandlerThread:325318 [meta.py:__init__():39] meta init
14
+ 2021-07-13 00:49:12,868 DEBUG HandlerThread:325318 [meta.py:__init__():53] meta init done
15
+ 2021-07-13 00:49:12,868 DEBUG HandlerThread:325318 [meta.py:probe():210] probe
16
+ 2021-07-13 00:49:12,870 DEBUG HandlerThread:325318 [meta.py:_setup_git():200] setup git
17
+ 2021-07-13 00:49:12,899 DEBUG HandlerThread:325318 [meta.py:_setup_git():207] setup git done
18
+ 2021-07-13 00:49:12,899 DEBUG HandlerThread:325318 [meta.py:_save_pip():57] save pip
19
+ 2021-07-13 00:49:12,899 DEBUG HandlerThread:325318 [meta.py:_save_pip():71] save pip done
20
+ 2021-07-13 00:49:12,899 DEBUG HandlerThread:325318 [meta.py:probe():252] probe done
21
+ 2021-07-13 00:49:12,903 DEBUG SenderThread:325318 [sender.py:send():179] send: files
22
+ 2021-07-13 00:49:12,903 INFO SenderThread:325318 [sender.py:_save_file():841] saving file wandb-metadata.json with policy now
23
+ 2021-07-13 00:49:12,910 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: stop_status
24
+ 2021-07-13 00:49:12,911 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: stop_status
25
+ 2021-07-13 00:49:13,042 DEBUG SenderThread:325318 [sender.py:send():179] send: config
26
+ 2021-07-13 00:49:13,043 DEBUG SenderThread:325318 [sender.py:send():179] send: config
27
+ 2021-07-13 00:49:13,043 DEBUG SenderThread:325318 [sender.py:send():179] send: config
28
+ 2021-07-13 00:49:13,348 INFO Thread-11 :325318 [upload_job.py:push():137] Uploaded file /tmp/tmpkvnk9e30wandb/65yetzns-wandb-metadata.json
29
+ 2021-07-13 00:49:13,741 INFO Thread-8 :325318 [dir_watcher.py:_on_file_created():216] file/dir created: /home/dat/pino-roberta-base/wandb/run-20210713_004910-3mu9pog5/files/output.log
30
+ 2021-07-13 00:49:13,741 INFO Thread-8 :325318 [dir_watcher.py:_on_file_created():216] file/dir created: /home/dat/pino-roberta-base/wandb/run-20210713_004910-3mu9pog5/files/requirements.txt
31
+ 2021-07-13 00:49:13,741 INFO Thread-8 :325318 [dir_watcher.py:_on_file_created():216] file/dir created: /home/dat/pino-roberta-base/wandb/run-20210713_004910-3mu9pog5/files/wandb-metadata.json
32
+ 2021-07-13 00:49:28,044 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: stop_status
33
+ 2021-07-13 00:49:28,044 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: stop_status
34
+ 2021-07-13 00:49:29,748 INFO Thread-8 :325318 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_004910-3mu9pog5/files/output.log
35
+ 2021-07-13 00:49:31,749 INFO Thread-8 :325318 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_004910-3mu9pog5/files/output.log
36
+ 2021-07-13 00:49:40,952 DEBUG SenderThread:325318 [sender.py:send():179] send: stats
37
+ 2021-07-13 00:49:42,754 INFO Thread-8 :325318 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_004910-3mu9pog5/files/config.yaml
38
+ 2021-07-13 00:49:43,176 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: stop_status
39
+ 2021-07-13 00:49:43,177 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: stop_status
40
+ 2021-07-13 00:49:58,307 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: stop_status
41
+ 2021-07-13 00:49:58,307 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: stop_status
42
+ 2021-07-13 00:50:11,029 DEBUG SenderThread:325318 [sender.py:send():179] send: stats
43
+ 2021-07-13 00:50:13,441 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: stop_status
44
+ 2021-07-13 00:50:13,442 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: stop_status
45
+ 2021-07-13 00:50:21,769 INFO Thread-8 :325318 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_004910-3mu9pog5/files/output.log
46
+ 2021-07-13 00:50:28,590 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: stop_status
47
+ 2021-07-13 00:50:28,590 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: stop_status
48
+ 2021-07-13 00:50:41,106 DEBUG SenderThread:325318 [sender.py:send():179] send: stats
49
+ 2021-07-13 00:50:43,758 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: stop_status
50
+ 2021-07-13 00:50:43,759 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: stop_status
51
+ 2021-07-13 00:50:58,908 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: stop_status
52
+ 2021-07-13 00:50:58,909 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: stop_status
53
+ 2021-07-13 00:51:11,187 DEBUG SenderThread:325318 [sender.py:send():179] send: stats
54
+ 2021-07-13 00:51:14,040 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: stop_status
55
+ 2021-07-13 00:51:14,041 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: stop_status
56
+ 2021-07-13 00:51:29,172 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: stop_status
57
+ 2021-07-13 00:51:29,173 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: stop_status
58
+ 2021-07-13 00:51:41,267 DEBUG SenderThread:325318 [sender.py:send():179] send: stats
59
+ 2021-07-13 00:51:44,303 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: stop_status
60
+ 2021-07-13 00:51:44,304 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: stop_status
61
+ 2021-07-13 00:51:53,809 INFO Thread-8 :325318 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_004910-3mu9pog5/files/output.log
62
+ 2021-07-13 00:51:54,323 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: poll_exit
63
+ 2021-07-13 00:51:54,323 DEBUG SenderThread:325318 [sender.py:send():179] send: telemetry
64
+ 2021-07-13 00:51:54,323 DEBUG SenderThread:325318 [sender.py:send():179] send: exit
65
+ 2021-07-13 00:51:54,323 INFO SenderThread:325318 [sender.py:send_exit():287] handling exit code: 1
66
+ 2021-07-13 00:51:54,323 INFO SenderThread:325318 [sender.py:send_exit():295] send defer
67
+ 2021-07-13 00:51:54,323 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: poll_exit
68
+ 2021-07-13 00:51:54,324 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: defer
69
+ 2021-07-13 00:51:54,324 INFO HandlerThread:325318 [handler.py:handle_request_defer():141] handle defer: 0
70
+ 2021-07-13 00:51:54,324 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: defer
71
+ 2021-07-13 00:51:54,324 INFO SenderThread:325318 [sender.py:send_request_defer():304] handle sender defer: 0
72
+ 2021-07-13 00:51:54,324 INFO SenderThread:325318 [sender.py:transition_state():308] send defer: 1
73
+ 2021-07-13 00:51:54,325 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: defer
74
+ 2021-07-13 00:51:54,325 INFO HandlerThread:325318 [handler.py:handle_request_defer():141] handle defer: 1
75
+ 2021-07-13 00:51:54,400 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: defer
76
+ 2021-07-13 00:51:54,400 INFO SenderThread:325318 [sender.py:send_request_defer():304] handle sender defer: 1
77
+ 2021-07-13 00:51:54,400 INFO SenderThread:325318 [sender.py:transition_state():308] send defer: 2
78
+ 2021-07-13 00:51:54,401 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: defer
79
+ 2021-07-13 00:51:54,401 INFO HandlerThread:325318 [handler.py:handle_request_defer():141] handle defer: 2
80
+ 2021-07-13 00:51:54,401 DEBUG SenderThread:325318 [sender.py:send():179] send: stats
81
+ 2021-07-13 00:51:54,401 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: defer
82
+ 2021-07-13 00:51:54,401 INFO SenderThread:325318 [sender.py:send_request_defer():304] handle sender defer: 2
83
+ 2021-07-13 00:51:54,401 INFO SenderThread:325318 [sender.py:transition_state():308] send defer: 3
84
+ 2021-07-13 00:51:54,402 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: defer
85
+ 2021-07-13 00:51:54,402 INFO HandlerThread:325318 [handler.py:handle_request_defer():141] handle defer: 3
86
+ 2021-07-13 00:51:54,402 DEBUG SenderThread:325318 [sender.py:send():179] send: summary
87
+ 2021-07-13 00:51:54,402 INFO SenderThread:325318 [sender.py:_save_file():841] saving file wandb-summary.json with policy end
88
+ 2021-07-13 00:51:54,403 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: defer
89
+ 2021-07-13 00:51:54,403 INFO SenderThread:325318 [sender.py:send_request_defer():304] handle sender defer: 3
90
+ 2021-07-13 00:51:54,403 INFO SenderThread:325318 [sender.py:transition_state():308] send defer: 4
91
+ 2021-07-13 00:51:54,403 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: defer
92
+ 2021-07-13 00:51:54,403 INFO HandlerThread:325318 [handler.py:handle_request_defer():141] handle defer: 4
93
+ 2021-07-13 00:51:54,403 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: defer
94
+ 2021-07-13 00:51:54,403 INFO SenderThread:325318 [sender.py:send_request_defer():304] handle sender defer: 4
95
+ 2021-07-13 00:51:54,426 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: poll_exit
96
+ 2021-07-13 00:51:54,590 INFO SenderThread:325318 [sender.py:transition_state():308] send defer: 5
97
+ 2021-07-13 00:51:54,590 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: poll_exit
98
+ 2021-07-13 00:51:54,591 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: defer
99
+ 2021-07-13 00:51:54,591 INFO HandlerThread:325318 [handler.py:handle_request_defer():141] handle defer: 5
100
+ 2021-07-13 00:51:54,591 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: defer
101
+ 2021-07-13 00:51:54,591 INFO SenderThread:325318 [sender.py:send_request_defer():304] handle sender defer: 5
102
+ 2021-07-13 00:51:54,591 INFO SenderThread:325318 [dir_watcher.py:finish():282] shutting down directory watcher
103
+ 2021-07-13 00:51:54,693 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: poll_exit
104
+ 2021-07-13 00:51:54,809 INFO Thread-8 :325318 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_004910-3mu9pog5/files/output.log
105
+ 2021-07-13 00:51:54,810 INFO SenderThread:325318 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_004910-3mu9pog5/files/config.yaml
106
+ 2021-07-13 00:51:54,810 INFO SenderThread:325318 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_004910-3mu9pog5/files/wandb-summary.json
107
+ 2021-07-13 00:51:54,810 INFO SenderThread:325318 [dir_watcher.py:finish():312] scan: /home/dat/pino-roberta-base/wandb/run-20210713_004910-3mu9pog5/files
108
+ 2021-07-13 00:51:54,810 INFO SenderThread:325318 [dir_watcher.py:finish():318] scan save: /home/dat/pino-roberta-base/wandb/run-20210713_004910-3mu9pog5/files/requirements.txt requirements.txt
109
+ 2021-07-13 00:51:54,810 INFO SenderThread:325318 [dir_watcher.py:finish():318] scan save: /home/dat/pino-roberta-base/wandb/run-20210713_004910-3mu9pog5/files/output.log output.log
110
+ 2021-07-13 00:51:54,811 INFO SenderThread:325318 [dir_watcher.py:finish():318] scan save: /home/dat/pino-roberta-base/wandb/run-20210713_004910-3mu9pog5/files/wandb-metadata.json wandb-metadata.json
111
+ 2021-07-13 00:51:54,811 INFO SenderThread:325318 [dir_watcher.py:finish():318] scan save: /home/dat/pino-roberta-base/wandb/run-20210713_004910-3mu9pog5/files/config.yaml config.yaml
112
+ 2021-07-13 00:51:54,811 INFO SenderThread:325318 [dir_watcher.py:finish():318] scan save: /home/dat/pino-roberta-base/wandb/run-20210713_004910-3mu9pog5/files/wandb-summary.json wandb-summary.json
113
+ 2021-07-13 00:51:54,811 INFO SenderThread:325318 [sender.py:transition_state():308] send defer: 6
114
+ 2021-07-13 00:51:54,811 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: poll_exit
115
+ 2021-07-13 00:51:54,812 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: defer
116
+ 2021-07-13 00:51:54,812 INFO HandlerThread:325318 [handler.py:handle_request_defer():141] handle defer: 6
117
+ 2021-07-13 00:51:54,812 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: defer
118
+ 2021-07-13 00:51:54,814 INFO SenderThread:325318 [sender.py:send_request_defer():304] handle sender defer: 6
119
+ 2021-07-13 00:51:54,814 INFO SenderThread:325318 [file_pusher.py:finish():177] shutting down file pusher
120
+ 2021-07-13 00:51:54,913 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: poll_exit
121
+ 2021-07-13 00:51:54,914 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: poll_exit
122
+ 2021-07-13 00:51:55,016 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: poll_exit
123
+ 2021-07-13 00:51:55,016 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: poll_exit
124
+ 2021-07-13 00:51:55,118 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: poll_exit
125
+ 2021-07-13 00:51:55,118 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: poll_exit
126
+ 2021-07-13 00:51:55,220 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: poll_exit
127
+ 2021-07-13 00:51:55,220 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: poll_exit
128
+ 2021-07-13 00:51:55,257 INFO Thread-14 :325318 [upload_job.py:push():137] Uploaded file /home/dat/pino-roberta-base/wandb/run-20210713_004910-3mu9pog5/files/config.yaml
129
+ 2021-07-13 00:51:55,266 INFO Thread-12 :325318 [upload_job.py:push():137] Uploaded file /home/dat/pino-roberta-base/wandb/run-20210713_004910-3mu9pog5/files/requirements.txt
130
+ 2021-07-13 00:51:55,277 INFO Thread-13 :325318 [upload_job.py:push():137] Uploaded file /home/dat/pino-roberta-base/wandb/run-20210713_004910-3mu9pog5/files/output.log
131
+ 2021-07-13 00:51:55,288 INFO Thread-15 :325318 [upload_job.py:push():137] Uploaded file /home/dat/pino-roberta-base/wandb/run-20210713_004910-3mu9pog5/files/wandb-summary.json
132
+ 2021-07-13 00:51:55,322 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: poll_exit
133
+ 2021-07-13 00:51:55,322 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: poll_exit
134
+ 2021-07-13 00:51:55,424 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: poll_exit
135
+ 2021-07-13 00:51:55,425 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: poll_exit
136
+ 2021-07-13 00:51:55,489 INFO Thread-7 :325318 [sender.py:transition_state():308] send defer: 7
137
+ 2021-07-13 00:51:55,489 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: defer
138
+ 2021-07-13 00:51:55,489 INFO HandlerThread:325318 [handler.py:handle_request_defer():141] handle defer: 7
139
+ 2021-07-13 00:51:55,489 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: defer
140
+ 2021-07-13 00:51:55,490 INFO SenderThread:325318 [sender.py:send_request_defer():304] handle sender defer: 7
141
+ 2021-07-13 00:51:55,526 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: poll_exit
142
+ 2021-07-13 00:51:55,771 INFO SenderThread:325318 [sender.py:transition_state():308] send defer: 8
143
+ 2021-07-13 00:51:55,772 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: poll_exit
144
+ 2021-07-13 00:51:55,772 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: defer
145
+ 2021-07-13 00:51:55,772 INFO HandlerThread:325318 [handler.py:handle_request_defer():141] handle defer: 8
146
+ 2021-07-13 00:51:55,772 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: defer
147
+ 2021-07-13 00:51:55,772 INFO SenderThread:325318 [sender.py:send_request_defer():304] handle sender defer: 8
148
+ 2021-07-13 00:51:55,773 INFO SenderThread:325318 [sender.py:transition_state():308] send defer: 9
149
+ 2021-07-13 00:51:55,773 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: defer
150
+ 2021-07-13 00:51:55,773 INFO HandlerThread:325318 [handler.py:handle_request_defer():141] handle defer: 9
151
+ 2021-07-13 00:51:55,773 DEBUG SenderThread:325318 [sender.py:send():179] send: final
152
+ 2021-07-13 00:51:55,773 DEBUG SenderThread:325318 [sender.py:send():179] send: footer
153
+ 2021-07-13 00:51:55,773 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: defer
154
+ 2021-07-13 00:51:55,773 INFO SenderThread:325318 [sender.py:send_request_defer():304] handle sender defer: 9
155
+ 2021-07-13 00:51:55,874 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: poll_exit
156
+ 2021-07-13 00:51:55,874 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: poll_exit
157
+ 2021-07-13 00:51:55,874 INFO SenderThread:325318 [file_pusher.py:join():182] waiting for file pusher
158
+ 2021-07-13 00:51:55,876 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: get_summary
159
+ 2021-07-13 00:51:55,877 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: sampled_history
160
+ 2021-07-13 00:51:55,877 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: shutdown
161
+ 2021-07-13 00:51:55,877 INFO HandlerThread:325318 [handler.py:finish():638] shutting down handler
162
+ 2021-07-13 00:51:56,774 INFO WriterThread:325318 [datastore.py:close():288] close: /home/dat/pino-roberta-base/wandb/run-20210713_004910-3mu9pog5/run-3mu9pog5.wandb
163
+ 2021-07-13 00:51:56,875 INFO SenderThread:325318 [sender.py:finish():945] shutting down sender
164
+ 2021-07-13 00:51:56,875 INFO SenderThread:325318 [file_pusher.py:finish():177] shutting down file pusher
165
+ 2021-07-13 00:51:56,875 INFO SenderThread:325318 [file_pusher.py:join():182] waiting for file pusher
166
+ 2021-07-13 00:51:56,877 INFO MainThread:325318 [internal.py:handle_exit():78] Internal process exited
wandb/run-20210713_004910-3mu9pog5/logs/debug.log ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2021-07-13 00:49:10,807 INFO MainThread:323744 [wandb_setup.py:_flush():69] setting env: {}
2
+ 2021-07-13 00:49:10,807 INFO MainThread:323744 [wandb_setup.py:_flush():69] setting login settings: {}
3
+ 2021-07-13 00:49:10,807 INFO MainThread:323744 [wandb_init.py:_log_setup():337] Logging user logs to /home/dat/pino-roberta-base/wandb/run-20210713_004910-3mu9pog5/logs/debug.log
4
+ 2021-07-13 00:49:10,807 INFO MainThread:323744 [wandb_init.py:_log_setup():338] Logging internal logs to /home/dat/pino-roberta-base/wandb/run-20210713_004910-3mu9pog5/logs/debug-internal.log
5
+ 2021-07-13 00:49:10,808 INFO MainThread:323744 [wandb_init.py:init():370] calling init triggers
6
+ 2021-07-13 00:49:10,808 INFO MainThread:323744 [wandb_init.py:init():375] wandb.init called with sweep_config: {}
7
+ config: {}
8
+ 2021-07-13 00:49:10,808 INFO MainThread:323744 [wandb_init.py:init():419] starting backend
9
+ 2021-07-13 00:49:10,808 INFO MainThread:323744 [backend.py:_multiprocessing_setup():70] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
10
+ 2021-07-13 00:49:10,863 INFO MainThread:323744 [backend.py:ensure_launched():135] starting backend process...
11
+ 2021-07-13 00:49:10,917 INFO MainThread:323744 [backend.py:ensure_launched():139] started backend process with pid: 325318
12
+ 2021-07-13 00:49:10,919 INFO MainThread:323744 [wandb_init.py:init():424] backend started and connected
13
+ 2021-07-13 00:49:10,923 INFO MainThread:323744 [wandb_init.py:init():472] updated telemetry
14
+ 2021-07-13 00:49:10,924 INFO MainThread:323744 [wandb_init.py:init():491] communicating current version
15
+ 2021-07-13 00:49:11,562 INFO MainThread:323744 [wandb_init.py:init():496] got version response
16
+ 2021-07-13 00:49:11,563 INFO MainThread:323744 [wandb_init.py:init():504] communicating run to backend with 30 second timeout
17
+ 2021-07-13 00:49:11,739 INFO MainThread:323744 [wandb_init.py:init():529] starting run threads in backend
18
+ 2021-07-13 00:49:12,907 INFO MainThread:323744 [wandb_run.py:_console_start():1623] atexit reg
19
+ 2021-07-13 00:49:12,907 INFO MainThread:323744 [wandb_run.py:_redirect():1497] redirect: SettingsConsole.REDIRECT
20
+ 2021-07-13 00:49:12,908 INFO MainThread:323744 [wandb_run.py:_redirect():1502] Redirecting console.
21
+ 2021-07-13 00:49:12,910 INFO MainThread:323744 [wandb_run.py:_redirect():1558] Redirects installed.
22
+ 2021-07-13 00:49:12,910 INFO MainThread:323744 [wandb_init.py:init():554] run started, returning control to user process
23
+ 2021-07-13 00:49:12,916 INFO MainThread:323744 [wandb_run.py:_config_callback():872] config_cb None None {'output_dir': './', 'overwrite_output_dir': True, 'do_train': False, 'do_eval': False, 'do_predict': False, 'evaluation_strategy': 'IntervalStrategy.NO', 'prediction_loss_only': False, 'per_device_train_batch_size': 4, 'per_device_eval_batch_size': 4, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 8, 'eval_accumulation_steps': None, 'learning_rate': 5e-05, 'weight_decay': 0.0095, 'adam_beta1': 0.9, 'adam_beta2': 0.98, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 5.0, 'max_steps': -1, 'lr_scheduler_type': 'SchedulerType.LINEAR', 'warmup_ratio': 0.0, 'warmup_steps': 5000, 'log_level': -1, 'log_level_replica': -1, 'log_on_each_node': True, 'logging_dir': './runs/Jul13_00-48-19_t1v-n-f5c06ea1-w-0', 'logging_strategy': 'IntervalStrategy.STEPS', 'logging_first_step': False, 'logging_steps': 500, 'save_strategy': 'IntervalStrategy.STEPS', 'save_steps': 20000, 'save_total_limit': 5, 'save_on_each_node': False, 'no_cuda': False, 'seed': 42, 'fp16': False, 'fp16_opt_level': 'O1', 'fp16_backend': 'auto', 'fp16_full_eval': False, 'local_rank': -1, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 92768, 'dataloader_num_workers': 0, 'past_index': -1, 'run_name': './', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'sharded_ddp': [], 'deepspeed': None, 'label_smoothing_factor': 0.0, 'adafactor': True, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['tensorboard', 'wandb'], 'ddp_find_unused_parameters': None, 'dataloader_pin_memory': True, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': True, 'resume_from_checkpoint': None, 'push_to_hub_model_id': '', 'push_to_hub_organization': None, 'push_to_hub_token': None, 'mp_parameters': ''}
24
+ 2021-07-13 00:49:12,917 INFO MainThread:323744 [wandb_run.py:_config_callback():872] config_cb None None {'model_name_or_path': None, 'model_type': 'big_bird', 'config_name': './', 'tokenizer_name': './', 'cache_dir': None, 'use_fast_tokenizer': True, 'dtype': 'bfloat16'}
25
+ 2021-07-13 00:49:12,919 INFO MainThread:323744 [wandb_run.py:_config_callback():872] config_cb None None {'dataset_name': None, 'dataset_config_name': None, 'train_file': None, 'validation_file': None, 'train_ref_file': None, 'validation_ref_file': None, 'overwrite_cache': False, 'validation_split_percentage': 5, 'max_seq_length': 4096, 'preprocessing_num_workers': 64, 'mlm_probability': 0.15, 'pad_to_max_length': False, 'line_by_line': False}
26
+ 2021-07-13 00:51:51,794 INFO MainThread:323744 [wandb_run.py:_atexit_cleanup():1593] got exitcode: 1
27
+ 2021-07-13 00:51:51,796 INFO MainThread:323744 [wandb_run.py:_restore():1565] restore
28
+ 2021-07-13 00:51:54,324 INFO MainThread:323744 [wandb_run.py:_wait_for_finish():1715] got exit ret: file_counts {
29
+ wandb_count: 1
30
+ }
31
+ pusher_stats {
32
+ uploaded_bytes: 1417
33
+ total_bytes: 1417
34
+ }
35
+
36
+ 2021-07-13 00:51:54,591 INFO MainThread:323744 [wandb_run.py:_wait_for_finish():1715] got exit ret: file_counts {
37
+ wandb_count: 1
38
+ }
39
+ pusher_stats {
40
+ uploaded_bytes: 1417
41
+ total_bytes: 1417
42
+ }
43
+
44
+ 2021-07-13 00:51:54,812 INFO MainThread:323744 [wandb_run.py:_wait_for_finish():1715] got exit ret: file_counts {
45
+ wandb_count: 4
46
+ }
47
+ pusher_stats {
48
+ uploaded_bytes: 1417
49
+ total_bytes: 40394
50
+ }
51
+
52
+ 2021-07-13 00:51:54,915 INFO MainThread:323744 [wandb_run.py:_wait_for_finish():1715] got exit ret: file_counts {
53
+ wandb_count: 5
54
+ }
55
+ pusher_stats {
56
+ uploaded_bytes: 1417
57
+ total_bytes: 40396
58
+ }
59
+
60
+ 2021-07-13 00:51:55,017 INFO MainThread:323744 [wandb_run.py:_wait_for_finish():1715] got exit ret: file_counts {
61
+ wandb_count: 5
62
+ }
63
+ pusher_stats {
64
+ uploaded_bytes: 40396
65
+ total_bytes: 40396
66
+ }
67
+
68
+ 2021-07-13 00:51:55,119 INFO MainThread:323744 [wandb_run.py:_wait_for_finish():1715] got exit ret: file_counts {
69
+ wandb_count: 5
70
+ }
71
+ pusher_stats {
72
+ uploaded_bytes: 40396
73
+ total_bytes: 40396
74
+ }
75
+
76
+ 2021-07-13 00:51:55,221 INFO MainThread:323744 [wandb_run.py:_wait_for_finish():1715] got exit ret: file_counts {
77
+ wandb_count: 5
78
+ }
79
+ pusher_stats {
80
+ uploaded_bytes: 40396
81
+ total_bytes: 40396
82
+ }
83
+
84
+ 2021-07-13 00:51:55,323 INFO MainThread:323744 [wandb_run.py:_wait_for_finish():1715] got exit ret: file_counts {
85
+ wandb_count: 5
86
+ }
87
+ pusher_stats {
88
+ uploaded_bytes: 40396
89
+ total_bytes: 40396
90
+ }
91
+
92
+ 2021-07-13 00:51:55,425 INFO MainThread:323744 [wandb_run.py:_wait_for_finish():1715] got exit ret: file_counts {
93
+ wandb_count: 5
94
+ }
95
+ pusher_stats {
96
+ uploaded_bytes: 40396
97
+ total_bytes: 40396
98
+ }
99
+
100
+ 2021-07-13 00:51:55,772 INFO MainThread:323744 [wandb_run.py:_wait_for_finish():1715] got exit ret: file_counts {
101
+ wandb_count: 5
102
+ }
103
+ pusher_stats {
104
+ uploaded_bytes: 40396
105
+ total_bytes: 40396
106
+ }
107
+
108
+ 2021-07-13 00:51:55,875 INFO MainThread:323744 [wandb_run.py:_wait_for_finish():1715] got exit ret: done: true
109
+ exit_result {
110
+ }
111
+ file_counts {
112
+ wandb_count: 5
113
+ }
114
+ pusher_stats {
115
+ uploaded_bytes: 40396
116
+ total_bytes: 40396
117
+ }
118
+
119
+ 2021-07-13 00:51:57,265 INFO MainThread:323744 [wandb_run.py:_show_files():1937] logging synced files
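The config_cb entries above record the effective TrainingArguments for this run (per_device_train_batch_size 4, gradient_accumulation_steps 8, max_seq_length 4096) before it exited with code 1. As a rough, hedged sketch of what those numbers imply per optimizer step (assuming the 8 cores of a single TPU v3-8 host, which the log does not state explicitly):

    # Hypothetical back-of-the-envelope check; not part of run_mlm_flax.py.
    per_device_batch = 4   # per_device_train_batch_size from the logged config
    grad_accum = 8         # gradient_accumulation_steps
    seq_len = 4096         # max_seq_length
    n_devices = 8          # assumption: one TPU v3-8 host

    tokens_per_microbatch = per_device_batch * seq_len                      # 16,384 per device
    tokens_per_step = tokens_per_microbatch * n_devices * grad_accum        # 1,048,576 per update
    print(tokens_per_microbatch, tokens_per_step)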
wandb/run-20210713_004910-3mu9pog5/run-3mu9pog5.wandb ADDED
Binary file (37.4 kB).
wandb/run-20210713_005301-2ilkub1o/files/config.yaml ADDED
@@ -0,0 +1,307 @@
1
+ wandb_version: 1
2
+
3
+ _wandb:
4
+ desc: null
5
+ value:
6
+ cli_version: 0.10.33
7
+ framework: huggingface
8
+ huggingface_version: 4.9.0.dev0
9
+ is_jupyter_run: false
10
+ is_kaggle_kernel: false
11
+ python_version: 3.8.10
12
+ t:
13
+ 1:
14
+ - 3
15
+ - 11
16
+ 2:
17
+ - 3
18
+ - 11
19
+ 4: 3.8.10
20
+ 5: 0.10.33
21
+ 6: 4.9.0.dev0
22
+ 8:
23
+ - 5
24
+ adafactor:
25
+ desc: null
26
+ value: true
27
+ adam_beta1:
28
+ desc: null
29
+ value: 0.9
30
+ adam_beta2:
31
+ desc: null
32
+ value: 0.98
33
+ adam_epsilon:
34
+ desc: null
35
+ value: 1.0e-08
36
+ cache_dir:
37
+ desc: null
38
+ value: null
39
+ config_name:
40
+ desc: null
41
+ value: ./
42
+ dataloader_drop_last:
43
+ desc: null
44
+ value: false
45
+ dataloader_num_workers:
46
+ desc: null
47
+ value: 0
48
+ dataloader_pin_memory:
49
+ desc: null
50
+ value: true
51
+ dataset_config_name:
52
+ desc: null
53
+ value: null
54
+ dataset_name:
55
+ desc: null
56
+ value: null
57
+ ddp_find_unused_parameters:
58
+ desc: null
59
+ value: null
60
+ debug:
61
+ desc: null
62
+ value: []
63
+ deepspeed:
64
+ desc: null
65
+ value: null
66
+ disable_tqdm:
67
+ desc: null
68
+ value: false
69
+ do_eval:
70
+ desc: null
71
+ value: false
72
+ do_predict:
73
+ desc: null
74
+ value: false
75
+ do_train:
76
+ desc: null
77
+ value: false
78
+ dtype:
79
+ desc: null
80
+ value: bfloat16
81
+ eval_accumulation_steps:
82
+ desc: null
83
+ value: null
84
+ eval_steps:
85
+ desc: null
86
+ value: 92768
87
+ evaluation_strategy:
88
+ desc: null
89
+ value: IntervalStrategy.NO
90
+ fp16:
91
+ desc: null
92
+ value: false
93
+ fp16_backend:
94
+ desc: null
95
+ value: auto
96
+ fp16_full_eval:
97
+ desc: null
98
+ value: false
99
+ fp16_opt_level:
100
+ desc: null
101
+ value: O1
102
+ gradient_accumulation_steps:
103
+ desc: null
104
+ value: 8
105
+ greater_is_better:
106
+ desc: null
107
+ value: null
108
+ group_by_length:
109
+ desc: null
110
+ value: false
111
+ ignore_data_skip:
112
+ desc: null
113
+ value: false
114
+ label_names:
115
+ desc: null
116
+ value: null
117
+ label_smoothing_factor:
118
+ desc: null
119
+ value: 0.0
120
+ learning_rate:
121
+ desc: null
122
+ value: 5.0e-05
123
+ length_column_name:
124
+ desc: null
125
+ value: length
126
+ line_by_line:
127
+ desc: null
128
+ value: false
129
+ load_best_model_at_end:
130
+ desc: null
131
+ value: false
132
+ local_rank:
133
+ desc: null
134
+ value: -1
135
+ log_level:
136
+ desc: null
137
+ value: -1
138
+ log_level_replica:
139
+ desc: null
140
+ value: -1
141
+ log_on_each_node:
142
+ desc: null
143
+ value: true
144
+ logging_dir:
145
+ desc: null
146
+ value: ./runs/Jul13_00-52-13_t1v-n-f5c06ea1-w-0
147
+ logging_first_step:
148
+ desc: null
149
+ value: false
150
+ logging_steps:
151
+ desc: null
152
+ value: 500
153
+ logging_strategy:
154
+ desc: null
155
+ value: IntervalStrategy.STEPS
156
+ lr_scheduler_type:
157
+ desc: null
158
+ value: SchedulerType.LINEAR
159
+ max_grad_norm:
160
+ desc: null
161
+ value: 1.0
162
+ max_seq_length:
163
+ desc: null
164
+ value: 4096
165
+ max_steps:
166
+ desc: null
167
+ value: -1
168
+ metric_for_best_model:
169
+ desc: null
170
+ value: null
171
+ mlm_probability:
172
+ desc: null
173
+ value: 0.15
174
+ model_name_or_path:
175
+ desc: null
176
+ value: null
177
+ model_type:
178
+ desc: null
179
+ value: big_bird
180
+ mp_parameters:
181
+ desc: null
182
+ value: ''
183
+ no_cuda:
184
+ desc: null
185
+ value: false
186
+ num_train_epochs:
187
+ desc: null
188
+ value: 5.0
189
+ output_dir:
190
+ desc: null
191
+ value: ./
192
+ overwrite_cache:
193
+ desc: null
194
+ value: false
195
+ overwrite_output_dir:
196
+ desc: null
197
+ value: true
198
+ pad_to_max_length:
199
+ desc: null
200
+ value: false
201
+ past_index:
202
+ desc: null
203
+ value: -1
204
+ per_device_eval_batch_size:
205
+ desc: null
206
+ value: 4
207
+ per_device_train_batch_size:
208
+ desc: null
209
+ value: 4
210
+ per_gpu_eval_batch_size:
211
+ desc: null
212
+ value: null
213
+ per_gpu_train_batch_size:
214
+ desc: null
215
+ value: null
216
+ prediction_loss_only:
217
+ desc: null
218
+ value: false
219
+ preprocessing_num_workers:
220
+ desc: null
221
+ value: 64
222
+ push_to_hub:
223
+ desc: null
224
+ value: true
225
+ push_to_hub_model_id:
226
+ desc: null
227
+ value: ''
228
+ push_to_hub_organization:
229
+ desc: null
230
+ value: null
231
+ push_to_hub_token:
232
+ desc: null
233
+ value: null
234
+ remove_unused_columns:
235
+ desc: null
236
+ value: true
237
+ report_to:
238
+ desc: null
239
+ value:
240
+ - tensorboard
241
+ - wandb
242
+ resume_from_checkpoint:
243
+ desc: null
244
+ value: null
245
+ run_name:
246
+ desc: null
247
+ value: ./
248
+ save_on_each_node:
249
+ desc: null
250
+ value: false
251
+ save_steps:
252
+ desc: null
253
+ value: 20000
254
+ save_strategy:
255
+ desc: null
256
+ value: IntervalStrategy.STEPS
257
+ save_total_limit:
258
+ desc: null
259
+ value: 5
260
+ seed:
261
+ desc: null
262
+ value: 42
263
+ sharded_ddp:
264
+ desc: null
265
+ value: []
266
+ skip_memory_metrics:
267
+ desc: null
268
+ value: true
269
+ tokenizer_name:
270
+ desc: null
271
+ value: ./
272
+ tpu_metrics_debug:
273
+ desc: null
274
+ value: false
275
+ tpu_num_cores:
276
+ desc: null
277
+ value: null
278
+ train_file:
279
+ desc: null
280
+ value: null
281
+ train_ref_file:
282
+ desc: null
283
+ value: null
284
+ use_fast_tokenizer:
285
+ desc: null
286
+ value: true
287
+ use_legacy_prediction_loop:
288
+ desc: null
289
+ value: false
290
+ validation_file:
291
+ desc: null
292
+ value: null
293
+ validation_ref_file:
294
+ desc: null
295
+ value: null
296
+ validation_split_percentage:
297
+ desc: null
298
+ value: 5
299
+ warmup_ratio:
300
+ desc: null
301
+ value: 0.0
302
+ warmup_steps:
303
+ desc: null
304
+ value: 5000
305
+ weight_decay:
306
+ desc: null
307
+ value: 0.0095
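The output.log added below records the failure behind that exit code: the pmapped train_step exhausts TPU HBM (20.61G used against 15.48G available). The allocation sizes it reports are consistent with the config above; a minimal sketch of the arithmetic, assuming 2 bytes per bf16 element (the 50,358 figure is the vocabulary size implied by the logits shape in the log, not a value from the config):

    # Hypothetical size check for the two largest allocations in the OOM report.
    bytes_bf16 = 2

    # 1. MLM logits bf16[4, 4096, 50358] -> ~1.54 GiB, matching "Size: 1.54G".
    logits_bytes = 4 * 4096 * 50358 * bytes_bf16
    print(logits_bytes / 2**30)   # ~1.54

    # 2. BigBird block-sparse attention intermediate bf16[4, 12, 60, 64, 512]
    #    -> ~180 MiB unpadded, doubled to 360M by padding in the report.
    attn_bytes = 4 * 12 * 60 * 64 * 512 * bytes_bf16
    print(attn_bytes / 2**20)     # ~180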
wandb/run-20210713_005301-2ilkub1o/files/output.log ADDED
@@ -0,0 +1,376 @@
1
+ /home/dat/pino/lib/python3.8/site-packages/jax/_src/numpy/lax_numpy.py:3114: UserWarning: Explicitly requested dtype <class 'jax._src.numpy.lax_numpy.int64'> requested in zeros is not available, and will be truncated to dtype int32. To enable more dtypes, set the jax_enable_x64 configuration option or the JAX_ENABLE_X64 shell environment variable. See https://github.com/google/jax#current-gotchas for more.
2
+ lax._check_user_dtype_supported(dtype, "zeros")
3
+ /home/dat/pino/lib/python3.8/site-packages/jax/lib/xla_bridge.py:382: UserWarning: jax.host_count has been renamed to jax.process_count. This alias will eventually be removed; please update your code.
4
+ warnings.warn(
5
+ /home/dat/pino/lib/python3.8/site-packages/jax/lib/xla_bridge.py:369: UserWarning: jax.host_id has been renamed to jax.process_index. This alias will eventually be removed; please update your code.
6
+ warnings.warn(
7
+ Epoch ... (1/5): 0%| | 0/5 [00:00<?, ?it/s]
8
+ Epoch ... (1/5): 0%| | 0/5 [02:23<?, ?it/s]
9
+ Traceback (most recent call last):
10
+ File "./run_mlm_flax.py", line 709, in <module>
11
+ state, train_metric, dropout_rngs = p_train_step(state, model_inputs, dropout_rngs)
12
+ File "/home/dat/pino/lib/python3.8/site-packages/jax/_src/traceback_util.py", line 183, in reraise_with_filtered_traceback
13
+ return fun(*args, **kwargs)
14
+ File "/home/dat/pino/lib/python3.8/site-packages/jax/_src/api.py", line 1647, in f_pmapped
15
+ out = pxla.xla_pmap(
16
+ File "/home/dat/pino/lib/python3.8/site-packages/jax/core.py", line 1620, in bind
17
+ return call_bind(self, fun, *args, **params)
18
+ File "/home/dat/pino/lib/python3.8/site-packages/jax/core.py", line 1551, in call_bind
19
+ outs = primitive.process(top_trace, fun, tracers, params)
20
+ File "/home/dat/pino/lib/python3.8/site-packages/jax/core.py", line 1623, in process
21
+ return trace.process_map(self, fun, tracers, params)
22
+ File "/home/dat/pino/lib/python3.8/site-packages/jax/core.py", line 606, in process_call
23
+ return primitive.impl(f, *tracers, **params)
24
+ File "/home/dat/pino/lib/python3.8/site-packages/jax/interpreters/pxla.py", line 624, in xla_pmap_impl
25
+ compiled_fun, fingerprint = parallel_callable(fun, backend, axis_name, axis_size,
26
+ File "/home/dat/pino/lib/python3.8/site-packages/jax/linear_util.py", line 262, in memoized_fun
27
+ ans = call(fun, *args)
28
+ File "/home/dat/pino/lib/python3.8/site-packages/jax/interpreters/pxla.py", line 899, in parallel_callable
29
+ compiled = xla.backend_compile(backend, built, compile_options)
30
+ File "/home/dat/pino/lib/python3.8/site-packages/jax/interpreters/xla.py", line 360, in backend_compile
31
+ return backend.compile(built_c, compile_options=options)
32
+ jax._src.traceback_util.UnfilteredStackTrace: RuntimeError: Resource exhausted: Ran out of memory in memory space hbm. Used 20.61G of 15.48G hbm. Exceeded hbm capacity by 5.13G.
33
+ Total hbm usage >= 21.13G:
34
+ reserved 530.00M
35
+ program 20.61G
36
+ arguments 0B
37
+ Output size 0B; shares 0B with arguments.
38
+ Program hbm requirement 20.61G:
39
+ global 900.0K
40
+ scoped 924.0K
41
+ HLO temp 20.61G (63.0% utilization: Unpadded (12.43G) Padded (19.71G), 4.4% fragmentation (918.84M))
42
+ Largest program allocations in hbm:
43
+ 1. Size: 1.54G
44
+ Operator: op_type="dot_general" op_name="pmap(train_step)/dot_general[ dimension_numbers=(((2,), (0,)), ((), ()))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/pino/lib/python3.8/site-packages/flax/linen/linear.py" source_line=175
45
+ Shape: bf16[4,4096,50358]{1,2,0:T(8,128)(2,1)}
46
+ Unpadded size: 1.54G
47
+ Extra memory due to padding: 64.0K (1.0x expansion)
48
+ XLA label: %fusion.3615.remat4 = bf16[4,4096,50358]{1,2,0:T(8,128)(2,1)} fusion(bf16[50358,768]{1,0:T(8,128)(2,1)} %get-tuple-element.22628, f32[768]{0:T(1024)} %fusion.10158, f32[768]{0:T(1024)} %fusion.10159, f32[4,4096]{1,0:T(4,128)} %get-tuple-element.20129, f32[...
49
+ Allocation type: HLO temp
50
+ ==========================
51
+ 2. Size: 360.00M
52
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
53
+ Unpadded size: 180.00M
54
+ Extra memory due to padding: 180.00M (2.0x expansion)
55
+ XLA label: %fusion.2444.remat_uncompressed = bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)} copy(bf16[4,12,60,64,512]{4,3,2,1,0:T(8,128)(2,1)} %fusion.2444.remat_compressed)
56
+ Allocation type: HLO temp
57
+ ==========================
58
+ 3. Size: 360.00M
59
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
60
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
61
+ Unpadded size: 180.00M
62
+ Extra memory due to padding: 180.00M (2.0x expansion)
63
+ XLA label: %fusion.2454.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2804, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7916, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
64
+ Allocation type: HLO temp
65
+ ==========================
66
+ 4. Size: 360.00M
67
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
68
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
69
+ Unpadded size: 180.00M
70
+ Extra memory due to padding: 180.00M (2.0x expansion)
71
+ XLA label: %fusion.2453.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2803, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7915, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
72
+ Allocation type: HLO temp
73
+ ==========================
74
+ 5. Size: 360.00M
75
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
76
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
77
+ Unpadded size: 180.00M
78
+ Extra memory due to padding: 180.00M (2.0x expansion)
79
+ XLA label: %fusion.2452.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2802, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7914, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
80
+ Allocation type: HLO temp
81
+ ==========================
82
+ 6. Size: 360.00M
83
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
84
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
85
+ Unpadded size: 180.00M
86
+ Extra memory due to padding: 180.00M (2.0x expansion)
87
+ XLA label: %fusion.2451.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2801, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7913, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
88
+ Allocation type: HLO temp
89
+ ==========================
90
+ 7. Size: 360.00M
91
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
92
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
93
+ Unpadded size: 180.00M
94
+ Extra memory due to padding: 180.00M (2.0x expansion)
95
+ XLA label: %fusion.2445 = bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)} fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2795, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7907, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)} %get-tuple-element.20342, f32[4,12,60,64,192]{3,4,2,1...
96
+ Allocation type: HLO temp
97
+ ==========================
98
+ 8. Size: 360.00M
99
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
100
+ Unpadded size: 180.00M
101
+ Extra memory due to padding: 180.00M (2.0x expansion)
102
+ XLA label: %fusion.2443.remat_uncompressed = bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)} copy(bf16[4,12,60,64,512]{4,3,2,1,0:T(8,128)(2,1)} %fusion.2443.remat_compressed)
103
+ Allocation type: HLO temp
104
+ ==========================
105
+ 9. Size: 360.00M
106
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
107
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
108
+ Unpadded size: 180.00M
109
+ Extra memory due to padding: 180.00M (2.0x expansion)
110
+ XLA label: %fusion.2450.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2800, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7912, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
111
+ Allocation type: HLO temp
112
+ ==========================
113
+ 10. Size: 360.00M
114
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
115
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
116
+ Unpadded size: 180.00M
117
+ Extra memory due to padding: 180.00M (2.0x expansion)
118
+ XLA label: %fusion.2449.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2799, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7911, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
119
+ Allocation type: HLO temp
120
+ ==========================
121
+ 11. Size: 360.00M
122
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
123
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
124
+ Unpadded size: 180.00M
125
+ Extra memory due to padding: 180.00M (2.0x expansion)
126
+ XLA label: %fusion.2448.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2798, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7910, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
127
+ Allocation type: HLO temp
128
+ ==========================
129
+ 12. Size: 360.00M
130
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
131
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
132
+ Unpadded size: 180.00M
133
+ Extra memory due to padding: 180.00M (2.0x expansion)
134
+ XLA label: %fusion.2447.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2797, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7909, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
135
+ Allocation type: HLO temp
136
+ ==========================
137
+ 13. Size: 360.00M
138
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
139
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
140
+ Unpadded size: 180.00M
141
+ Extra memory due to padding: 180.00M (2.0x expansion)
142
+ XLA label: %fusion.2446.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2796, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7908, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
143
+ Allocation type: HLO temp
144
+ ==========================
145
+ 14. Size: 270.00M
146
+ Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=584
147
+ Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
148
+ Unpadded size: 135.00M
149
+ Extra memory due to padding: 135.00M (2.0x expansion)
150
+ XLA label: %fusion.2689.remat3 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} fusion(f32[4,60,64,192]{2,3,1,0:T(8,128)} %get-tuple-element.20556, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.14362, bf16[4,12,60,192,64]{3,2,4,1,0:T(8,128)(2,1)} %fusion.2964), kind=kO...
151
+ Allocation type: HLO temp
152
+ ==========================
153
+ 15. Size: 270.00M
154
+ Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=584
155
+ Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
156
+ Unpadded size: 135.00M
157
+ Extra memory due to padding: 135.00M (2.0x expansion)
158
+ XLA label: %fusion.2690.remat3 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} fusion(f32[4,60,64,192]{2,3,1,0:T(8,128)} %get-tuple-element.20556, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.14296, bf16[4,12,60,192,64]{3,2,4,1,0:T(8,128)(2,1)} %fusion.2962), kind=kO...
159
+ Allocation type: HLO temp
160
+ ==========================
161
+ 16. Size: 270.00M
162
+ Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=584
163
+ Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
164
+ Unpadded size: 135.00M
165
+ Extra memory due to padding: 135.00M (2.0x expansion)
166
+ XLA label: %fusion.2688.remat3 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} fusion(f32[4,60,64,192]{2,3,1,0:T(8,128)} %get-tuple-element.20556, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.14428, bf16[4,12,60,192,64]{3,2,4,1,0:T(8,128)(2,1)} %fusion.2966), kind=kO...
167
+ Allocation type: HLO temp
168
+ ==========================
169
+ 17. Size: 270.00M
170
+ Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=584
171
+ Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
172
+ Unpadded size: 135.00M
173
+ Extra memory due to padding: 135.00M (2.0x expansion)
174
+ XLA label: %fusion.2691.remat3 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} fusion(f32[4,60,64,192]{2,3,1,0:T(8,128)} %get-tuple-element.20556, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.14230, bf16[4,12,60,192,64]{3,2,4,1,0:T(8,128)(2,1)} %fusion.2960), kind=kO...
175
+ Allocation type: HLO temp
176
+ ==========================
177
+ 18. Size: 270.00M
178
+ Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=584
179
+ Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
180
+ Unpadded size: 135.00M
181
+ Extra memory due to padding: 135.00M (2.0x expansion)
182
+ XLA label: %fusion.2692.remat3 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} fusion(f32[4,60,64,192]{2,3,1,0:T(8,128)} %get-tuple-element.20556, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.14164, bf16[4,12,60,192,64]{3,2,4,1,0:T(8,128)(2,1)} %fusion.2958), kind=kO...
183
+ Allocation type: HLO temp
184
+ ==========================
185
+ 19. Size: 270.00M
186
+ Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=584
187
+ Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
188
+ Unpadded size: 135.00M
189
+ Extra memory due to padding: 135.00M (2.0x expansion)
190
+ XLA label: %fusion.2693.remat3 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} fusion(f32[4,60,64,192]{2,3,1,0:T(8,128)} %get-tuple-element.20556, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.14098, bf16[4,12,60,192,64]{3,2,4,1,0:T(8,128)(2,1)} %fusion.2956), kind=kO...
191
+ Allocation type: HLO temp
192
+ ==========================
193
+ 20. Size: 270.00M
194
+ Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
195
+ Unpadded size: 135.00M
196
+ Extra memory due to padding: 135.00M (2.0x expansion)
197
+ XLA label: %fusion.2616.remat_uncompressed = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} copy(f32[4,12,60,64,192]{4,3,2,1,0:T(8,128)} %fusion.2616.remat_compressed)
198
+ Allocation type: HLO temp
199
+ ==========================
200
+ The stack trace below excludes JAX-internal frames.
201
+ The preceding is the original exception that occurred, unmodified.
202
+ --------------------
203
+ The above exception was the direct cause of the following exception:
204
+ Traceback (most recent call last):
205
+ File "./run_mlm_flax.py", line 709, in <module>
206
+ state, train_metric, dropout_rngs = p_train_step(state, model_inputs, dropout_rngs)
207
+ File "/home/dat/pino/lib/python3.8/site-packages/jax/interpreters/xla.py", line 360, in backend_compile
208
+ return backend.compile(built_c, compile_options=options)
209
+ RuntimeError: Resource exhausted: Ran out of memory in memory space hbm. Used 20.61G of 15.48G hbm. Exceeded hbm capacity by 5.13G.
210
+ Total hbm usage >= 21.13G:
211
+ reserved 530.00M
212
+ program 20.61G
213
+ arguments 0B
214
+ Output size 0B; shares 0B with arguments.
215
+ Program hbm requirement 20.61G:
216
+ global 900.0K
217
+ scoped 924.0K
218
+ HLO temp 20.61G (63.0% utilization: Unpadded (12.43G) Padded (19.71G), 4.4% fragmentation (918.84M))
219
+ Largest program allocations in hbm:
220
+ 1. Size: 1.54G
221
+ Operator: op_type="dot_general" op_name="pmap(train_step)/dot_general[ dimension_numbers=(((2,), (0,)), ((), ()))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/pino/lib/python3.8/site-packages/flax/linen/linear.py" source_line=175
222
+ Shape: bf16[4,4096,50358]{1,2,0:T(8,128)(2,1)}
223
+ Unpadded size: 1.54G
224
+ Extra memory due to padding: 64.0K (1.0x expansion)
225
+ XLA label: %fusion.3615.remat4 = bf16[4,4096,50358]{1,2,0:T(8,128)(2,1)} fusion(bf16[50358,768]{1,0:T(8,128)(2,1)} %get-tuple-element.22628, f32[768]{0:T(1024)} %fusion.10158, f32[768]{0:T(1024)} %fusion.10159, f32[4,4096]{1,0:T(4,128)} %get-tuple-element.20129, f32[...
226
+ Allocation type: HLO temp
227
+ ==========================
228
+ 2. Size: 360.00M
229
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
230
+ Unpadded size: 180.00M
231
+ Extra memory due to padding: 180.00M (2.0x expansion)
232
+ XLA label: %fusion.2444.remat_uncompressed = bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)} copy(bf16[4,12,60,64,512]{4,3,2,1,0:T(8,128)(2,1)} %fusion.2444.remat_compressed)
233
+ Allocation type: HLO temp
234
+ ==========================
235
+ 3. Size: 360.00M
236
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
237
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
238
+ Unpadded size: 180.00M
239
+ Extra memory due to padding: 180.00M (2.0x expansion)
240
+ XLA label: %fusion.2454.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2804, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7916, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
241
+ Allocation type: HLO temp
242
+ ==========================
243
+ 4. Size: 360.00M
244
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
245
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
246
+ Unpadded size: 180.00M
247
+ Extra memory due to padding: 180.00M (2.0x expansion)
248
+ XLA label: %fusion.2453.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2803, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7915, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
249
+ Allocation type: HLO temp
250
+ ==========================
251
+ 5. Size: 360.00M
252
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
253
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
254
+ Unpadded size: 180.00M
255
+ Extra memory due to padding: 180.00M (2.0x expansion)
256
+ XLA label: %fusion.2452.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2802, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7914, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
257
+ Allocation type: HLO temp
258
+ ==========================
259
+ 6. Size: 360.00M
260
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
261
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
262
+ Unpadded size: 180.00M
263
+ Extra memory due to padding: 180.00M (2.0x expansion)
264
+ XLA label: %fusion.2451.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2801, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7913, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
265
+ Allocation type: HLO temp
266
+ ==========================
267
+ 7. Size: 360.00M
268
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
269
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
270
+ Unpadded size: 180.00M
271
+ Extra memory due to padding: 180.00M (2.0x expansion)
272
+ XLA label: %fusion.2445 = bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)} fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2795, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7907, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)} %get-tuple-element.20342, f32[4,12,60,64,192]{3,4,2,1...
273
+ Allocation type: HLO temp
274
+ ==========================
275
+ 8. Size: 360.00M
276
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
277
+ Unpadded size: 180.00M
278
+ Extra memory due to padding: 180.00M (2.0x expansion)
279
+ XLA label: %fusion.2443.remat_uncompressed = bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)} copy(bf16[4,12,60,64,512]{4,3,2,1,0:T(8,128)(2,1)} %fusion.2443.remat_compressed)
280
+ Allocation type: HLO temp
281
+ ==========================
282
+ 9. Size: 360.00M
283
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
284
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
285
+ Unpadded size: 180.00M
286
+ Extra memory due to padding: 180.00M (2.0x expansion)
287
+ XLA label: %fusion.2450.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2800, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7912, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
288
+ Allocation type: HLO temp
289
+ ==========================
290
+ 10. Size: 360.00M
291
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
292
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
293
+ Unpadded size: 180.00M
294
+ Extra memory due to padding: 180.00M (2.0x expansion)
295
+ XLA label: %fusion.2449.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2799, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7911, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
296
+ Allocation type: HLO temp
297
+ ==========================
298
+ 11. Size: 360.00M
299
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
300
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
301
+ Unpadded size: 180.00M
302
+ Extra memory due to padding: 180.00M (2.0x expansion)
303
+ XLA label: %fusion.2448.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2798, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7910, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
304
+ Allocation type: HLO temp
305
+ ==========================
306
+ 12. Size: 360.00M
307
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
308
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
309
+ Unpadded size: 180.00M
310
+ Extra memory due to padding: 180.00M (2.0x expansion)
311
+ XLA label: %fusion.2447.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2797, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7909, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
312
+ Allocation type: HLO temp
313
+ ==========================
314
+ 13. Size: 360.00M
315
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
316
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
317
+ Unpadded size: 180.00M
318
+ Extra memory due to padding: 180.00M (2.0x expansion)
319
+ XLA label: %fusion.2446.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2796, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7908, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
320
+ Allocation type: HLO temp
321
+ ==========================
322
+ 14. Size: 270.00M
323
+ Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=584
324
+ Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
325
+ Unpadded size: 135.00M
326
+ Extra memory due to padding: 135.00M (2.0x expansion)
327
+ XLA label: %fusion.2689.remat3 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} fusion(f32[4,60,64,192]{2,3,1,0:T(8,128)} %get-tuple-element.20556, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.14362, bf16[4,12,60,192,64]{3,2,4,1,0:T(8,128)(2,1)} %fusion.2964), kind=kO...
328
+ Allocation type: HLO temp
329
+ ==========================
330
+ 15. Size: 270.00M
331
+ Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=584
332
+ Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
333
+ Unpadded size: 135.00M
334
+ Extra memory due to padding: 135.00M (2.0x expansion)
335
+ XLA label: %fusion.2690.remat3 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} fusion(f32[4,60,64,192]{2,3,1,0:T(8,128)} %get-tuple-element.20556, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.14296, bf16[4,12,60,192,64]{3,2,4,1,0:T(8,128)(2,1)} %fusion.2962), kind=kO...
336
+ Allocation type: HLO temp
337
+ ==========================
338
+ 16. Size: 270.00M
339
+ Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=584
340
+ Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
341
+ Unpadded size: 135.00M
342
+ Extra memory due to padding: 135.00M (2.0x expansion)
343
+ XLA label: %fusion.2688.remat3 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} fusion(f32[4,60,64,192]{2,3,1,0:T(8,128)} %get-tuple-element.20556, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.14428, bf16[4,12,60,192,64]{3,2,4,1,0:T(8,128)(2,1)} %fusion.2966), kind=kO...
344
+ Allocation type: HLO temp
345
+ ==========================
346
+ 17. Size: 270.00M
347
+ Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=584
348
+ Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
349
+ Unpadded size: 135.00M
350
+ Extra memory due to padding: 135.00M (2.0x expansion)
351
+ XLA label: %fusion.2691.remat3 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} fusion(f32[4,60,64,192]{2,3,1,0:T(8,128)} %get-tuple-element.20556, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.14230, bf16[4,12,60,192,64]{3,2,4,1,0:T(8,128)(2,1)} %fusion.2960), kind=kO...
352
+ Allocation type: HLO temp
353
+ ==========================
354
+ 18. Size: 270.00M
355
+ Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=584
356
+ Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
357
+ Unpadded size: 135.00M
358
+ Extra memory due to padding: 135.00M (2.0x expansion)
359
+ XLA label: %fusion.2692.remat3 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} fusion(f32[4,60,64,192]{2,3,1,0:T(8,128)} %get-tuple-element.20556, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.14164, bf16[4,12,60,192,64]{3,2,4,1,0:T(8,128)(2,1)} %fusion.2958), kind=kO...
360
+ Allocation type: HLO temp
361
+ ==========================
362
+ 19. Size: 270.00M
363
+ Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=584
364
+ Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
365
+ Unpadded size: 135.00M
366
+ Extra memory due to padding: 135.00M (2.0x expansion)
367
+ XLA label: %fusion.2693.remat3 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} fusion(f32[4,60,64,192]{2,3,1,0:T(8,128)} %get-tuple-element.20556, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.14098, bf16[4,12,60,192,64]{3,2,4,1,0:T(8,128)(2,1)} %fusion.2956), kind=kO...
368
+ Allocation type: HLO temp
369
+ ==========================
370
+ 20. Size: 270.00M
371
+ Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
372
+ Unpadded size: 135.00M
373
+ Extra memory due to padding: 135.00M (2.0x expansion)
374
+ XLA label: %fusion.2616.remat_uncompressed = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} copy(f32[4,12,60,64,192]{4,3,2,1,0:T(8,128)} %fusion.2616.remat_compressed)
375
+ Allocation type: HLO temp
376
+ ==========================
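A quick back-of-the-envelope check of the buffer sizes reported in the allocation dump above (a sketch, not part of the original log). The recurring bf16[4,12,60,64,512] and f32[4,12,60,64,192] buffers work out to exactly the unpadded 180 MiB and 135 MiB the log reports; the padded 360 MiB / 270 MiB figures come from XLA's 2.0x TPU tile padding and are taken from the log rather than re-derived here.

from math import prod

def unpadded_mib(shape, bytes_per_element):
    # Dense array size in MiB, ignoring XLA's tile padding.
    return prod(shape) * bytes_per_element / 2**20

# bf16 (2 bytes/elem) attention buffers -> 180.0 MiB unpadded, 360 MiB padded per the log
print(unpadded_mib((4, 12, 60, 64, 512), 2))
# f32 (4 bytes/elem) einsum outputs -> 135.0 MiB unpadded, 270 MiB padded per the log
print(unpadded_mib((4, 12, 60, 64, 192), 4))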
wandb/run-20210713_005301-2ilkub1o/files/requirements.txt ADDED
@@ -0,0 +1,92 @@
1
+ absl-py==0.13.0
2
+ aiohttp==3.7.4.post0
3
+ astunparse==1.6.3
4
+ async-timeout==3.0.1
5
+ attrs==21.2.0
6
+ cachetools==4.2.2
7
+ certifi==2021.5.30
8
+ chardet==4.0.0
9
+ chex==0.0.8
10
+ click==8.0.1
11
+ configparser==5.0.2
12
+ cycler==0.10.0
13
+ datasets==1.9.1.dev0
14
+ dill==0.3.4
15
+ dm-tree==0.1.6
16
+ docker-pycreds==0.4.0
17
+ filelock==3.0.12
18
+ flatbuffers==1.12
19
+ flax==0.3.4
20
+ fsspec==2021.6.1
21
+ gast==0.4.0
22
+ gitdb==4.0.7
23
+ gitpython==3.1.18
24
+ google-auth-oauthlib==0.4.4
25
+ google-auth==1.32.1
26
+ google-pasta==0.2.0
27
+ grpcio==1.34.1
28
+ h5py==3.1.0
29
+ huggingface-hub==0.0.12
30
+ idna==2.10
31
+ jax==0.2.16
32
+ jaxlib==0.1.68
33
+ joblib==1.0.1
34
+ keras-nightly==2.5.0.dev2021032900
35
+ keras-preprocessing==1.1.2
36
+ kiwisolver==1.3.1
37
+ libtpu-nightly==0.1.dev20210615
38
+ markdown==3.3.4
39
+ matplotlib==3.4.2
40
+ msgpack==1.0.2
41
+ multidict==5.1.0
42
+ multiprocess==0.70.12.2
43
+ numpy==1.19.5
44
+ oauthlib==3.1.1
45
+ opt-einsum==3.3.0
46
+ optax==0.0.9
47
+ packaging==21.0
48
+ pandas==1.3.0
49
+ pathtools==0.1.2
50
+ pillow==8.3.1
51
+ pip==20.0.2
52
+ pkg-resources==0.0.0
53
+ promise==2.3
54
+ protobuf==3.17.3
55
+ psutil==5.8.0
56
+ pyarrow==4.0.1
57
+ pyasn1-modules==0.2.8
58
+ pyasn1==0.4.8
59
+ pyparsing==2.4.7
60
+ python-dateutil==2.8.1
61
+ pytz==2021.1
62
+ pyyaml==5.4.1
63
+ regex==2021.7.6
64
+ requests-oauthlib==1.3.0
65
+ requests==2.25.1
66
+ rsa==4.7.2
67
+ sacremoses==0.0.45
68
+ scipy==1.7.0
69
+ sentry-sdk==1.3.0
70
+ setuptools==44.0.0
71
+ shortuuid==1.0.1
72
+ six==1.15.0
73
+ smmap==4.0.0
74
+ subprocess32==3.5.4
75
+ tensorboard-data-server==0.6.1
76
+ tensorboard-plugin-wit==1.8.0
77
+ tensorboard==2.5.0
78
+ tensorflow-estimator==2.5.0
79
+ tensorflow==2.5.0
80
+ termcolor==1.1.0
81
+ tokenizers==0.10.3
82
+ toolz==0.11.1
83
+ tqdm==4.61.2
84
+ transformers==4.9.0.dev0
85
+ typing-extensions==3.7.4.3
86
+ urllib3==1.26.6
87
+ wandb==0.10.33
88
+ werkzeug==2.0.1
89
+ wheel==0.36.2
90
+ wrapt==1.12.1
91
+ xxhash==2.0.2
92
+ yarl==1.6.3
wandb/run-20210713_005301-2ilkub1o/files/wandb-metadata.json ADDED
@@ -0,0 +1,46 @@
1
+ {
2
+ "os": "Linux-5.4.0-1043-gcp-x86_64-with-glibc2.29",
3
+ "python": "3.8.10",
4
+ "heartbeatAt": "2021-07-13T00:53:03.462705",
5
+ "startedAt": "2021-07-13T00:53:01.400550",
6
+ "docker": null,
7
+ "cpu_count": 96,
8
+ "cuda": null,
9
+ "args": [
10
+ "--push_to_hub",
11
+ "--output_dir=./",
12
+ "--model_type=big_bird",
13
+ "--config_name=./",
14
+ "--tokenizer_name=./",
15
+ "--max_seq_length=4096",
16
+ "--weight_decay=0.0095",
17
+ "--warmup_steps=5000",
18
+ "--overwrite_output_dir",
19
+ "--adam_beta1=0.9",
20
+ "--adam_beta2=0.98",
21
+ "--logging_steps=500",
22
+ "--eval_steps=92768",
23
+ "--num_train_epochs=5",
24
+ "--preprocessing_num_workers=64",
25
+ "--save_steps=20000",
26
+ "--adafactor",
27
+ "--learning_rate=5e-5",
28
+ "--per_device_train_batch_size=4",
29
+ "--per_device_eval_batch_size=4",
30
+ "--save_total_limit=5",
31
+ "--dtype=bfloat16",
32
+ "--gradient_accumulation_steps=8"
33
+ ],
34
+ "state": "running",
35
+ "program": "./run_mlm_flax.py",
36
+ "codePath": "run_mlm_flax.py",
37
+ "git": {
38
+ "remote": "https://huggingface.co/flax-community/pino-roberta-base",
39
+ "commit": "4229c91b780cf07115cc6d04c16e393b0d2f508c"
40
+ },
41
+ "email": null,
42
+ "root": "/home/dat/pino-roberta-base",
43
+ "host": "t1v-n-f5c06ea1-w-0",
44
+ "username": "dat",
45
+ "executable": "/home/dat/pino/bin/python"
46
+ }
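Taken together, the args logged above imply the following effective batch for this run (a rough sketch; the 8-device count is an assumption for a TPU v3-8 and is not recorded in the metadata).

# Effective batch implied by the args in wandb-metadata.json above (run 2ilkub1o).
per_device_batch = 4        # --per_device_train_batch_size
grad_accum_steps = 8        # --gradient_accumulation_steps
seq_len = 4096              # --max_seq_length
devices = 8                 # assumed TPU v3-8 core count (not logged above)

sequences_per_update = per_device_batch * devices * grad_accum_steps   # 256 sequences
tokens_per_update = sequences_per_update * seq_len                     # 1,048,576 tokens
print(sequences_per_update, tokens_per_update)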
wandb/run-20210713_005301-2ilkub1o/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
1
+ {}
wandb/run-20210713_005301-2ilkub1o/logs/debug-internal.log ADDED
@@ -0,0 +1,168 @@
1
+ 2021-07-13 00:53:02,112 INFO MainThread:327506 [internal.py:wandb_internal():88] W&B internal server running at pid: 327506, started at: 2021-07-13 00:53:02.112234
2
+ 2021-07-13 00:53:02,114 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: check_version
3
+ 2021-07-13 00:53:02,114 INFO WriterThread:327506 [datastore.py:open_for_write():80] open: /home/dat/pino-roberta-base/wandb/run-20210713_005301-2ilkub1o/run-2ilkub1o.wandb
4
+ 2021-07-13 00:53:02,115 DEBUG SenderThread:327506 [sender.py:send():179] send: header
5
+ 2021-07-13 00:53:02,116 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: check_version
6
+ 2021-07-13 00:53:02,154 DEBUG SenderThread:327506 [sender.py:send():179] send: run
7
+ 2021-07-13 00:53:02,328 INFO SenderThread:327506 [dir_watcher.py:__init__():168] watching files in: /home/dat/pino-roberta-base/wandb/run-20210713_005301-2ilkub1o/files
8
+ 2021-07-13 00:53:02,329 INFO SenderThread:327506 [sender.py:_start_run_threads():716] run started: 2ilkub1o with start time 1626137581
9
+ 2021-07-13 00:53:02,345 DEBUG SenderThread:327506 [sender.py:send():179] send: summary
10
+ 2021-07-13 00:53:02,345 INFO SenderThread:327506 [sender.py:_save_file():841] saving file wandb-summary.json with policy end
11
+ 2021-07-13 00:53:02,346 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: run_start
12
+ 2021-07-13 00:53:03,330 INFO Thread-8 :327506 [dir_watcher.py:_on_file_created():216] file/dir created: /home/dat/pino-roberta-base/wandb/run-20210713_005301-2ilkub1o/files/wandb-summary.json
13
+ 2021-07-13 00:53:03,462 DEBUG HandlerThread:327506 [meta.py:__init__():39] meta init
14
+ 2021-07-13 00:53:03,462 DEBUG HandlerThread:327506 [meta.py:__init__():53] meta init done
15
+ 2021-07-13 00:53:03,462 DEBUG HandlerThread:327506 [meta.py:probe():210] probe
16
+ 2021-07-13 00:53:03,463 DEBUG HandlerThread:327506 [meta.py:_setup_git():200] setup git
17
+ 2021-07-13 00:53:03,492 DEBUG HandlerThread:327506 [meta.py:_setup_git():207] setup git done
18
+ 2021-07-13 00:53:03,492 DEBUG HandlerThread:327506 [meta.py:_save_pip():57] save pip
19
+ 2021-07-13 00:53:03,493 DEBUG HandlerThread:327506 [meta.py:_save_pip():71] save pip done
20
+ 2021-07-13 00:53:03,493 DEBUG HandlerThread:327506 [meta.py:probe():252] probe done
21
+ 2021-07-13 00:53:03,496 DEBUG SenderThread:327506 [sender.py:send():179] send: files
22
+ 2021-07-13 00:53:03,496 INFO SenderThread:327506 [sender.py:_save_file():841] saving file wandb-metadata.json with policy now
23
+ 2021-07-13 00:53:03,504 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: stop_status
24
+ 2021-07-13 00:53:03,504 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: stop_status
25
+ 2021-07-13 00:53:03,635 DEBUG SenderThread:327506 [sender.py:send():179] send: config
26
+ 2021-07-13 00:53:03,636 DEBUG SenderThread:327506 [sender.py:send():179] send: config
27
+ 2021-07-13 00:53:03,636 DEBUG SenderThread:327506 [sender.py:send():179] send: config
28
+ 2021-07-13 00:53:03,952 INFO Thread-11 :327506 [upload_job.py:push():137] Uploaded file /tmp/tmpi8r4kiyhwandb/3l6ji67i-wandb-metadata.json
29
+ 2021-07-13 00:53:04,330 INFO Thread-8 :327506 [dir_watcher.py:_on_file_created():216] file/dir created: /home/dat/pino-roberta-base/wandb/run-20210713_005301-2ilkub1o/files/wandb-metadata.json
30
+ 2021-07-13 00:53:04,330 INFO Thread-8 :327506 [dir_watcher.py:_on_file_created():216] file/dir created: /home/dat/pino-roberta-base/wandb/run-20210713_005301-2ilkub1o/files/requirements.txt
31
+ 2021-07-13 00:53:04,330 INFO Thread-8 :327506 [dir_watcher.py:_on_file_created():216] file/dir created: /home/dat/pino-roberta-base/wandb/run-20210713_005301-2ilkub1o/files/output.log
32
+ 2021-07-13 00:53:18,637 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: stop_status
33
+ 2021-07-13 00:53:18,637 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: stop_status
34
+ 2021-07-13 00:53:20,336 INFO Thread-8 :327506 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_005301-2ilkub1o/files/output.log
35
+ 2021-07-13 00:53:22,336 INFO Thread-8 :327506 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_005301-2ilkub1o/files/output.log
36
+ 2021-07-13 00:53:31,548 DEBUG SenderThread:327506 [sender.py:send():179] send: stats
37
+ 2021-07-13 00:53:33,340 INFO Thread-8 :327506 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_005301-2ilkub1o/files/config.yaml
38
+ 2021-07-13 00:53:33,769 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: stop_status
39
+ 2021-07-13 00:53:33,769 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: stop_status
40
+ 2021-07-13 00:53:48,899 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: stop_status
41
+ 2021-07-13 00:53:48,899 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: stop_status
42
+ 2021-07-13 00:54:01,629 DEBUG SenderThread:327506 [sender.py:send():179] send: stats
43
+ 2021-07-13 00:54:04,032 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: stop_status
44
+ 2021-07-13 00:54:04,032 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: stop_status
45
+ 2021-07-13 00:54:12,355 INFO Thread-8 :327506 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_005301-2ilkub1o/files/output.log
46
+ 2021-07-13 00:54:19,178 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: stop_status
47
+ 2021-07-13 00:54:19,179 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: stop_status
48
+ 2021-07-13 00:54:31,708 DEBUG SenderThread:327506 [sender.py:send():179] send: stats
49
+ 2021-07-13 00:54:34,599 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: stop_status
50
+ 2021-07-13 00:54:34,599 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: stop_status
51
+ 2021-07-13 00:54:49,798 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: stop_status
52
+ 2021-07-13 00:54:49,798 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: stop_status
53
+ 2021-07-13 00:55:01,792 DEBUG SenderThread:327506 [sender.py:send():179] send: stats
54
+ 2021-07-13 00:55:04,931 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: stop_status
55
+ 2021-07-13 00:55:04,931 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: stop_status
56
+ 2021-07-13 00:55:20,062 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: stop_status
57
+ 2021-07-13 00:55:20,062 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: stop_status
58
+ 2021-07-13 00:55:31,868 DEBUG SenderThread:327506 [sender.py:send():179] send: stats
59
+ 2021-07-13 00:55:35,203 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: stop_status
60
+ 2021-07-13 00:55:35,204 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: stop_status
61
+ 2021-07-13 00:55:44,391 INFO Thread-8 :327506 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_005301-2ilkub1o/files/output.log
62
+ 2021-07-13 00:55:45,566 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: poll_exit
63
+ 2021-07-13 00:55:45,567 DEBUG SenderThread:327506 [sender.py:send():179] send: telemetry
64
+ 2021-07-13 00:55:45,567 DEBUG SenderThread:327506 [sender.py:send():179] send: exit
65
+ 2021-07-13 00:55:45,567 INFO SenderThread:327506 [sender.py:send_exit():287] handling exit code: 1
66
+ 2021-07-13 00:55:45,567 INFO SenderThread:327506 [sender.py:send_exit():295] send defer
67
+ 2021-07-13 00:55:45,567 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: poll_exit
68
+ 2021-07-13 00:55:45,568 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: defer
69
+ 2021-07-13 00:55:45,568 INFO HandlerThread:327506 [handler.py:handle_request_defer():141] handle defer: 0
70
+ 2021-07-13 00:55:45,568 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: defer
71
+ 2021-07-13 00:55:45,568 INFO SenderThread:327506 [sender.py:send_request_defer():304] handle sender defer: 0
72
+ 2021-07-13 00:55:45,569 INFO SenderThread:327506 [sender.py:transition_state():308] send defer: 1
73
+ 2021-07-13 00:55:45,569 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: defer
74
+ 2021-07-13 00:55:45,569 INFO HandlerThread:327506 [handler.py:handle_request_defer():141] handle defer: 1
75
+ 2021-07-13 00:55:45,601 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: defer
76
+ 2021-07-13 00:55:45,601 INFO SenderThread:327506 [sender.py:send_request_defer():304] handle sender defer: 1
77
+ 2021-07-13 00:55:45,601 INFO SenderThread:327506 [sender.py:transition_state():308] send defer: 2
78
+ 2021-07-13 00:55:45,602 DEBUG SenderThread:327506 [sender.py:send():179] send: stats
79
+ 2021-07-13 00:55:45,602 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: defer
80
+ 2021-07-13 00:55:45,602 INFO HandlerThread:327506 [handler.py:handle_request_defer():141] handle defer: 2
81
+ 2021-07-13 00:55:45,602 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: defer
82
+ 2021-07-13 00:55:45,602 INFO SenderThread:327506 [sender.py:send_request_defer():304] handle sender defer: 2
83
+ 2021-07-13 00:55:45,602 INFO SenderThread:327506 [sender.py:transition_state():308] send defer: 3
84
+ 2021-07-13 00:55:45,603 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: defer
85
+ 2021-07-13 00:55:45,603 INFO HandlerThread:327506 [handler.py:handle_request_defer():141] handle defer: 3
86
+ 2021-07-13 00:55:45,603 DEBUG SenderThread:327506 [sender.py:send():179] send: summary
87
+ 2021-07-13 00:55:45,603 INFO SenderThread:327506 [sender.py:_save_file():841] saving file wandb-summary.json with policy end
88
+ 2021-07-13 00:55:45,603 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: defer
89
+ 2021-07-13 00:55:45,604 INFO SenderThread:327506 [sender.py:send_request_defer():304] handle sender defer: 3
90
+ 2021-07-13 00:55:45,604 INFO SenderThread:327506 [sender.py:transition_state():308] send defer: 4
91
+ 2021-07-13 00:55:45,604 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: defer
92
+ 2021-07-13 00:55:45,604 INFO HandlerThread:327506 [handler.py:handle_request_defer():141] handle defer: 4
93
+ 2021-07-13 00:55:45,604 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: defer
94
+ 2021-07-13 00:55:45,604 INFO SenderThread:327506 [sender.py:send_request_defer():304] handle sender defer: 4
95
+ 2021-07-13 00:55:45,670 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: poll_exit
96
+ 2021-07-13 00:55:45,784 INFO SenderThread:327506 [sender.py:transition_state():308] send defer: 5
97
+ 2021-07-13 00:55:45,784 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: poll_exit
98
+ 2021-07-13 00:55:45,785 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: defer
99
+ 2021-07-13 00:55:45,785 INFO HandlerThread:327506 [handler.py:handle_request_defer():141] handle defer: 5
100
+ 2021-07-13 00:55:45,785 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: defer
101
+ 2021-07-13 00:55:45,785 INFO SenderThread:327506 [sender.py:send_request_defer():304] handle sender defer: 5
102
+ 2021-07-13 00:55:45,786 INFO SenderThread:327506 [dir_watcher.py:finish():282] shutting down directory watcher
103
+ 2021-07-13 00:55:45,887 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: poll_exit
104
+ 2021-07-13 00:55:46,391 INFO Thread-8 :327506 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_005301-2ilkub1o/files/wandb-summary.json
105
+ 2021-07-13 00:55:46,392 INFO SenderThread:327506 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_005301-2ilkub1o/files/config.yaml
106
+ 2021-07-13 00:55:46,392 INFO SenderThread:327506 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_005301-2ilkub1o/files/output.log
107
+ 2021-07-13 00:55:46,392 INFO SenderThread:327506 [dir_watcher.py:finish():312] scan: /home/dat/pino-roberta-base/wandb/run-20210713_005301-2ilkub1o/files
108
+ 2021-07-13 00:55:46,392 INFO SenderThread:327506 [dir_watcher.py:finish():318] scan save: /home/dat/pino-roberta-base/wandb/run-20210713_005301-2ilkub1o/files/requirements.txt requirements.txt
109
+ 2021-07-13 00:55:46,392 INFO SenderThread:327506 [dir_watcher.py:finish():318] scan save: /home/dat/pino-roberta-base/wandb/run-20210713_005301-2ilkub1o/files/output.log output.log
110
+ 2021-07-13 00:55:46,392 INFO SenderThread:327506 [dir_watcher.py:finish():318] scan save: /home/dat/pino-roberta-base/wandb/run-20210713_005301-2ilkub1o/files/wandb-metadata.json wandb-metadata.json
111
+ 2021-07-13 00:55:46,392 INFO SenderThread:327506 [dir_watcher.py:finish():318] scan save: /home/dat/pino-roberta-base/wandb/run-20210713_005301-2ilkub1o/files/config.yaml config.yaml
112
+ 2021-07-13 00:55:46,392 INFO SenderThread:327506 [dir_watcher.py:finish():318] scan save: /home/dat/pino-roberta-base/wandb/run-20210713_005301-2ilkub1o/files/wandb-summary.json wandb-summary.json
113
+ 2021-07-13 00:55:46,393 INFO SenderThread:327506 [sender.py:transition_state():308] send defer: 6
114
+ 2021-07-13 00:55:46,393 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: poll_exit
115
+ 2021-07-13 00:55:46,403 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: defer
116
+ 2021-07-13 00:55:46,403 INFO HandlerThread:327506 [handler.py:handle_request_defer():141] handle defer: 6
117
+ 2021-07-13 00:55:46,405 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: defer
118
+ 2021-07-13 00:55:46,405 INFO SenderThread:327506 [sender.py:send_request_defer():304] handle sender defer: 6
119
+ 2021-07-13 00:55:46,405 INFO SenderThread:327506 [file_pusher.py:finish():177] shutting down file pusher
120
+ 2021-07-13 00:55:46,495 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: poll_exit
121
+ 2021-07-13 00:55:46,496 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: poll_exit
122
+ 2021-07-13 00:55:46,598 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: poll_exit
123
+ 2021-07-13 00:55:46,598 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: poll_exit
124
+ 2021-07-13 00:55:46,700 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: poll_exit
125
+ 2021-07-13 00:55:46,700 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: poll_exit
126
+ 2021-07-13 00:55:46,802 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: poll_exit
127
+ 2021-07-13 00:55:46,802 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: poll_exit
128
+ 2021-07-13 00:55:46,867 INFO Thread-14 :327506 [upload_job.py:push():137] Uploaded file /home/dat/pino-roberta-base/wandb/run-20210713_005301-2ilkub1o/files/config.yaml
129
+ 2021-07-13 00:55:46,874 INFO Thread-15 :327506 [upload_job.py:push():137] Uploaded file /home/dat/pino-roberta-base/wandb/run-20210713_005301-2ilkub1o/files/wandb-summary.json
130
+ 2021-07-13 00:55:46,876 INFO Thread-13 :327506 [upload_job.py:push():137] Uploaded file /home/dat/pino-roberta-base/wandb/run-20210713_005301-2ilkub1o/files/output.log
131
+ 2021-07-13 00:55:46,904 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: poll_exit
132
+ 2021-07-13 00:55:46,905 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: poll_exit
133
+ 2021-07-13 00:55:46,935 INFO Thread-12 :327506 [upload_job.py:push():137] Uploaded file /home/dat/pino-roberta-base/wandb/run-20210713_005301-2ilkub1o/files/requirements.txt
134
+ 2021-07-13 00:55:47,007 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: poll_exit
135
+ 2021-07-13 00:55:47,007 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: poll_exit
136
+ 2021-07-13 00:55:47,109 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: poll_exit
137
+ 2021-07-13 00:55:47,109 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: poll_exit
138
+ 2021-07-13 00:55:47,135 INFO Thread-7 :327506 [sender.py:transition_state():308] send defer: 7
139
+ 2021-07-13 00:55:47,136 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: defer
140
+ 2021-07-13 00:55:47,136 INFO HandlerThread:327506 [handler.py:handle_request_defer():141] handle defer: 7
141
+ 2021-07-13 00:55:47,136 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: defer
142
+ 2021-07-13 00:55:47,136 INFO SenderThread:327506 [sender.py:send_request_defer():304] handle sender defer: 7
143
+ 2021-07-13 00:55:47,211 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: poll_exit
144
+ 2021-07-13 00:55:47,415 INFO SenderThread:327506 [sender.py:transition_state():308] send defer: 8
145
+ 2021-07-13 00:55:47,416 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: poll_exit
146
+ 2021-07-13 00:55:47,416 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: defer
147
+ 2021-07-13 00:55:47,416 INFO HandlerThread:327506 [handler.py:handle_request_defer():141] handle defer: 8
148
+ 2021-07-13 00:55:47,416 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: defer
149
+ 2021-07-13 00:55:47,417 INFO SenderThread:327506 [sender.py:send_request_defer():304] handle sender defer: 8
150
+ 2021-07-13 00:55:47,417 INFO SenderThread:327506 [sender.py:transition_state():308] send defer: 9
151
+ 2021-07-13 00:55:47,417 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: defer
152
+ 2021-07-13 00:55:47,417 INFO HandlerThread:327506 [handler.py:handle_request_defer():141] handle defer: 9
153
+ 2021-07-13 00:55:47,417 DEBUG SenderThread:327506 [sender.py:send():179] send: final
154
+ 2021-07-13 00:55:47,417 DEBUG SenderThread:327506 [sender.py:send():179] send: footer
155
+ 2021-07-13 00:55:47,417 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: defer
156
+ 2021-07-13 00:55:47,418 INFO SenderThread:327506 [sender.py:send_request_defer():304] handle sender defer: 9
157
+ 2021-07-13 00:55:47,518 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: poll_exit
158
+ 2021-07-13 00:55:47,518 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: poll_exit
159
+ 2021-07-13 00:55:47,518 INFO SenderThread:327506 [file_pusher.py:join():182] waiting for file pusher
160
+ 2021-07-13 00:55:47,520 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: get_summary
161
+ 2021-07-13 00:55:47,521 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: sampled_history
162
+ 2021-07-13 00:55:47,521 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: shutdown
163
+ 2021-07-13 00:55:47,521 INFO HandlerThread:327506 [handler.py:finish():638] shutting down handler
164
+ 2021-07-13 00:55:48,418 INFO WriterThread:327506 [datastore.py:close():288] close: /home/dat/pino-roberta-base/wandb/run-20210713_005301-2ilkub1o/run-2ilkub1o.wandb
165
+ 2021-07-13 00:55:48,518 INFO SenderThread:327506 [sender.py:finish():945] shutting down sender
166
+ 2021-07-13 00:55:48,519 INFO SenderThread:327506 [file_pusher.py:finish():177] shutting down file pusher
167
+ 2021-07-13 00:55:48,519 INFO SenderThread:327506 [file_pusher.py:join():182] waiting for file pusher
168
+ 2021-07-13 00:55:48,521 INFO MainThread:327506 [internal.py:handle_exit():78] Internal process exited
wandb/run-20210713_005301-2ilkub1o/logs/debug.log ADDED
@@ -0,0 +1,127 @@
1
+ 2021-07-13 00:53:01,402 INFO MainThread:325900 [wandb_setup.py:_flush():69] setting env: {}
2
+ 2021-07-13 00:53:01,402 INFO MainThread:325900 [wandb_setup.py:_flush():69] setting login settings: {}
3
+ 2021-07-13 00:53:01,402 INFO MainThread:325900 [wandb_init.py:_log_setup():337] Logging user logs to /home/dat/pino-roberta-base/wandb/run-20210713_005301-2ilkub1o/logs/debug.log
4
+ 2021-07-13 00:53:01,402 INFO MainThread:325900 [wandb_init.py:_log_setup():338] Logging internal logs to /home/dat/pino-roberta-base/wandb/run-20210713_005301-2ilkub1o/logs/debug-internal.log
5
+ 2021-07-13 00:53:01,402 INFO MainThread:325900 [wandb_init.py:init():370] calling init triggers
6
+ 2021-07-13 00:53:01,402 INFO MainThread:325900 [wandb_init.py:init():375] wandb.init called with sweep_config: {}
7
+ config: {}
8
+ 2021-07-13 00:53:01,402 INFO MainThread:325900 [wandb_init.py:init():419] starting backend
9
+ 2021-07-13 00:53:01,402 INFO MainThread:325900 [backend.py:_multiprocessing_setup():70] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
10
+ 2021-07-13 00:53:01,457 INFO MainThread:325900 [backend.py:ensure_launched():135] starting backend process...
11
+ 2021-07-13 00:53:01,509 INFO MainThread:325900 [backend.py:ensure_launched():139] started backend process with pid: 327506
12
+ 2021-07-13 00:53:01,511 INFO MainThread:325900 [wandb_init.py:init():424] backend started and connected
13
+ 2021-07-13 00:53:01,514 INFO MainThread:325900 [wandb_init.py:init():472] updated telemetry
14
+ 2021-07-13 00:53:01,515 INFO MainThread:325900 [wandb_init.py:init():491] communicating current version
15
+ 2021-07-13 00:53:02,153 INFO MainThread:325900 [wandb_init.py:init():496] got version response
16
+ 2021-07-13 00:53:02,153 INFO MainThread:325900 [wandb_init.py:init():504] communicating run to backend with 30 second timeout
17
+ 2021-07-13 00:53:02,345 INFO MainThread:325900 [wandb_init.py:init():529] starting run threads in backend
18
+ 2021-07-13 00:53:03,501 INFO MainThread:325900 [wandb_run.py:_console_start():1623] atexit reg
19
+ 2021-07-13 00:53:03,501 INFO MainThread:325900 [wandb_run.py:_redirect():1497] redirect: SettingsConsole.REDIRECT
20
+ 2021-07-13 00:53:03,502 INFO MainThread:325900 [wandb_run.py:_redirect():1502] Redirecting console.
21
+ 2021-07-13 00:53:03,504 INFO MainThread:325900 [wandb_run.py:_redirect():1558] Redirects installed.
22
+ 2021-07-13 00:53:03,504 INFO MainThread:325900 [wandb_init.py:init():554] run started, returning control to user process
23
+ 2021-07-13 00:53:03,510 INFO MainThread:325900 [wandb_run.py:_config_callback():872] config_cb None None {'output_dir': './', 'overwrite_output_dir': True, 'do_train': False, 'do_eval': False, 'do_predict': False, 'evaluation_strategy': 'IntervalStrategy.NO', 'prediction_loss_only': False, 'per_device_train_batch_size': 4, 'per_device_eval_batch_size': 4, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 8, 'eval_accumulation_steps': None, 'learning_rate': 5e-05, 'weight_decay': 0.0095, 'adam_beta1': 0.9, 'adam_beta2': 0.98, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 5.0, 'max_steps': -1, 'lr_scheduler_type': 'SchedulerType.LINEAR', 'warmup_ratio': 0.0, 'warmup_steps': 5000, 'log_level': -1, 'log_level_replica': -1, 'log_on_each_node': True, 'logging_dir': './runs/Jul13_00-52-13_t1v-n-f5c06ea1-w-0', 'logging_strategy': 'IntervalStrategy.STEPS', 'logging_first_step': False, 'logging_steps': 500, 'save_strategy': 'IntervalStrategy.STEPS', 'save_steps': 20000, 'save_total_limit': 5, 'save_on_each_node': False, 'no_cuda': False, 'seed': 42, 'fp16': False, 'fp16_opt_level': 'O1', 'fp16_backend': 'auto', 'fp16_full_eval': False, 'local_rank': -1, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 92768, 'dataloader_num_workers': 0, 'past_index': -1, 'run_name': './', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'sharded_ddp': [], 'deepspeed': None, 'label_smoothing_factor': 0.0, 'adafactor': True, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['tensorboard', 'wandb'], 'ddp_find_unused_parameters': None, 'dataloader_pin_memory': True, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': True, 'resume_from_checkpoint': None, 'push_to_hub_model_id': '', 'push_to_hub_organization': None, 'push_to_hub_token': None, 'mp_parameters': ''}
24
+ 2021-07-13 00:53:03,512 INFO MainThread:325900 [wandb_run.py:_config_callback():872] config_cb None None {'model_name_or_path': None, 'model_type': 'big_bird', 'config_name': './', 'tokenizer_name': './', 'cache_dir': None, 'use_fast_tokenizer': True, 'dtype': 'bfloat16'}
25
+ 2021-07-13 00:53:03,513 INFO MainThread:325900 [wandb_run.py:_config_callback():872] config_cb None None {'dataset_name': None, 'dataset_config_name': None, 'train_file': None, 'validation_file': None, 'train_ref_file': None, 'validation_ref_file': None, 'overwrite_cache': False, 'validation_split_percentage': 5, 'max_seq_length': 4096, 'preprocessing_num_workers': 64, 'mlm_probability': 0.15, 'pad_to_max_length': False, 'line_by_line': False}
26
+ 2021-07-13 00:55:43,384 INFO MainThread:325900 [wandb_run.py:_atexit_cleanup():1593] got exitcode: 1
27
+ 2021-07-13 00:55:43,385 INFO MainThread:325900 [wandb_run.py:_restore():1565] restore
28
+ 2021-07-13 00:55:45,569 INFO MainThread:325900 [wandb_run.py:_wait_for_finish():1715] got exit ret: file_counts {
29
+ wandb_count: 1
30
+ }
31
+ pusher_stats {
32
+ uploaded_bytes: 1417
33
+ total_bytes: 1417
34
+ }
35
+
36
+ 2021-07-13 00:55:45,785 INFO MainThread:325900 [wandb_run.py:_wait_for_finish():1715] got exit ret: file_counts {
37
+ wandb_count: 1
38
+ }
39
+ pusher_stats {
40
+ uploaded_bytes: 1417
41
+ total_bytes: 1417
42
+ }
43
+
44
+ 2021-07-13 00:55:46,394 INFO MainThread:325900 [wandb_run.py:_wait_for_finish():1715] got exit ret: file_counts {
45
+ wandb_count: 4
46
+ }
47
+ pusher_stats {
48
+ uploaded_bytes: 1417
49
+ total_bytes: 40394
50
+ }
51
+
52
+ 2021-07-13 00:55:46,496 INFO MainThread:325900 [wandb_run.py:_wait_for_finish():1715] got exit ret: file_counts {
53
+ wandb_count: 5
54
+ }
55
+ pusher_stats {
56
+ uploaded_bytes: 1417
57
+ total_bytes: 40396
58
+ }
59
+
60
+ 2021-07-13 00:55:46,598 INFO MainThread:325900 [wandb_run.py:_wait_for_finish():1715] got exit ret: file_counts {
61
+ wandb_count: 5
62
+ }
63
+ pusher_stats {
64
+ uploaded_bytes: 40396
65
+ total_bytes: 40396
66
+ }
67
+
68
+ 2021-07-13 00:55:46,701 INFO MainThread:325900 [wandb_run.py:_wait_for_finish():1715] got exit ret: file_counts {
69
+ wandb_count: 5
70
+ }
71
+ pusher_stats {
72
+ uploaded_bytes: 40396
73
+ total_bytes: 40396
74
+ }
75
+
76
+ 2021-07-13 00:55:46,803 INFO MainThread:325900 [wandb_run.py:_wait_for_finish():1715] got exit ret: file_counts {
77
+ wandb_count: 5
78
+ }
79
+ pusher_stats {
80
+ uploaded_bytes: 40396
81
+ total_bytes: 40396
82
+ }
83
+
84
+ 2021-07-13 00:55:46,905 INFO MainThread:325900 [wandb_run.py:_wait_for_finish():1715] got exit ret: file_counts {
85
+ wandb_count: 5
86
+ }
87
+ pusher_stats {
88
+ uploaded_bytes: 40396
89
+ total_bytes: 40396
90
+ }
91
+
92
+ 2021-07-13 00:55:47,008 INFO MainThread:325900 [wandb_run.py:_wait_for_finish():1715] got exit ret: file_counts {
93
+ wandb_count: 5
94
+ }
95
+ pusher_stats {
96
+ uploaded_bytes: 40396
97
+ total_bytes: 40396
98
+ }
99
+
100
+ 2021-07-13 00:55:47,109 INFO MainThread:325900 [wandb_run.py:_wait_for_finish():1715] got exit ret: file_counts {
101
+ wandb_count: 5
102
+ }
103
+ pusher_stats {
104
+ uploaded_bytes: 40396
105
+ total_bytes: 40396
106
+ }
107
+
108
+ 2021-07-13 00:55:47,416 INFO MainThread:325900 [wandb_run.py:_wait_for_finish():1715] got exit ret: file_counts {
109
+ wandb_count: 5
110
+ }
111
+ pusher_stats {
112
+ uploaded_bytes: 40396
113
+ total_bytes: 40396
114
+ }
115
+
116
+ 2021-07-13 00:55:47,519 INFO MainThread:325900 [wandb_run.py:_wait_for_finish():1715] got exit ret: done: true
117
+ exit_result {
118
+ }
119
+ file_counts {
120
+ wandb_count: 5
121
+ }
122
+ pusher_stats {
123
+ uploaded_bytes: 40396
124
+ total_bytes: 40396
125
+ }
126
+
127
+ 2021-07-13 00:55:48,779 INFO MainThread:325900 [wandb_run.py:_show_files():1937] logging synced files
wandb/run-20210713_005301-2ilkub1o/run-2ilkub1o.wandb ADDED
Binary file (37.4 kB).
wandb/run-20210713_005751-1wnn0lyf/files/config.yaml ADDED
@@ -0,0 +1,304 @@
1
+ wandb_version: 1
2
+
3
+ _wandb:
4
+ desc: null
5
+ value:
6
+ cli_version: 0.10.33
7
+ framework: huggingface
8
+ huggingface_version: 4.9.0.dev0
9
+ is_jupyter_run: false
10
+ is_kaggle_kernel: false
11
+ python_version: 3.8.10
12
+ t:
13
+ 1:
14
+ - 3
15
+ - 11
16
+ 4: 3.8.10
17
+ 5: 0.10.33
18
+ 6: 4.9.0.dev0
19
+ 8:
20
+ - 5
21
+ adafactor:
22
+ desc: null
23
+ value: false
24
+ adam_beta1:
25
+ desc: null
26
+ value: 0.9
27
+ adam_beta2:
28
+ desc: null
29
+ value: 0.98
30
+ adam_epsilon:
31
+ desc: null
32
+ value: 1.0e-08
33
+ cache_dir:
34
+ desc: null
35
+ value: null
36
+ config_name:
37
+ desc: null
38
+ value: ./
39
+ dataloader_drop_last:
40
+ desc: null
41
+ value: false
42
+ dataloader_num_workers:
43
+ desc: null
44
+ value: 0
45
+ dataloader_pin_memory:
46
+ desc: null
47
+ value: true
48
+ dataset_config_name:
49
+ desc: null
50
+ value: null
51
+ dataset_name:
52
+ desc: null
53
+ value: null
54
+ ddp_find_unused_parameters:
55
+ desc: null
56
+ value: null
57
+ debug:
58
+ desc: null
59
+ value: []
60
+ deepspeed:
61
+ desc: null
62
+ value: null
63
+ disable_tqdm:
64
+ desc: null
65
+ value: false
66
+ do_eval:
67
+ desc: null
68
+ value: false
69
+ do_predict:
70
+ desc: null
71
+ value: false
72
+ do_train:
73
+ desc: null
74
+ value: false
75
+ dtype:
76
+ desc: null
77
+ value: bfloat16
78
+ eval_accumulation_steps:
79
+ desc: null
80
+ value: null
81
+ eval_steps:
82
+ desc: null
83
+ value: 92768
84
+ evaluation_strategy:
85
+ desc: null
86
+ value: IntervalStrategy.NO
87
+ fp16:
88
+ desc: null
89
+ value: false
90
+ fp16_backend:
91
+ desc: null
92
+ value: auto
93
+ fp16_full_eval:
94
+ desc: null
95
+ value: false
96
+ fp16_opt_level:
97
+ desc: null
98
+ value: O1
99
+ gradient_accumulation_steps:
100
+ desc: null
101
+ value: 1
102
+ greater_is_better:
103
+ desc: null
104
+ value: null
105
+ group_by_length:
106
+ desc: null
107
+ value: false
108
+ ignore_data_skip:
109
+ desc: null
110
+ value: false
111
+ label_names:
112
+ desc: null
113
+ value: null
114
+ label_smoothing_factor:
115
+ desc: null
116
+ value: 0.0
117
+ learning_rate:
118
+ desc: null
119
+ value: 5.0e-05
120
+ length_column_name:
121
+ desc: null
122
+ value: length
123
+ line_by_line:
124
+ desc: null
125
+ value: false
126
+ load_best_model_at_end:
127
+ desc: null
128
+ value: false
129
+ local_rank:
130
+ desc: null
131
+ value: -1
132
+ log_level:
133
+ desc: null
134
+ value: -1
135
+ log_level_replica:
136
+ desc: null
137
+ value: -1
138
+ log_on_each_node:
139
+ desc: null
140
+ value: true
141
+ logging_dir:
142
+ desc: null
143
+ value: ./runs/Jul13_00-57-01_t1v-n-f5c06ea1-w-0
144
+ logging_first_step:
145
+ desc: null
146
+ value: false
147
+ logging_steps:
148
+ desc: null
149
+ value: 500
150
+ logging_strategy:
151
+ desc: null
152
+ value: IntervalStrategy.STEPS
153
+ lr_scheduler_type:
154
+ desc: null
155
+ value: SchedulerType.LINEAR
156
+ max_grad_norm:
157
+ desc: null
158
+ value: 1.0
159
+ max_seq_length:
160
+ desc: null
161
+ value: 4096
162
+ max_steps:
163
+ desc: null
164
+ value: -1
165
+ metric_for_best_model:
166
+ desc: null
167
+ value: null
168
+ mlm_probability:
169
+ desc: null
170
+ value: 0.15
171
+ model_name_or_path:
172
+ desc: null
173
+ value: null
174
+ model_type:
175
+ desc: null
176
+ value: big_bird
177
+ mp_parameters:
178
+ desc: null
179
+ value: ''
180
+ no_cuda:
181
+ desc: null
182
+ value: false
183
+ num_train_epochs:
184
+ desc: null
185
+ value: 5.0
186
+ output_dir:
187
+ desc: null
188
+ value: ./
189
+ overwrite_cache:
190
+ desc: null
191
+ value: false
192
+ overwrite_output_dir:
193
+ desc: null
194
+ value: true
195
+ pad_to_max_length:
196
+ desc: null
197
+ value: false
198
+ past_index:
199
+ desc: null
200
+ value: -1
201
+ per_device_eval_batch_size:
202
+ desc: null
203
+ value: 4
204
+ per_device_train_batch_size:
205
+ desc: null
206
+ value: 4
207
+ per_gpu_eval_batch_size:
208
+ desc: null
209
+ value: null
210
+ per_gpu_train_batch_size:
211
+ desc: null
212
+ value: null
213
+ prediction_loss_only:
214
+ desc: null
215
+ value: false
216
+ preprocessing_num_workers:
217
+ desc: null
218
+ value: 64
219
+ push_to_hub:
220
+ desc: null
221
+ value: true
222
+ push_to_hub_model_id:
223
+ desc: null
224
+ value: ''
225
+ push_to_hub_organization:
226
+ desc: null
227
+ value: null
228
+ push_to_hub_token:
229
+ desc: null
230
+ value: null
231
+ remove_unused_columns:
232
+ desc: null
233
+ value: true
234
+ report_to:
235
+ desc: null
236
+ value:
237
+ - tensorboard
238
+ - wandb
239
+ resume_from_checkpoint:
240
+ desc: null
241
+ value: null
242
+ run_name:
243
+ desc: null
244
+ value: ./
245
+ save_on_each_node:
246
+ desc: null
247
+ value: false
248
+ save_steps:
249
+ desc: null
250
+ value: 20000
251
+ save_strategy:
252
+ desc: null
253
+ value: IntervalStrategy.STEPS
254
+ save_total_limit:
255
+ desc: null
256
+ value: 5
257
+ seed:
258
+ desc: null
259
+ value: 42
260
+ sharded_ddp:
261
+ desc: null
262
+ value: []
263
+ skip_memory_metrics:
264
+ desc: null
265
+ value: true
266
+ tokenizer_name:
267
+ desc: null
268
+ value: ./
269
+ tpu_metrics_debug:
270
+ desc: null
271
+ value: false
272
+ tpu_num_cores:
273
+ desc: null
274
+ value: null
275
+ train_file:
276
+ desc: null
277
+ value: null
278
+ train_ref_file:
279
+ desc: null
280
+ value: null
281
+ use_fast_tokenizer:
282
+ desc: null
283
+ value: true
284
+ use_legacy_prediction_loop:
285
+ desc: null
286
+ value: false
287
+ validation_file:
288
+ desc: null
289
+ value: null
290
+ validation_ref_file:
291
+ desc: null
292
+ value: null
293
+ validation_split_percentage:
294
+ desc: null
295
+ value: 5
296
+ warmup_ratio:
297
+ desc: null
298
+ value: 0.0
299
+ warmup_steps:
300
+ desc: null
301
+ value: 5000
302
+ weight_decay:
303
+ desc: null
304
+ value: 0.0095
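For reference, the largest allocation in the OOM dump that follows (output.log for run 1wnn0lyf) is a single bf16[4,4096,50358] buffer, presumably the per-device MLM logits (batch 4 × 4,096 tokens × ~50k vocab). A quick size check, not part of the original log:

# Size of the 1.54G allocation reported in the OOM dump below.
# bf16[4, 4096, 50358]: presumably (per-device batch, seq_len, vocab_size) logits.
elements = 4 * 4096 * 50358
gib = elements * 2 / 2**30          # bf16 = 2 bytes per element
print(round(gib, 2))                # ~1.54, matching the log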
wandb/run-20210713_005751-1wnn0lyf/files/output.log ADDED
@@ -0,0 +1,216 @@
1
+ /home/dat/pino/lib/python3.8/site-packages/jax/lib/xla_bridge.py:382: UserWarning: jax.host_count has been renamed to jax.process_count. This alias will eventually be removed; please update your code.
2
+ warnings.warn(
3
+ /home/dat/pino/lib/python3.8/site-packages/jax/lib/xla_bridge.py:369: UserWarning: jax.host_id has been renamed to jax.process_index. This alias will eventually be removed; please update your code.
4
+ warnings.warn(
5
+ Epoch ... (1/5): 0%| | 0/5 [00:00<?, ?it/s]
6
+ Traceback (most recent call last): | 0/46383 [00:00<?, ?it/s]
7
+ File "/usr/lib/python3.8/threading.py", line 932, in _bootstrap_inner
8
+ self.run()
9
+ File "/usr/lib/python3.8/threading.py", line 870, in run
10
+ self._target(*self._args, **self._kwargs)
11
+ File "/home/dat/pino/lib/python3.8/site-packages/wandb/sdk/wandb_run.py", line 183, in check_network_status
12
+ status_response = self._interface.communicate_network_status()
13
+ File "/home/dat/pino/lib/python3.8/site-packages/wandb/sdk/interface/interface.py", line 755, in communicate_network_status
14
+ resp = self._communicate(req, timeout=timeout, local=True)
15
+ File "/home/dat/pino/lib/python3.8/site-packages/wandb/sdk/interface/interface.py", line 545, in _communicate
16
+ return self._communicate_async(rec, local=local).get(timeout=timeout)
17
+ File "/home/dat/pino/lib/python3.8/site-packages/wandb/sdk/interface/interface.py", line 550, in _communicate_async
18
+ raise Exception("The wandb backend process has shutdown")
19
+ Exception: The wandb backend process has shutdown
20
+ Training...: 0%| | 0/46383 [01:25<?, ?it/s]
21
+ Epoch ... (1/5): 0%| | 0/5 [02:13<?, ?it/s]
22
+ Traceback (most recent call last):
23
+ File "/home/dat/pino/lib/python3.8/site-packages/jax/_src/traceback_util.py", line 183, in reraise_with_filtered_traceback
24
+ return fun(*args, **kwargs)
25
+ File "/home/dat/pino/lib/python3.8/site-packages/jax/_src/api.py", line 1647, in f_pmapped
26
+ out = pxla.xla_pmap(
27
+ File "/home/dat/pino/lib/python3.8/site-packages/jax/core.py", line 1620, in bind
28
+ return call_bind(self, fun, *args, **params)
29
+ File "/home/dat/pino/lib/python3.8/site-packages/jax/core.py", line 1551, in call_bind
30
+ outs = primitive.process(top_trace, fun, tracers, params)
31
+ File "/home/dat/pino/lib/python3.8/site-packages/jax/core.py", line 1623, in process
32
+ return trace.process_map(self, fun, tracers, params)
33
+ File "/home/dat/pino/lib/python3.8/site-packages/jax/core.py", line 606, in process_call
34
+ return primitive.impl(f, *tracers, **params)
35
+ File "/home/dat/pino/lib/python3.8/site-packages/jax/interpreters/pxla.py", line 624, in xla_pmap_impl
36
+ compiled_fun, fingerprint = parallel_callable(fun, backend, axis_name, axis_size,
37
+ File "/home/dat/pino/lib/python3.8/site-packages/jax/linear_util.py", line 262, in memoized_fun
38
+ ans = call(fun, *args)
39
+ File "/home/dat/pino/lib/python3.8/site-packages/jax/interpreters/pxla.py", line 899, in parallel_callable
40
+ compiled = xla.backend_compile(backend, built, compile_options)
41
+ File "/home/dat/pino/lib/python3.8/site-packages/jax/interpreters/xla.py", line 360, in backend_compile
42
+ return backend.compile(built_c, compile_options=options)
43
+ RuntimeError: Resource exhausted: Ran out of memory in memory space hbm. Used 20.30G of 15.48G hbm. Exceeded hbm capacity by 4.82G.
44
+ Total hbm usage >= 20.82G:
45
+ reserved 530.00M
46
+ program 20.30G
47
+ arguments 0B
48
+ Output size 0B; shares 0B with arguments.
49
+ Program hbm requirement 20.30G:
50
+ global 660.0K
51
+ scoped 125.0K
52
+ HLO temp 20.30G (63.5% utilization: Unpadded (12.44G) Padded (19.60G), 3.5% fragmentation (717.54M))
53
+ Largest program allocations in hbm:
54
+ 1. Size: 1.54G
55
+ Operator: op_type="dot_general" op_name="pmap(train_step)/dot_general[ dimension_numbers=(((2,), (0,)), ((), ()))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/pino/lib/python3.8/site-packages/flax/linen/linear.py" source_line=175
56
+ Shape: bf16[4,4096,50358]{1,2,0:T(8,128)(2,1)}
57
+ Unpadded size: 1.54G
58
+ Extra memory due to padding: 64.0K (1.0x expansion)
59
+ XLA label: %fusion.1304.remat4 = bf16[4,4096,50358]{1,2,0:T(8,128)(2,1)} fusion(bf16[50358,768]{1,0:T(8,128)(2,1)} %copy.16213, f32[768]{0:T(1024)} %fusion.8859, f32[768]{0:T(1024)} %fusion.8860, f32[4,4096]{1,0:T(4,128)} %get-tuple-element.16597, f32[4,4096]{1,0:T(4...
60
+ Allocation type: HLO temp
61
+ ==========================
62
+ 2. Size: 360.00M
63
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
64
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
65
+ Unpadded size: 180.00M
66
+ Extra memory due to padding: 180.00M (2.0x expansion)
67
+ XLA label: %fusion.135 = bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)} fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.485, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.5710, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)} %get-tuple-element.16812, f32[4,12,60,64,192]{3,4,2,1,0...
68
+ Allocation type: HLO temp
69
+ ==========================
70
+ 3. Size: 360.00M
71
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
72
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
73
+ Unpadded size: 180.00M
74
+ Extra memory due to padding: 180.00M (2.0x expansion)
75
+ XLA label: %fusion.144.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.494, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.5719, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)} ...
76
+ Allocation type: HLO temp
77
+ ==========================
78
+ 4. Size: 360.00M
79
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
80
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
81
+ Unpadded size: 180.00M
82
+ Extra memory due to padding: 180.00M (2.0x expansion)
83
+ XLA label: %fusion.143.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.493, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.5718, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)} ...
84
+ Allocation type: HLO temp
85
+ ==========================
86
+ 5. Size: 360.00M
87
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
88
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
89
+ Unpadded size: 180.00M
90
+ Extra memory due to padding: 180.00M (2.0x expansion)
91
+ XLA label: %fusion.142.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.492, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.5717, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)} ...
92
+ Allocation type: HLO temp
93
+ ==========================
94
+ 6. Size: 360.00M
95
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
96
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
97
+ Unpadded size: 180.00M
98
+ Extra memory due to padding: 180.00M (2.0x expansion)
99
+ XLA label: %fusion.141.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.491, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.5716, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)} ...
100
+ Allocation type: HLO temp
101
+ ==========================
102
+ 7. Size: 360.00M
103
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
104
+ Unpadded size: 180.00M
105
+ Extra memory due to padding: 180.00M (2.0x expansion)
106
+ XLA label: %fusion.134.remat_uncompressed = bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)} copy(bf16[4,12,60,64,512]{4,3,2,1,0:T(8,128)(2,1)} %fusion.134.remat_compressed)
107
+ Allocation type: HLO temp
108
+ ==========================
109
+ 8. Size: 360.00M
110
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
111
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
112
+ Unpadded size: 180.00M
113
+ Extra memory due to padding: 180.00M (2.0x expansion)
114
+ XLA label: %fusion.140.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.490, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.5715, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)} ...
115
+ Allocation type: HLO temp
116
+ ==========================
117
+ 9. Size: 360.00M
118
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
119
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
120
+ Unpadded size: 180.00M
121
+ Extra memory due to padding: 180.00M (2.0x expansion)
122
+ XLA label: %fusion.139.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.489, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.5714, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)} ...
123
+ Allocation type: HLO temp
124
+ ==========================
125
+ 10. Size: 360.00M
126
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
127
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
128
+ Unpadded size: 180.00M
129
+ Extra memory due to padding: 180.00M (2.0x expansion)
130
+ XLA label: %fusion.138.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.488, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.5713, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)} ...
131
+ Allocation type: HLO temp
132
+ ==========================
133
+ 11. Size: 360.00M
134
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
135
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
136
+ Unpadded size: 180.00M
137
+ Extra memory due to padding: 180.00M (2.0x expansion)
138
+ XLA label: %fusion.137.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.487, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.5712, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)} ...
139
+ Allocation type: HLO temp
140
+ ==========================
141
+ 12. Size: 360.00M
142
+ Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
143
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
144
+ Unpadded size: 180.00M
145
+ Extra memory due to padding: 180.00M (2.0x expansion)
146
+ XLA label: %fusion.136.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.486, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.5711, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)} ...
147
+ Allocation type: HLO temp
148
+ ==========================
149
+ 13. Size: 360.00M
150
+ Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
151
+ Unpadded size: 180.00M
152
+ Extra memory due to padding: 180.00M (2.0x expansion)
153
+ XLA label: %fusion.133.remat_uncompressed = bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)} copy(bf16[4,12,60,64,512]{4,3,2,1,0:T(8,128)(2,1)} %fusion.133.remat_compressed)
154
+ Allocation type: HLO temp
155
+ ==========================
156
+ 14. Size: 270.00M
157
+ Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=584
158
+ Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
159
+ Unpadded size: 135.00M
160
+ Extra memory due to padding: 135.00M (2.0x expansion)
161
+ XLA label: %fusion.378.remat5 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} fusion(f32[4,60,64,192]{2,3,1,0:T(8,128)} %get-tuple-element.17038, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.14428, bf16[4,12,60,192,64]{3,2,4,1,0:T(8,128)(2,1)} %fusion.655), kind=kOut...
162
+ Allocation type: HLO temp
163
+ ==========================
164
+ 15. Size: 270.00M
165
+ Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
166
+ Unpadded size: 135.00M
167
+ Extra memory due to padding: 135.00M (2.0x expansion)
168
+ XLA label: %fusion.310.remat_uncompressed = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} copy(f32[4,12,60,64,192]{4,3,2,1,0:T(8,128)} %fusion.310.remat_compressed)
169
+ Allocation type: HLO temp
170
+ ==========================
171
+ 16. Size: 270.00M
172
+ Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=584
173
+ Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
174
+ Unpadded size: 135.00M
175
+ Extra memory due to padding: 135.00M (2.0x expansion)
176
+ XLA label: %fusion.386.remat6 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} fusion(f32[4,60,64,192]{2,3,1,0:T(8,128)} %get-tuple-element.17038, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.13900, bf16[4,12,60,192,64]{3,2,4,1,0:T(8,128)(2,1)} %fusion.639), kind=kOut...
177
+ Allocation type: HLO temp
178
+ ==========================
179
+ 17. Size: 270.00M
180
+ Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
181
+ Unpadded size: 135.00M
182
+ Extra memory due to padding: 135.00M (2.0x expansion)
183
+ XLA label: %fusion.326.remat_uncompressed.remat2 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} copy(f32[4,12,60,64,192]{4,3,2,1,0:T(8,128)} %fusion.326.remat_compressed)
184
+ Allocation type: HLO temp
185
+ ==========================
186
+ 18. Size: 270.00M
187
+ Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=591
188
+ Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
189
+ Unpadded size: 135.00M
190
+ Extra memory due to padding: 135.00M (2.0x expansion)
191
+ XLA label: %fusion.10361 = (f32[4,12,60,64]{3,2,1,0:T(8,128)}, f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}) fusion(s32[4,12,62,64,192]{3,4,2,1,0:T(8,128)} %get-tuple-element.18295, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.14494, bf16[4,12,60,192,64]{3,2,1,0,4:T...
192
+ Allocation type: HLO temp
193
+ ==========================
194
+ 19. Size: 270.00M
195
+ Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=584
196
+ Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
197
+ Unpadded size: 135.00M
198
+ Extra memory due to padding: 135.00M (2.0x expansion)
199
+ XLA label: %fusion.380.remat5 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} fusion(f32[4,60,64,192]{2,3,1,0:T(8,128)} %get-tuple-element.17038, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.14296, bf16[4,12,60,192,64]{3,2,4,1,0:T(8,128)(2,1)} %fusion.651), kind=kOut...
200
+ Allocation type: HLO temp
201
+ ==========================
202
+ 20. Size: 270.00M
203
+ Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=584
204
+ Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
205
+ Unpadded size: 135.00M
206
+ Extra memory due to padding: 135.00M (2.0x expansion)
207
+ XLA label: %fusion.379.remat3 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} fusion(f32[4,60,64,192]{2,3,1,0:T(8,128)} %get-tuple-element.17038, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.14362, bf16[4,12,60,192,64]{3,2,4,1,0:T(8,128)(2,1)} %fusion.653), kind=kOut...
208
+ Allocation type: HLO temp
209
+ ==========================
210
+ During handling of the above exception, another exception occurred:
211
+ Traceback (most recent call last):
212
+ File "./run_mlm_flax.py", line 709, in <module>
213
+ state, train_metric, dropout_rngs = p_train_step(state, model_inputs, dropout_rngs)
214
+ File "/home/dat/pino/lib/python3.8/site-packages/jax/_src/traceback_util.py", line 183, in reraise_with_filtered_traceback
215
+ return fun(*args, **kwargs)
216
+ KeyboardInterrupt
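
Note: the allocation dump above is XLA's report of the largest temporary buffers alive when pmap(train_step) ran out of TPU memory; the KeyboardInterrupt is simply the run being stopped afterwards. The reported sizes can be reproduced from the logged shapes and dtypes alone. The small check below does that; it is a sanity check, not code from run_mlm_flax.py, and the reading of the leading dimensions (4 = per-device batch, 12 = attention heads) is an assumption rather than something the log states.

# Reproduce the buffer sizes reported in the XLA OOM dump above
# from the logged shapes and dtypes alone.
import numpy as np

def mib(shape, bytes_per_elem):
    # Size of a dense array with this shape, in MiB.
    return np.prod(shape) * bytes_per_elem / 2**20

# bf16 attention buffers from modeling_flax_big_bird.py:619
print(mib((4, 12, 60, 64, 512), 2))      # 180.0 -> "Unpadded size: 180.00M"
print(2 * mib((4, 12, 60, 64, 512), 2))  # 360.0 -> "Size: 360.00M" after 2.0x padding
# f32 einsum outputs from modeling_flax_big_bird.py:584/591
print(mib((4, 12, 60, 64, 192), 4))      # 135.0 -> "Unpadded size: 135.00M"

Summed over the twenty buffers listed, these padded temporaries alone come to several GB, which is consistent with the step not fitting in per-core TPU memory at per-device batch size 4 and sequence length 4096.
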
wandb/run-20210713_005751-1wnn0lyf/files/requirements.txt ADDED
@@ -0,0 +1,92 @@
1
+ absl-py==0.13.0
2
+ aiohttp==3.7.4.post0
3
+ astunparse==1.6.3
4
+ async-timeout==3.0.1
5
+ attrs==21.2.0
6
+ cachetools==4.2.2
7
+ certifi==2021.5.30
8
+ chardet==4.0.0
9
+ chex==0.0.8
10
+ click==8.0.1
11
+ configparser==5.0.2
12
+ cycler==0.10.0
13
+ datasets==1.9.1.dev0
14
+ dill==0.3.4
15
+ dm-tree==0.1.6
16
+ docker-pycreds==0.4.0
17
+ filelock==3.0.12
18
+ flatbuffers==1.12
19
+ flax==0.3.4
20
+ fsspec==2021.6.1
21
+ gast==0.4.0
22
+ gitdb==4.0.7
23
+ gitpython==3.1.18
24
+ google-auth-oauthlib==0.4.4
25
+ google-auth==1.32.1
26
+ google-pasta==0.2.0
27
+ grpcio==1.34.1
28
+ h5py==3.1.0
29
+ huggingface-hub==0.0.12
30
+ idna==2.10
31
+ jax==0.2.16
32
+ jaxlib==0.1.68
33
+ joblib==1.0.1
34
+ keras-nightly==2.5.0.dev2021032900
35
+ keras-preprocessing==1.1.2
36
+ kiwisolver==1.3.1
37
+ libtpu-nightly==0.1.dev20210615
38
+ markdown==3.3.4
39
+ matplotlib==3.4.2
40
+ msgpack==1.0.2
41
+ multidict==5.1.0
42
+ multiprocess==0.70.12.2
43
+ numpy==1.19.5
44
+ oauthlib==3.1.1
45
+ opt-einsum==3.3.0
46
+ optax==0.0.9
47
+ packaging==21.0
48
+ pandas==1.3.0
49
+ pathtools==0.1.2
50
+ pillow==8.3.1
51
+ pip==20.0.2
52
+ pkg-resources==0.0.0
53
+ promise==2.3
54
+ protobuf==3.17.3
55
+ psutil==5.8.0
56
+ pyarrow==4.0.1
57
+ pyasn1-modules==0.2.8
58
+ pyasn1==0.4.8
59
+ pyparsing==2.4.7
60
+ python-dateutil==2.8.1
61
+ pytz==2021.1
62
+ pyyaml==5.4.1
63
+ regex==2021.7.6
64
+ requests-oauthlib==1.3.0
65
+ requests==2.25.1
66
+ rsa==4.7.2
67
+ sacremoses==0.0.45
68
+ scipy==1.7.0
69
+ sentry-sdk==1.3.0
70
+ setuptools==44.0.0
71
+ shortuuid==1.0.1
72
+ six==1.15.0
73
+ smmap==4.0.0
74
+ subprocess32==3.5.4
75
+ tensorboard-data-server==0.6.1
76
+ tensorboard-plugin-wit==1.8.0
77
+ tensorboard==2.5.0
78
+ tensorflow-estimator==2.5.0
79
+ tensorflow==2.5.0
80
+ termcolor==1.1.0
81
+ tokenizers==0.10.3
82
+ toolz==0.11.1
83
+ tqdm==4.61.2
84
+ transformers==4.9.0.dev0
85
+ typing-extensions==3.7.4.3
86
+ urllib3==1.26.6
87
+ wandb==0.10.33
88
+ werkzeug==2.0.1
89
+ wheel==0.36.2
90
+ wrapt==1.12.1
91
+ xxhash==2.0.2
92
+ yarl==1.6.3
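
For reference, a quick way to confirm that a local virtualenv matches the pins captured above is to compare against importlib.metadata. The snippet below is not part of this repository; it assumes Python 3.8+ and only spot-checks the packages most relevant to this run (versions taken verbatim from the file above).

# Spot-check installed versions against the pins in this requirements.txt.
from importlib.metadata import version

pins = {
    "jax": "0.2.16",
    "jaxlib": "0.1.68",
    "flax": "0.3.4",
    "optax": "0.0.9",
    "transformers": "4.9.0.dev0",
    "wandb": "0.10.33",
}
for pkg, pinned in pins.items():
    installed = version(pkg)
    status = "ok" if installed == pinned else f"MISMATCH (pinned {pinned})"
    print(f"{pkg}=={installed} {status}")
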
wandb/run-20210713_005751-1wnn0lyf/files/wandb-metadata.json ADDED
@@ -0,0 +1,44 @@
1
+ {
2
+ "os": "Linux-5.4.0-1043-gcp-x86_64-with-glibc2.29",
3
+ "python": "3.8.10",
4
+ "heartbeatAt": "2021-07-13T00:57:53.965536",
5
+ "startedAt": "2021-07-13T00:57:51.918634",
6
+ "docker": null,
7
+ "cpu_count": 96,
8
+ "cuda": null,
9
+ "args": [
10
+ "--push_to_hub",
11
+ "--output_dir=./",
12
+ "--model_type=big_bird",
13
+ "--config_name=./",
14
+ "--tokenizer_name=./",
15
+ "--max_seq_length=4096",
16
+ "--weight_decay=0.0095",
17
+ "--warmup_steps=5000",
18
+ "--overwrite_output_dir",
19
+ "--adam_beta1=0.9",
20
+ "--adam_beta2=0.98",
21
+ "--logging_steps=500",
22
+ "--eval_steps=92768",
23
+ "--num_train_epochs=5",
24
+ "--preprocessing_num_workers=64",
25
+ "--save_steps=20000",
26
+ "--learning_rate=5e-5",
27
+ "--per_device_train_batch_size=4",
28
+ "--per_device_eval_batch_size=4",
29
+ "--save_total_limit=5",
30
+ "--dtype=bfloat16"
31
+ ],
32
+ "state": "running",
33
+ "program": "./run_mlm_flax.py",
34
+ "codePath": "run_mlm_flax.py",
35
+ "git": {
36
+ "remote": "https://huggingface.co/flax-community/pino-roberta-base",
37
+ "commit": "4229c91b780cf07115cc6d04c16e393b0d2f508c"
38
+ },
39
+ "email": null,
40
+ "root": "/home/dat/pino-roberta-base",
41
+ "host": "t1v-n-f5c06ea1-w-0",
42
+ "username": "dat",
43
+ "executable": "/home/dat/pino/bin/python"
44
+ }
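
The "args" array above is the exact command line wandb recorded for this run. Below is a hedged sketch of how those flags map back onto HuggingFace TrainingArguments; it is not the code in run_mlm_flax.py, which parses the same argv together with its own ModelArguments/DataTrainingArguments dataclasses, so here the model- and data-specific flags are simply left in `remaining`.

# Sketch: parse the argv recorded in wandb-metadata.json back into
# TrainingArguments; flags such as --model_type stay in `remaining`.
import json
from transformers import HfArgumentParser, TrainingArguments

meta = json.load(open(
    "wandb/run-20210713_005751-1wnn0lyf/files/wandb-metadata.json"))
parser = HfArgumentParser(TrainingArguments)
training_args, remaining = parser.parse_args_into_dataclasses(
    args=meta["args"], return_remaining_strings=True)
print(training_args.per_device_train_batch_size)  # 4, as in the config_cb entries below
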
wandb/run-20210713_005751-1wnn0lyf/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
1
+ {}
wandb/run-20210713_005751-1wnn0lyf/logs/debug-internal.log ADDED
@@ -0,0 +1,61 @@
1
+ 2021-07-13 00:57:52,645 INFO MainThread:329334 [internal.py:wandb_internal():88] W&B internal server running at pid: 329334, started at: 2021-07-13 00:57:52.644860
2
+ 2021-07-13 00:57:52,647 DEBUG HandlerThread:329334 [handler.py:handle_request():124] handle_request: check_version
3
+ 2021-07-13 00:57:52,647 INFO WriterThread:329334 [datastore.py:open_for_write():80] open: /home/dat/pino-roberta-base/wandb/run-20210713_005751-1wnn0lyf/run-1wnn0lyf.wandb
4
+ 2021-07-13 00:57:52,648 DEBUG SenderThread:329334 [sender.py:send():179] send: header
5
+ 2021-07-13 00:57:52,648 DEBUG SenderThread:329334 [sender.py:send_request():193] send_request: check_version
6
+ 2021-07-13 00:57:52,687 DEBUG SenderThread:329334 [sender.py:send():179] send: run
7
+ 2021-07-13 00:57:52,862 INFO SenderThread:329334 [dir_watcher.py:__init__():168] watching files in: /home/dat/pino-roberta-base/wandb/run-20210713_005751-1wnn0lyf/files
8
+ 2021-07-13 00:57:52,862 INFO SenderThread:329334 [sender.py:_start_run_threads():716] run started: 1wnn0lyf with start time 1626137872
9
+ 2021-07-13 00:57:52,862 DEBUG SenderThread:329334 [sender.py:send():179] send: summary
10
+ 2021-07-13 00:57:52,862 DEBUG HandlerThread:329334 [handler.py:handle_request():124] handle_request: run_start
11
+ 2021-07-13 00:57:52,863 INFO SenderThread:329334 [sender.py:_save_file():841] saving file wandb-summary.json with policy end
12
+ 2021-07-13 00:57:53,865 INFO Thread-8 :329334 [dir_watcher.py:_on_file_created():216] file/dir created: /home/dat/pino-roberta-base/wandb/run-20210713_005751-1wnn0lyf/files/wandb-summary.json
13
+ 2021-07-13 00:57:53,965 DEBUG HandlerThread:329334 [meta.py:__init__():39] meta init
14
+ 2021-07-13 00:57:53,965 DEBUG HandlerThread:329334 [meta.py:__init__():53] meta init done
15
+ 2021-07-13 00:57:53,965 DEBUG HandlerThread:329334 [meta.py:probe():210] probe
16
+ 2021-07-13 00:57:53,966 DEBUG HandlerThread:329334 [meta.py:_setup_git():200] setup git
17
+ 2021-07-13 00:57:53,996 DEBUG HandlerThread:329334 [meta.py:_setup_git():207] setup git done
18
+ 2021-07-13 00:57:53,996 DEBUG HandlerThread:329334 [meta.py:_save_pip():57] save pip
19
+ 2021-07-13 00:57:53,996 DEBUG HandlerThread:329334 [meta.py:_save_pip():71] save pip done
20
+ 2021-07-13 00:57:53,996 DEBUG HandlerThread:329334 [meta.py:probe():252] probe done
21
+ 2021-07-13 00:57:53,999 DEBUG SenderThread:329334 [sender.py:send():179] send: files
22
+ 2021-07-13 00:57:53,999 INFO SenderThread:329334 [sender.py:_save_file():841] saving file wandb-metadata.json with policy now
23
+ 2021-07-13 00:57:54,007 DEBUG HandlerThread:329334 [handler.py:handle_request():124] handle_request: stop_status
24
+ 2021-07-13 00:57:54,007 DEBUG SenderThread:329334 [sender.py:send_request():193] send_request: stop_status
25
+ 2021-07-13 00:57:54,134 DEBUG SenderThread:329334 [sender.py:send():179] send: config
26
+ 2021-07-13 00:57:54,135 DEBUG SenderThread:329334 [sender.py:send():179] send: config
27
+ 2021-07-13 00:57:54,135 DEBUG SenderThread:329334 [sender.py:send():179] send: config
28
+ 2021-07-13 00:57:54,460 INFO Thread-11 :329334 [upload_job.py:push():137] Uploaded file /tmp/tmpbiuftyldwandb/b3fet9y4-wandb-metadata.json
29
+ 2021-07-13 00:57:54,864 INFO Thread-8 :329334 [dir_watcher.py:_on_file_created():216] file/dir created: /home/dat/pino-roberta-base/wandb/run-20210713_005751-1wnn0lyf/files/output.log
30
+ 2021-07-13 00:57:54,864 INFO Thread-8 :329334 [dir_watcher.py:_on_file_created():216] file/dir created: /home/dat/pino-roberta-base/wandb/run-20210713_005751-1wnn0lyf/files/wandb-metadata.json
31
+ 2021-07-13 00:57:54,864 INFO Thread-8 :329334 [dir_watcher.py:_on_file_created():216] file/dir created: /home/dat/pino-roberta-base/wandb/run-20210713_005751-1wnn0lyf/files/requirements.txt
32
+ 2021-07-13 00:58:09,136 DEBUG HandlerThread:329334 [handler.py:handle_request():124] handle_request: stop_status
33
+ 2021-07-13 00:58:09,136 DEBUG SenderThread:329334 [sender.py:send_request():193] send_request: stop_status
34
+ 2021-07-13 00:58:10,870 INFO Thread-8 :329334 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_005751-1wnn0lyf/files/output.log
35
+ 2021-07-13 00:58:22,050 DEBUG SenderThread:329334 [sender.py:send():179] send: stats
36
+ 2021-07-13 00:58:23,875 INFO Thread-8 :329334 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_005751-1wnn0lyf/files/config.yaml
37
+ 2021-07-13 00:58:24,269 DEBUG HandlerThread:329334 [handler.py:handle_request():124] handle_request: stop_status
38
+ 2021-07-13 00:58:24,269 DEBUG SenderThread:329334 [sender.py:send_request():193] send_request: stop_status
39
+ 2021-07-13 00:58:39,402 DEBUG HandlerThread:329334 [handler.py:handle_request():124] handle_request: stop_status
40
+ 2021-07-13 00:58:39,403 DEBUG SenderThread:329334 [sender.py:send_request():193] send_request: stop_status
41
+ 2021-07-13 00:58:52,130 DEBUG SenderThread:329334 [sender.py:send():179] send: stats
42
+ 2021-07-13 00:58:54,537 DEBUG HandlerThread:329334 [handler.py:handle_request():124] handle_request: stop_status
43
+ 2021-07-13 00:58:54,537 DEBUG SenderThread:329334 [sender.py:send_request():193] send_request: stop_status
44
+ 2021-07-13 00:59:00,888 INFO Thread-8 :329334 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_005751-1wnn0lyf/files/output.log
45
+ 2021-07-13 00:59:09,682 DEBUG HandlerThread:329334 [handler.py:handle_request():124] handle_request: stop_status
46
+ 2021-07-13 00:59:09,683 DEBUG SenderThread:329334 [sender.py:send_request():193] send_request: stop_status
47
+ 2021-07-13 00:59:22,209 DEBUG SenderThread:329334 [sender.py:send():179] send: stats
48
+ 2021-07-13 00:59:24,837 DEBUG HandlerThread:329334 [handler.py:handle_request():124] handle_request: stop_status
49
+ 2021-07-13 00:59:24,837 DEBUG SenderThread:329334 [sender.py:send_request():193] send_request: stop_status
50
+ 2021-07-13 00:59:39,971 DEBUG HandlerThread:329334 [handler.py:handle_request():124] handle_request: stop_status
51
+ 2021-07-13 00:59:39,971 DEBUG SenderThread:329334 [sender.py:send_request():193] send_request: stop_status
52
+ 2021-07-13 00:59:52,289 DEBUG SenderThread:329334 [sender.py:send():179] send: stats
53
+ 2021-07-13 00:59:55,105 DEBUG HandlerThread:329334 [handler.py:handle_request():124] handle_request: stop_status
54
+ 2021-07-13 00:59:55,105 DEBUG SenderThread:329334 [sender.py:send_request():193] send_request: stop_status
55
+ 2021-07-13 01:00:10,158 WARNING MainThread:329334 [internal.py:wandb_internal():147] Internal process interrupt: 1
56
+ 2021-07-13 01:00:10,246 DEBUG HandlerThread:329334 [handler.py:handle_request():124] handle_request: stop_status
57
+ 2021-07-13 01:00:10,246 DEBUG SenderThread:329334 [sender.py:send_request():193] send_request: stop_status
58
+ 2021-07-13 01:00:11,893 WARNING MainThread:329334 [internal.py:wandb_internal():147] Internal process interrupt: 2
59
+ 2021-07-13 01:00:11,893 ERROR MainThread:329334 [internal.py:wandb_internal():150] Internal process interrupted.
60
+ 2021-07-13 01:00:12,253 INFO HandlerThread:329334 [handler.py:finish():638] shutting down handler
61
+ 2021-07-13 01:00:12,281 INFO MainThread:329334 [internal.py:handle_exit():78] Internal process exited
wandb/run-20210713_005751-1wnn0lyf/logs/debug.log ADDED
@@ -0,0 +1,28 @@
1
+ 2021-07-13 00:57:51,920 INFO MainThread:327810 [wandb_setup.py:_flush():69] setting env: {}
2
+ 2021-07-13 00:57:51,920 INFO MainThread:327810 [wandb_setup.py:_flush():69] setting login settings: {}
3
+ 2021-07-13 00:57:51,920 INFO MainThread:327810 [wandb_init.py:_log_setup():337] Logging user logs to /home/dat/pino-roberta-base/wandb/run-20210713_005751-1wnn0lyf/logs/debug.log
4
+ 2021-07-13 00:57:51,920 INFO MainThread:327810 [wandb_init.py:_log_setup():338] Logging internal logs to /home/dat/pino-roberta-base/wandb/run-20210713_005751-1wnn0lyf/logs/debug-internal.log
5
+ 2021-07-13 00:57:51,920 INFO MainThread:327810 [wandb_init.py:init():370] calling init triggers
6
+ 2021-07-13 00:57:51,920 INFO MainThread:327810 [wandb_init.py:init():375] wandb.init called with sweep_config: {}
7
+ config: {}
8
+ 2021-07-13 00:57:51,920 INFO MainThread:327810 [wandb_init.py:init():419] starting backend
9
+ 2021-07-13 00:57:51,920 INFO MainThread:327810 [backend.py:_multiprocessing_setup():70] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
10
+ 2021-07-13 00:57:51,997 INFO MainThread:327810 [backend.py:ensure_launched():135] starting backend process...
11
+ 2021-07-13 00:57:52,047 INFO MainThread:327810 [backend.py:ensure_launched():139] started backend process with pid: 329334
12
+ 2021-07-13 00:57:52,050 INFO MainThread:327810 [wandb_init.py:init():424] backend started and connected
13
+ 2021-07-13 00:57:52,053 INFO MainThread:327810 [wandb_init.py:init():472] updated telemetry
14
+ 2021-07-13 00:57:52,054 INFO MainThread:327810 [wandb_init.py:init():491] communicating current version
15
+ 2021-07-13 00:57:52,686 INFO MainThread:327810 [wandb_init.py:init():496] got version response
16
+ 2021-07-13 00:57:52,686 INFO MainThread:327810 [wandb_init.py:init():504] communicating run to backend with 30 second timeout
17
+ 2021-07-13 00:57:52,861 INFO MainThread:327810 [wandb_init.py:init():529] starting run threads in backend
18
+ 2021-07-13 00:57:54,003 INFO MainThread:327810 [wandb_run.py:_console_start():1623] atexit reg
19
+ 2021-07-13 00:57:54,004 INFO MainThread:327810 [wandb_run.py:_redirect():1497] redirect: SettingsConsole.REDIRECT
20
+ 2021-07-13 00:57:54,004 INFO MainThread:327810 [wandb_run.py:_redirect():1502] Redirecting console.
21
+ 2021-07-13 00:57:54,006 INFO MainThread:327810 [wandb_run.py:_redirect():1558] Redirects installed.
22
+ 2021-07-13 00:57:54,006 INFO MainThread:327810 [wandb_init.py:init():554] run started, returning control to user process
23
+ 2021-07-13 00:57:54,012 INFO MainThread:327810 [wandb_run.py:_config_callback():872] config_cb None None {'output_dir': './', 'overwrite_output_dir': True, 'do_train': False, 'do_eval': False, 'do_predict': False, 'evaluation_strategy': 'IntervalStrategy.NO', 'prediction_loss_only': False, 'per_device_train_batch_size': 4, 'per_device_eval_batch_size': 4, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'learning_rate': 5e-05, 'weight_decay': 0.0095, 'adam_beta1': 0.9, 'adam_beta2': 0.98, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 5.0, 'max_steps': -1, 'lr_scheduler_type': 'SchedulerType.LINEAR', 'warmup_ratio': 0.0, 'warmup_steps': 5000, 'log_level': -1, 'log_level_replica': -1, 'log_on_each_node': True, 'logging_dir': './runs/Jul13_00-57-01_t1v-n-f5c06ea1-w-0', 'logging_strategy': 'IntervalStrategy.STEPS', 'logging_first_step': False, 'logging_steps': 500, 'save_strategy': 'IntervalStrategy.STEPS', 'save_steps': 20000, 'save_total_limit': 5, 'save_on_each_node': False, 'no_cuda': False, 'seed': 42, 'fp16': False, 'fp16_opt_level': 'O1', 'fp16_backend': 'auto', 'fp16_full_eval': False, 'local_rank': -1, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 92768, 'dataloader_num_workers': 0, 'past_index': -1, 'run_name': './', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'sharded_ddp': [], 'deepspeed': None, 'label_smoothing_factor': 0.0, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['tensorboard', 'wandb'], 'ddp_find_unused_parameters': None, 'dataloader_pin_memory': True, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': True, 'resume_from_checkpoint': None, 'push_to_hub_model_id': '', 'push_to_hub_organization': None, 'push_to_hub_token': None, 'mp_parameters': ''}
24
+ 2021-07-13 00:57:54,014 INFO MainThread:327810 [wandb_run.py:_config_callback():872] config_cb None None {'model_name_or_path': None, 'model_type': 'big_bird', 'config_name': './', 'tokenizer_name': './', 'cache_dir': None, 'use_fast_tokenizer': True, 'dtype': 'bfloat16'}
25
+ 2021-07-13 00:57:54,016 INFO MainThread:327810 [wandb_run.py:_config_callback():872] config_cb None None {'dataset_name': None, 'dataset_config_name': None, 'train_file': None, 'validation_file': None, 'train_ref_file': None, 'validation_ref_file': None, 'overwrite_cache': False, 'validation_split_percentage': 5, 'max_seq_length': 4096, 'preprocessing_num_workers': 64, 'mlm_probability': 0.15, 'pad_to_max_length': False, 'line_by_line': False}
26
+ 2021-07-13 01:00:22,944 INFO MainThread:327810 [wandb_run.py:_atexit_cleanup():1593] got exitcode: 255
27
+ 2021-07-13 01:00:22,945 INFO MainThread:327810 [wandb_run.py:_restore():1565] restore
28
+ 2021-07-13 01:00:25,397 INFO MainThread:327810 [wandb_run.py:_restore():1565] restore
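
The config_cb entries above record the effective optimizer hyperparameters for this run (learning_rate 5e-5, warmup_steps 5000, weight_decay 0.0095, adam_beta1 0.9, adam_beta2 0.98, adafactor disabled). A minimal optax sketch of how such a linear warmup/decay AdamW setup is usually assembled is shown below; the exact wiring inside run_mlm_flax.py may differ, and num_train_steps here is only illustrative.

# Minimal optax sketch matching the hyperparameters logged by config_cb above.
import optax

num_train_steps = 5 * 92_768   # illustrative; the real value depends on dataset size
warmup_steps = 5_000

warmup = optax.linear_schedule(init_value=0.0, end_value=5e-5,
                               transition_steps=warmup_steps)
decay = optax.linear_schedule(init_value=5e-5, end_value=0.0,
                              transition_steps=num_train_steps - warmup_steps)
schedule = optax.join_schedules([warmup, decay], boundaries=[warmup_steps])

optimizer = optax.adamw(learning_rate=schedule,
                        b1=0.9, b2=0.98, eps=1e-8,
                        weight_decay=0.0095)
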
wandb/run-20210713_005751-1wnn0lyf/run-1wnn0lyf.wandb ADDED
Binary file (3.98 kB)