Commit 9bd1fec
dat committed
Parent(s): 4229c91

add all
- config.json +1 -1
- events.out.tfevents.1626137349.t1v-n-f5c06ea1-w-0.323744.3.v2 +3 -0
- events.out.tfevents.1626137580.t1v-n-f5c06ea1-w-0.325900.3.v2 +3 -0
- events.out.tfevents.1626137871.t1v-n-f5c06ea1-w-0.327810.3.v2 +3 -0
- run.sh +3 -2
- run_mlm_flax.py +10 -10
- wandb/debug-internal.log +1 -1
- wandb/debug.log +1 -1
- wandb/latest-run +1 -1
- wandb/run-20210713_004910-3mu9pog5/files/config.yaml +307 -0
- wandb/run-20210713_004910-3mu9pog5/files/output.log +376 -0
- wandb/run-20210713_004910-3mu9pog5/files/requirements.txt +92 -0
- wandb/run-20210713_004910-3mu9pog5/files/wandb-metadata.json +46 -0
- wandb/run-20210713_004910-3mu9pog5/files/wandb-summary.json +1 -0
- wandb/run-20210713_004910-3mu9pog5/logs/debug-internal.log +166 -0
- wandb/run-20210713_004910-3mu9pog5/logs/debug.log +119 -0
- wandb/run-20210713_004910-3mu9pog5/run-3mu9pog5.wandb +0 -0
- wandb/run-20210713_005301-2ilkub1o/files/config.yaml +307 -0
- wandb/run-20210713_005301-2ilkub1o/files/output.log +376 -0
- wandb/run-20210713_005301-2ilkub1o/files/requirements.txt +92 -0
- wandb/run-20210713_005301-2ilkub1o/files/wandb-metadata.json +46 -0
- wandb/run-20210713_005301-2ilkub1o/files/wandb-summary.json +1 -0
- wandb/run-20210713_005301-2ilkub1o/logs/debug-internal.log +168 -0
- wandb/run-20210713_005301-2ilkub1o/logs/debug.log +127 -0
- wandb/run-20210713_005301-2ilkub1o/run-2ilkub1o.wandb +0 -0
- wandb/run-20210713_005751-1wnn0lyf/files/config.yaml +304 -0
- wandb/run-20210713_005751-1wnn0lyf/files/output.log +216 -0
- wandb/run-20210713_005751-1wnn0lyf/files/requirements.txt +92 -0
- wandb/run-20210713_005751-1wnn0lyf/files/wandb-metadata.json +44 -0
- wandb/run-20210713_005751-1wnn0lyf/files/wandb-summary.json +1 -0
- wandb/run-20210713_005751-1wnn0lyf/logs/debug-internal.log +61 -0
- wandb/run-20210713_005751-1wnn0lyf/logs/debug.log +28 -0
- wandb/run-20210713_005751-1wnn0lyf/run-1wnn0lyf.wandb +0 -0
config.json
CHANGED
@@ -4,7 +4,7 @@
   ],
   "attention_probs_dropout_prob": 0.1,
   "attention_type": "block_sparse",
-  "block_size":
+  "block_size": 128,
   "bos_token_id": 1,
   "eos_token_id": 2,
   "gradient_checkpointing": false,
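For context, block_size sets the token-block granularity of BigBird's block_sparse attention, and the sequence length must divide evenly into such blocks. A minimal illustrative sketch of that constraint, assuming the standard transformers BigBirdConfig API rather than code from this repo:

from transformers import BigBirdConfig

# Illustration only: with block_size=128, a 4096-token sequence is handled as
# 4096 / 128 = 32 blocks of sparse attention.
config = BigBirdConfig(attention_type="block_sparse", block_size=128,
                       max_position_embeddings=4096)
assert config.max_position_embeddings % config.block_size == 0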
events.out.tfevents.1626137349.t1v-n-f5c06ea1-w-0.323744.3.v2
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f159e4108512bc68b8363ca06b6026ff0844d045b08ba76516f2764b90277292
+size 40
events.out.tfevents.1626137580.t1v-n-f5c06ea1-w-0.325900.3.v2
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:72867ca0c2d013977242562e1efa683ba957c1b4c3352c0547c72dcd0e611de8
+size 40
events.out.tfevents.1626137871.t1v-n-f5c06ea1-w-0.327810.3.v2
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f4e6248f4d9c467b9b97ff9829c4847d2f568eaf3b4c6b79865519f1e98780a9
+size 40
run.sh
CHANGED
@@ -19,12 +19,13 @@ python ./run_mlm_flax.py \
     --num_train_epochs="5" \
     --preprocessing_num_workers="64" \
     --save_steps="20000" \
-    --adafactor \
     --learning_rate="5e-5" \
     --per_device_train_batch_size="2" \
     --per_device_eval_batch_size="2" \
     --save_total_limit="5"\
     --dtype="bfloat16" \
+    #--adafactor \
+    #--gradient_accumulation_steps="8" \
     #--resume_from_checkpoint="./"\
-
+
 
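Dropping --adafactor switches the run back to the script's default optimizer. A hedged sketch of the branch this flag feeds, assuming run_mlm_flax.py selects between optax.adafactor and optax.adamw the way the upstream Flax MLM example does (illustrative code, not verified against this repo; the betas and weight decay mirror the run configuration recorded below):

import optax

use_adafactor = False  # run.sh no longer passes --adafactor

if use_adafactor:
    # Adafactor stores factored second-moment estimates, saving optimizer
    # memory at the cost of a noisier update.
    optimizer = optax.adafactor(learning_rate=5e-5)
else:
    optimizer = optax.adamw(learning_rate=5e-5, b1=0.9, b2=0.98,
                            weight_decay=0.0095)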
run_mlm_flax.py
CHANGED
@@ -563,7 +563,7 @@ if __name__ == "__main__":
 
     # Store some constant
     num_epochs = int(training_args.num_train_epochs)
-    train_batch_size = int(training_args.per_device_train_batch_size) * jax.device_count()
+    train_batch_size = int(training_args.per_device_train_batch_size) * jax.device_count() #* training_args.gradient_accumulation_steps
     eval_batch_size = int(training_args.per_device_eval_batch_size) * jax.device_count()
 
     num_train_steps = len(train_dataset) // train_batch_size * num_epochs
@@ -610,9 +610,9 @@ if __name__ == "__main__":
         mask=decay_mask_fn,
     )
 
-    if training_args.gradient_accumulation_steps > 1:
-        optimizer = optax.MultiSteps(optimizer, training_args.gradient_accumulation_steps)
-    grad_accum_steps = training_args.gradient_accumulation_steps
+    #if training_args.gradient_accumulation_steps > 1:
+    #    optimizer = optax.MultiSteps(optimizer, training_args.gradient_accumulation_steps)
+    #grad_accum_steps = training_args.gradient_accumulation_steps
 
     # Setup train state
 
@@ -650,7 +650,7 @@ if __name__ == "__main__":
         new_state = state.apply_gradients(grads=grad)
 
         metrics = jax.lax.pmean(
-            {"loss": loss, "learning_rate": linear_decay_lr_schedule_fn(state.step // grad_accum_steps)}, axis_name="batch"
+            {"loss": loss, "learning_rate": linear_decay_lr_schedule_fn(state.step )}, axis_name="batch" #// grad_accum_steps
         )
 
         return new_state, metrics, new_dropout_rng
@@ -696,10 +696,10 @@ if __name__ == "__main__":
         # Generate an epoch by shuffling sampling indices from the train dataset
         num_train_samples = len(train_dataset)
         train_samples_idx = jax.random.permutation(input_rng, jnp.arange(num_train_samples))
-        train_batch_idx = generate_batch_splits(train_samples_idx, train_batch_size // grad_accum_steps)
+        train_batch_idx = generate_batch_splits(train_samples_idx, train_batch_size) #// grad_accum_steps
 
         # Gather the indexes for creating the batch and do a training step
-        for step, batch_idx in enumerate(tqdm(train_batch_idx, desc="Training...", position=1,initial=resume_step // grad_accum_steps)):
+        for step, batch_idx in enumerate(tqdm(train_batch_idx, desc="Training...", position=1,initial=resume_step)): #// grad_accum_steps
             samples = [train_dataset[int(idx)] for idx in batch_idx]
             model_inputs = data_collator(samples, pad_to_multiple_of=16)
 
@@ -713,7 +713,7 @@ if __name__ == "__main__":
             if cur_step < resume_step:
                 continue
 
-            if (cur_step % training_args.logging_steps * grad_accum_steps) == 0 and cur_step > 0:
+            if (cur_step % training_args.logging_steps) == 0 and cur_step > 0: #* grad_accum_steps
                 # Save metrics
                 train_metric = jax_utils.unreplicate(train_metric)
                 train_time += time.time() - train_start
@@ -730,7 +730,7 @@ if __name__ == "__main__":
 
                 train_metrics = []
 
-            if cur_step % (training_args.eval_steps * grad_accum_steps) == 0 and cur_step > 0:
+            if cur_step % (training_args.eval_steps) == 0 and cur_step > 0: #* grad_accum_steps
                 # ======================== Evaluating ==============================
                 num_eval_samples = len(eval_dataset)
                 eval_samples_idx = jnp.arange(num_eval_samples)
@@ -763,7 +763,7 @@ if __name__ == "__main__":
                 _metrics = {f"eval_{k}":mb_item(v) for k, v in eval_metrics.items()}
                 wandb.log({"eval_step":cur_step, **_metrics})
 
-            if (cur_step % (training_args.save_steps * grad_accum_steps) == 0) and cur_step > 0:
+            if (cur_step % training_args.save_steps == 0) and cur_step > 0: #* grad_accum_steps
                 # save checkpoint after each epoch and push checkpoint to the hub
                 if jax.process_index() == 0:
                     params = jax.device_get(jax.tree_map(lambda x: x[0], state.params))
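The block disabled in the @@ -610,9 +610,9 @@ hunk above is optax's gradient-accumulation wrapper. A self-contained sketch of the technique being commented out, using optax.MultiSteps exactly as the diff does; everything else here is illustrative, not this repo's code:

import jax
import jax.numpy as jnp
import optax

grad_accum_steps = 8  # mirrors --gradient_accumulation_steps="8"

# MultiSteps wraps an inner optimizer so that a real parameter update is
# emitted only on every k-th call to update(); in between, gradients are
# accumulated (averaged) and the returned updates are all zeros.
optimizer = optax.MultiSteps(optax.adamw(learning_rate=5e-5), grad_accum_steps)

params = {"w": jnp.zeros(3)}
opt_state = optimizer.init(params)

def loss_fn(p):
    return jnp.sum((p["w"] - 1.0) ** 2)

for micro_step in range(grad_accum_steps):
    grads = jax.grad(loss_fn)(params)
    updates, opt_state = optimizer.update(grads, opt_state, params)
    params = optax.apply_updates(params, updates)  # no-op until the k-th call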
wandb/debug-internal.log
CHANGED
@@ -1 +1 @@
-run-
+run-20210713_005751-1wnn0lyf/logs/debug-internal.log
wandb/debug.log
CHANGED
@@ -1 +1 @@
-run-
+run-20210713_005751-1wnn0lyf/logs/debug.log
wandb/latest-run
CHANGED
@@ -1 +1 @@
-run-
+run-20210713_005751-1wnn0lyf
wandb/run-20210713_004910-3mu9pog5/files/config.yaml
ADDED
@@ -0,0 +1,307 @@
+wandb_version: 1
+
+_wandb:
+  desc: null
+  value:
+    cli_version: 0.10.33
+    framework: huggingface
+    huggingface_version: 4.9.0.dev0
+    is_jupyter_run: false
+    is_kaggle_kernel: false
+    python_version: 3.8.10
+    t:
+      1:
+      - 3
+      - 11
+      2:
+      - 3
+      - 11
+      4: 3.8.10
+      5: 0.10.33
+      6: 4.9.0.dev0
+      8:
+      - 5
+adafactor:
+  desc: null
+  value: true
+adam_beta1:
+  desc: null
+  value: 0.9
+adam_beta2:
+  desc: null
+  value: 0.98
+adam_epsilon:
+  desc: null
+  value: 1.0e-08
+cache_dir:
+  desc: null
+  value: null
+config_name:
+  desc: null
+  value: ./
+dataloader_drop_last:
+  desc: null
+  value: false
+dataloader_num_workers:
+  desc: null
+  value: 0
+dataloader_pin_memory:
+  desc: null
+  value: true
+dataset_config_name:
+  desc: null
+  value: null
+dataset_name:
+  desc: null
+  value: null
+ddp_find_unused_parameters:
+  desc: null
+  value: null
+debug:
+  desc: null
+  value: []
+deepspeed:
+  desc: null
+  value: null
+disable_tqdm:
+  desc: null
+  value: false
+do_eval:
+  desc: null
+  value: false
+do_predict:
+  desc: null
+  value: false
+do_train:
+  desc: null
+  value: false
+dtype:
+  desc: null
+  value: bfloat16
+eval_accumulation_steps:
+  desc: null
+  value: null
+eval_steps:
+  desc: null
+  value: 92768
+evaluation_strategy:
+  desc: null
+  value: IntervalStrategy.NO
+fp16:
+  desc: null
+  value: false
+fp16_backend:
+  desc: null
+  value: auto
+fp16_full_eval:
+  desc: null
+  value: false
+fp16_opt_level:
+  desc: null
+  value: O1
+gradient_accumulation_steps:
+  desc: null
+  value: 8
+greater_is_better:
+  desc: null
+  value: null
+group_by_length:
+  desc: null
+  value: false
+ignore_data_skip:
+  desc: null
+  value: false
+label_names:
+  desc: null
+  value: null
+label_smoothing_factor:
+  desc: null
+  value: 0.0
+learning_rate:
+  desc: null
+  value: 5.0e-05
+length_column_name:
+  desc: null
+  value: length
+line_by_line:
+  desc: null
+  value: false
+load_best_model_at_end:
+  desc: null
+  value: false
+local_rank:
+  desc: null
+  value: -1
+log_level:
+  desc: null
+  value: -1
+log_level_replica:
+  desc: null
+  value: -1
+log_on_each_node:
+  desc: null
+  value: true
+logging_dir:
+  desc: null
+  value: ./runs/Jul13_00-48-19_t1v-n-f5c06ea1-w-0
+logging_first_step:
+  desc: null
+  value: false
+logging_steps:
+  desc: null
+  value: 500
+logging_strategy:
+  desc: null
+  value: IntervalStrategy.STEPS
+lr_scheduler_type:
+  desc: null
+  value: SchedulerType.LINEAR
+max_grad_norm:
+  desc: null
+  value: 1.0
+max_seq_length:
+  desc: null
+  value: 4096
+max_steps:
+  desc: null
+  value: -1
+metric_for_best_model:
+  desc: null
+  value: null
+mlm_probability:
+  desc: null
+  value: 0.15
+model_name_or_path:
+  desc: null
+  value: null
+model_type:
+  desc: null
+  value: big_bird
+mp_parameters:
+  desc: null
+  value: ''
+no_cuda:
+  desc: null
+  value: false
+num_train_epochs:
+  desc: null
+  value: 5.0
+output_dir:
+  desc: null
+  value: ./
+overwrite_cache:
+  desc: null
+  value: false
+overwrite_output_dir:
+  desc: null
+  value: true
+pad_to_max_length:
+  desc: null
+  value: false
+past_index:
+  desc: null
+  value: -1
+per_device_eval_batch_size:
+  desc: null
+  value: 4
+per_device_train_batch_size:
+  desc: null
+  value: 4
+per_gpu_eval_batch_size:
+  desc: null
+  value: null
+per_gpu_train_batch_size:
+  desc: null
+  value: null
+prediction_loss_only:
+  desc: null
+  value: false
+preprocessing_num_workers:
+  desc: null
+  value: 64
+push_to_hub:
+  desc: null
+  value: true
+push_to_hub_model_id:
+  desc: null
+  value: ''
+push_to_hub_organization:
+  desc: null
+  value: null
+push_to_hub_token:
+  desc: null
+  value: null
+remove_unused_columns:
+  desc: null
+  value: true
+report_to:
+  desc: null
+  value:
+  - tensorboard
+  - wandb
+resume_from_checkpoint:
+  desc: null
+  value: null
+run_name:
+  desc: null
+  value: ./
+save_on_each_node:
+  desc: null
+  value: false
+save_steps:
+  desc: null
+  value: 20000
+save_strategy:
+  desc: null
+  value: IntervalStrategy.STEPS
+save_total_limit:
+  desc: null
+  value: 5
+seed:
+  desc: null
+  value: 42
+sharded_ddp:
+  desc: null
+  value: []
+skip_memory_metrics:
+  desc: null
+  value: true
+tokenizer_name:
+  desc: null
+  value: ./
+tpu_metrics_debug:
+  desc: null
+  value: false
+tpu_num_cores:
+  desc: null
+  value: null
+train_file:
+  desc: null
+  value: null
+train_ref_file:
+  desc: null
+  value: null
+use_fast_tokenizer:
+  desc: null
+  value: true
+use_legacy_prediction_loop:
+  desc: null
+  value: false
+validation_file:
+  desc: null
+  value: null
+validation_ref_file:
+  desc: null
+  value: null
+validation_split_percentage:
+  desc: null
+  value: 5
+warmup_ratio:
+  desc: null
+  value: 0.0
+warmup_steps:
+  desc: null
+  value: 5000
+weight_decay:
+  desc: null
+  value: 0.0095
wandb/run-20210713_004910-3mu9pog5/files/output.log
ADDED
@@ -0,0 +1,376 @@
+/home/dat/pino/lib/python3.8/site-packages/jax/_src/numpy/lax_numpy.py:3114: UserWarning: Explicitly requested dtype <class 'jax._src.numpy.lax_numpy.int64'> requested in zeros is not available, and will be truncated to dtype int32. To enable more dtypes, set the jax_enable_x64 configuration option or the JAX_ENABLE_X64 shell environment variable. See https://github.com/google/jax#current-gotchas for more.
+lax._check_user_dtype_supported(dtype, "zeros")
+/home/dat/pino/lib/python3.8/site-packages/jax/lib/xla_bridge.py:382: UserWarning: jax.host_count has been renamed to jax.process_count. This alias will eventually be removed; please update your code.
+warnings.warn(
+/home/dat/pino/lib/python3.8/site-packages/jax/lib/xla_bridge.py:369: UserWarning: jax.host_id has been renamed to jax.process_index. This alias will eventually be removed; please update your code.
+warnings.warn(
+Epoch ... (1/5): 0%| | 0/5 [00:00<?, ?it/s]
+Epoch ... (1/5): 0%| | 0/5 [02:22<?, ?it/s]
+Traceback (most recent call last):
+File "./run_mlm_flax.py", line 709, in <module>
+state, train_metric, dropout_rngs = p_train_step(state, model_inputs, dropout_rngs)
+File "/home/dat/pino/lib/python3.8/site-packages/jax/_src/traceback_util.py", line 183, in reraise_with_filtered_traceback
+return fun(*args, **kwargs)
+File "/home/dat/pino/lib/python3.8/site-packages/jax/_src/api.py", line 1647, in f_pmapped
+out = pxla.xla_pmap(
+File "/home/dat/pino/lib/python3.8/site-packages/jax/core.py", line 1620, in bind
+return call_bind(self, fun, *args, **params)
+File "/home/dat/pino/lib/python3.8/site-packages/jax/core.py", line 1551, in call_bind
+outs = primitive.process(top_trace, fun, tracers, params)
+File "/home/dat/pino/lib/python3.8/site-packages/jax/core.py", line 1623, in process
+return trace.process_map(self, fun, tracers, params)
+File "/home/dat/pino/lib/python3.8/site-packages/jax/core.py", line 606, in process_call
+return primitive.impl(f, *tracers, **params)
+File "/home/dat/pino/lib/python3.8/site-packages/jax/interpreters/pxla.py", line 624, in xla_pmap_impl
+compiled_fun, fingerprint = parallel_callable(fun, backend, axis_name, axis_size,
+File "/home/dat/pino/lib/python3.8/site-packages/jax/linear_util.py", line 262, in memoized_fun
+ans = call(fun, *args)
+File "/home/dat/pino/lib/python3.8/site-packages/jax/interpreters/pxla.py", line 899, in parallel_callable
+compiled = xla.backend_compile(backend, built, compile_options)
+File "/home/dat/pino/lib/python3.8/site-packages/jax/interpreters/xla.py", line 360, in backend_compile
+return backend.compile(built_c, compile_options=options)
+jax._src.traceback_util.UnfilteredStackTrace: RuntimeError: Resource exhausted: Ran out of memory in memory space hbm. Used 20.61G of 15.48G hbm. Exceeded hbm capacity by 5.13G.
+Total hbm usage >= 21.13G:
+reserved 530.00M
+program 20.61G
+arguments 0B
+Output size 0B; shares 0B with arguments.
+Program hbm requirement 20.61G:
+global 900.0K
+scoped 924.0K
+HLO temp 20.61G (63.0% utilization: Unpadded (12.43G) Padded (19.71G), 4.4% fragmentation (918.84M))
+Largest program allocations in hbm:
+1. Size: 1.54G
+Operator: op_type="dot_general" op_name="pmap(train_step)/dot_general[ dimension_numbers=(((2,), (0,)), ((), ()))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/pino/lib/python3.8/site-packages/flax/linen/linear.py" source_line=175
+Shape: bf16[4,4096,50358]{1,2,0:T(8,128)(2,1)}
+Unpadded size: 1.54G
+Extra memory due to padding: 64.0K (1.0x expansion)
+XLA label: %fusion.3615.remat4 = bf16[4,4096,50358]{1,2,0:T(8,128)(2,1)} fusion(bf16[50358,768]{1,0:T(8,128)(2,1)} %get-tuple-element.22628, f32[768]{0:T(1024)} %fusion.10158, f32[768]{0:T(1024)} %fusion.10159, f32[4,4096]{1,0:T(4,128)} %get-tuple-element.20129, f32[...
+Allocation type: HLO temp
+==========================
+2. Size: 360.00M
+Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
+Unpadded size: 180.00M
+Extra memory due to padding: 180.00M (2.0x expansion)
+XLA label: %fusion.2444.remat_uncompressed = bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)} copy(bf16[4,12,60,64,512]{4,3,2,1,0:T(8,128)(2,1)} %fusion.2444.remat_compressed)
+Allocation type: HLO temp
+==========================
+3. Size: 360.00M
+Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
+Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
+Unpadded size: 180.00M
+Extra memory due to padding: 180.00M (2.0x expansion)
+XLA label: %fusion.2454.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2804, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7916, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
+Allocation type: HLO temp
+==========================
+4. Size: 360.00M
+Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
+Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
+Unpadded size: 180.00M
+Extra memory due to padding: 180.00M (2.0x expansion)
+XLA label: %fusion.2453.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2803, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7915, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
+Allocation type: HLO temp
+==========================
+5. Size: 360.00M
+Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
+Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
+Unpadded size: 180.00M
+Extra memory due to padding: 180.00M (2.0x expansion)
+XLA label: %fusion.2452.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2802, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7914, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
+Allocation type: HLO temp
+==========================
+6. Size: 360.00M
+Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
+Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
+Unpadded size: 180.00M
+Extra memory due to padding: 180.00M (2.0x expansion)
+XLA label: %fusion.2451.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2801, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7913, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
+Allocation type: HLO temp
+==========================
+7. Size: 360.00M
+Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
+Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
+Unpadded size: 180.00M
+Extra memory due to padding: 180.00M (2.0x expansion)
+XLA label: %fusion.2445 = bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)} fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2795, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7907, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)} %get-tuple-element.20342, f32[4,12,60,64,192]{3,4,2,1...
+Allocation type: HLO temp
+==========================
+8. Size: 360.00M
+Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
+Unpadded size: 180.00M
+Extra memory due to padding: 180.00M (2.0x expansion)
+XLA label: %fusion.2443.remat_uncompressed = bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)} copy(bf16[4,12,60,64,512]{4,3,2,1,0:T(8,128)(2,1)} %fusion.2443.remat_compressed)
+Allocation type: HLO temp
+==========================
+9. Size: 360.00M
+Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
+Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
+Unpadded size: 180.00M
+Extra memory due to padding: 180.00M (2.0x expansion)
+XLA label: %fusion.2450.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2800, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7912, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
+Allocation type: HLO temp
+==========================
+10. Size: 360.00M
+Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
+Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
+Unpadded size: 180.00M
+Extra memory due to padding: 180.00M (2.0x expansion)
+XLA label: %fusion.2449.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2799, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7911, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
+Allocation type: HLO temp
+==========================
+11. Size: 360.00M
+Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
+Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
+Unpadded size: 180.00M
+Extra memory due to padding: 180.00M (2.0x expansion)
+XLA label: %fusion.2448.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2798, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7910, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
+Allocation type: HLO temp
+==========================
+12. Size: 360.00M
+Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
+Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
+Unpadded size: 180.00M
+Extra memory due to padding: 180.00M (2.0x expansion)
+XLA label: %fusion.2447.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2797, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7909, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
+Allocation type: HLO temp
+==========================
+13. Size: 360.00M
+Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
+Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
+Unpadded size: 180.00M
+Extra memory due to padding: 180.00M (2.0x expansion)
+XLA label: %fusion.2446.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2796, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7908, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
+Allocation type: HLO temp
+==========================
+14. Size: 270.00M
+Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=584
+Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
+Unpadded size: 135.00M
+Extra memory due to padding: 135.00M (2.0x expansion)
+XLA label: %fusion.2689.remat3 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} fusion(f32[4,60,64,192]{2,3,1,0:T(8,128)} %get-tuple-element.20556, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.14362, bf16[4,12,60,192,64]{3,2,4,1,0:T(8,128)(2,1)} %fusion.2964), kind=kO...
+Allocation type: HLO temp
+==========================
+15. Size: 270.00M
+Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=584
+Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
+Unpadded size: 135.00M
+Extra memory due to padding: 135.00M (2.0x expansion)
+XLA label: %fusion.2690.remat3 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} fusion(f32[4,60,64,192]{2,3,1,0:T(8,128)} %get-tuple-element.20556, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.14296, bf16[4,12,60,192,64]{3,2,4,1,0:T(8,128)(2,1)} %fusion.2962), kind=kO...
+Allocation type: HLO temp
+==========================
+16. Size: 270.00M
+Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=584
+Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
+Unpadded size: 135.00M
+Extra memory due to padding: 135.00M (2.0x expansion)
+XLA label: %fusion.2688.remat3 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} fusion(f32[4,60,64,192]{2,3,1,0:T(8,128)} %get-tuple-element.20556, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.14428, bf16[4,12,60,192,64]{3,2,4,1,0:T(8,128)(2,1)} %fusion.2966), kind=kO...
+Allocation type: HLO temp
+==========================
+17. Size: 270.00M
+Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=584
+Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
+Unpadded size: 135.00M
+Extra memory due to padding: 135.00M (2.0x expansion)
+XLA label: %fusion.2691.remat3 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} fusion(f32[4,60,64,192]{2,3,1,0:T(8,128)} %get-tuple-element.20556, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.14230, bf16[4,12,60,192,64]{3,2,4,1,0:T(8,128)(2,1)} %fusion.2960), kind=kO...
+Allocation type: HLO temp
+==========================
+18. Size: 270.00M
+Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=584
+Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
+Unpadded size: 135.00M
+Extra memory due to padding: 135.00M (2.0x expansion)
+XLA label: %fusion.2692.remat3 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} fusion(f32[4,60,64,192]{2,3,1,0:T(8,128)} %get-tuple-element.20556, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.14164, bf16[4,12,60,192,64]{3,2,4,1,0:T(8,128)(2,1)} %fusion.2958), kind=kO...
+Allocation type: HLO temp
+==========================
+19. Size: 270.00M
+Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=584
+Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
+Unpadded size: 135.00M
+Extra memory due to padding: 135.00M (2.0x expansion)
+XLA label: %fusion.2693.remat3 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} fusion(f32[4,60,64,192]{2,3,1,0:T(8,128)} %get-tuple-element.20556, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.14098, bf16[4,12,60,192,64]{3,2,4,1,0:T(8,128)(2,1)} %fusion.2956), kind=kO...
+Allocation type: HLO temp
+==========================
+20. Size: 270.00M
+Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
+Unpadded size: 135.00M
+Extra memory due to padding: 135.00M (2.0x expansion)
+XLA label: %fusion.2616.remat_uncompressed = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} copy(f32[4,12,60,64,192]{4,3,2,1,0:T(8,128)} %fusion.2616.remat_compressed)
+Allocation type: HLO temp
+==========================
+The stack trace below excludes JAX-internal frames.
+The preceding is the original exception that occurred, unmodified.
+--------------------
+The above exception was the direct cause of the following exception:
+Traceback (most recent call last):
+File "./run_mlm_flax.py", line 709, in <module>
+state, train_metric, dropout_rngs = p_train_step(state, model_inputs, dropout_rngs)
+File "/home/dat/pino/lib/python3.8/site-packages/jax/interpreters/xla.py", line 360, in backend_compile
+return backend.compile(built_c, compile_options=options)
+RuntimeError: Resource exhausted: Ran out of memory in memory space hbm. Used 20.61G of 15.48G hbm. Exceeded hbm capacity by 5.13G.
+Total hbm usage >= 21.13G:
+reserved 530.00M
+program 20.61G
+arguments 0B
+Output size 0B; shares 0B with arguments.
+Program hbm requirement 20.61G:
+global 900.0K
+scoped 924.0K
+HLO temp 20.61G (63.0% utilization: Unpadded (12.43G) Padded (19.71G), 4.4% fragmentation (918.84M))
+Largest program allocations in hbm:
+1. Size: 1.54G
+Operator: op_type="dot_general" op_name="pmap(train_step)/dot_general[ dimension_numbers=(((2,), (0,)), ((), ()))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/pino/lib/python3.8/site-packages/flax/linen/linear.py" source_line=175
+Shape: bf16[4,4096,50358]{1,2,0:T(8,128)(2,1)}
+Unpadded size: 1.54G
+Extra memory due to padding: 64.0K (1.0x expansion)
+XLA label: %fusion.3615.remat4 = bf16[4,4096,50358]{1,2,0:T(8,128)(2,1)} fusion(bf16[50358,768]{1,0:T(8,128)(2,1)} %get-tuple-element.22628, f32[768]{0:T(1024)} %fusion.10158, f32[768]{0:T(1024)} %fusion.10159, f32[4,4096]{1,0:T(4,128)} %get-tuple-element.20129, f32[...
+Allocation type: HLO temp
+==========================
+2. Size: 360.00M
+Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
+Unpadded size: 180.00M
+Extra memory due to padding: 180.00M (2.0x expansion)
+XLA label: %fusion.2444.remat_uncompressed = bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)} copy(bf16[4,12,60,64,512]{4,3,2,1,0:T(8,128)(2,1)} %fusion.2444.remat_compressed)
+Allocation type: HLO temp
+==========================
+3. Size: 360.00M
+Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
+Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
+Unpadded size: 180.00M
+Extra memory due to padding: 180.00M (2.0x expansion)
+XLA label: %fusion.2454.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2804, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7916, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
+Allocation type: HLO temp
+==========================
+4. Size: 360.00M
+Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
+Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
+Unpadded size: 180.00M
+Extra memory due to padding: 180.00M (2.0x expansion)
+XLA label: %fusion.2453.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2803, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7915, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
+Allocation type: HLO temp
+==========================
+5. Size: 360.00M
+Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
+Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
+Unpadded size: 180.00M
+Extra memory due to padding: 180.00M (2.0x expansion)
+XLA label: %fusion.2452.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2802, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7914, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
+Allocation type: HLO temp
+==========================
+6. Size: 360.00M
+Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
+Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
+Unpadded size: 180.00M
+Extra memory due to padding: 180.00M (2.0x expansion)
+XLA label: %fusion.2451.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2801, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7913, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
+Allocation type: HLO temp
+==========================
+7. Size: 360.00M
+Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
+Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
+Unpadded size: 180.00M
+Extra memory due to padding: 180.00M (2.0x expansion)
+XLA label: %fusion.2445 = bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)} fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2795, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7907, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)} %get-tuple-element.20342, f32[4,12,60,64,192]{3,4,2,1...
+Allocation type: HLO temp
+==========================
+8. Size: 360.00M
+Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
+Unpadded size: 180.00M
+Extra memory due to padding: 180.00M (2.0x expansion)
+XLA label: %fusion.2443.remat_uncompressed = bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)} copy(bf16[4,12,60,64,512]{4,3,2,1,0:T(8,128)(2,1)} %fusion.2443.remat_compressed)
+Allocation type: HLO temp
+==========================
+9. Size: 360.00M
+Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
+Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
+Unpadded size: 180.00M
+Extra memory due to padding: 180.00M (2.0x expansion)
+XLA label: %fusion.2450.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2800, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7912, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
+Allocation type: HLO temp
+==========================
+10. Size: 360.00M
+Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
+Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
+Unpadded size: 180.00M
+Extra memory due to padding: 180.00M (2.0x expansion)
+XLA label: %fusion.2449.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2799, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7911, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
+Allocation type: HLO temp
+==========================
+11. Size: 360.00M
+Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
+Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
+Unpadded size: 180.00M
+Extra memory due to padding: 180.00M (2.0x expansion)
+XLA label: %fusion.2448.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2798, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7910, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
+Allocation type: HLO temp
+==========================
+12. Size: 360.00M
+Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
+Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
+Unpadded size: 180.00M
+Extra memory due to padding: 180.00M (2.0x expansion)
+XLA label: %fusion.2447.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2797, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7909, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
+Allocation type: HLO temp
+==========================
+13. Size: 360.00M
+Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
+Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
+Unpadded size: 180.00M
+Extra memory due to padding: 180.00M (2.0x expansion)
+XLA label: %fusion.2446.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2796, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7908, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
+Allocation type: HLO temp
+==========================
+14. Size: 270.00M
+Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=584
+Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
+Unpadded size: 135.00M
+Extra memory due to padding: 135.00M (2.0x expansion)
+XLA label: %fusion.2689.remat3 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} fusion(f32[4,60,64,192]{2,3,1,0:T(8,128)} %get-tuple-element.20556, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.14362, bf16[4,12,60,192,64]{3,2,4,1,0:T(8,128)(2,1)} %fusion.2964), kind=kO...
+Allocation type: HLO temp
+==========================
+15. Size: 270.00M
+Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=584
+Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
+Unpadded size: 135.00M
+Extra memory due to padding: 135.00M (2.0x expansion)
+XLA label: %fusion.2690.remat3 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} fusion(f32[4,60,64,192]{2,3,1,0:T(8,128)} %get-tuple-element.20556, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.14296, bf16[4,12,60,192,64]{3,2,4,1,0:T(8,128)(2,1)} %fusion.2962), kind=kO...
+Allocation type: HLO temp
+==========================
+16. Size: 270.00M
+Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=584
+Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
+Unpadded size: 135.00M
+Extra memory due to padding: 135.00M (2.0x expansion)
+XLA label: %fusion.2688.remat3 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} fusion(f32[4,60,64,192]{2,3,1,0:T(8,128)} %get-tuple-element.20556, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.14428, bf16[4,12,60,192,64]{3,2,4,1,0:T(8,128)(2,1)} %fusion.2966), kind=kO...
+Allocation type: HLO temp
+==========================
+17. Size: 270.00M
+Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=584
+Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
+Unpadded size: 135.00M
+Extra memory due to padding: 135.00M (2.0x expansion)
+XLA label: %fusion.2691.remat3 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} fusion(f32[4,60,64,192]{2,3,1,0:T(8,128)} %get-tuple-element.20556, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.14230, bf16[4,12,60,192,64]{3,2,4,1,0:T(8,128)(2,1)} %fusion.2960), kind=kO...
+Allocation type: HLO temp
+==========================
+18. Size: 270.00M
+Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=584
+Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
+Unpadded size: 135.00M
+Extra memory due to padding: 135.00M (2.0x expansion)
+XLA label: %fusion.2692.remat3 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} fusion(f32[4,60,64,192]{2,3,1,0:T(8,128)} %get-tuple-element.20556, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.14164, bf16[4,12,60,192,64]{3,2,4,1,0:T(8,128)(2,1)} %fusion.2958), kind=kO...
+Allocation type: HLO temp
+==========================
+19. Size: 270.00M
+Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=584
+Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
+Unpadded size: 135.00M
+Extra memory due to padding: 135.00M (2.0x expansion)
+XLA label: %fusion.2693.remat3 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} fusion(f32[4,60,64,192]{2,3,1,0:T(8,128)} %get-tuple-element.20556, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.14098, bf16[4,12,60,192,64]{3,2,4,1,0:T(8,128)(2,1)} %fusion.2956), kind=kO...
+Allocation type: HLO temp
+==========================
+20. Size: 270.00M
+Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
+Unpadded size: 135.00M
+Extra memory due to padding: 135.00M (2.0x expansion)
+XLA label: %fusion.2616.remat_uncompressed = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} copy(f32[4,12,60,64,192]{4,3,2,1,0:T(8,128)} %fusion.2616.remat_compressed)
+Allocation type: HLO temp
+==========================
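The largest allocation in the dump above is easy to verify by hand: it is the bfloat16 logits tensor of shape [4, 4096, 50358] from the LM-head matmul (batch 4, sequence length 4096, vocabulary 50358). A quick illustrative check of the reported 1.54G:

# 4 * 4096 * 50358 bfloat16 elements at 2 bytes each
elements = 4 * 4096 * 50358
size_gib = elements * 2 / 2**30
print(round(size_gib, 2))  # 1.54, matching "Size: 1.54G" in the log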
wandb/run-20210713_004910-3mu9pog5/files/requirements.txt
ADDED
@@ -0,0 +1,92 @@
+absl-py==0.13.0
+aiohttp==3.7.4.post0
+astunparse==1.6.3
+async-timeout==3.0.1
+attrs==21.2.0
+cachetools==4.2.2
+certifi==2021.5.30
+chardet==4.0.0
+chex==0.0.8
+click==8.0.1
+configparser==5.0.2
+cycler==0.10.0
+datasets==1.9.1.dev0
+dill==0.3.4
+dm-tree==0.1.6
+docker-pycreds==0.4.0
+filelock==3.0.12
+flatbuffers==1.12
+flax==0.3.4
+fsspec==2021.6.1
+gast==0.4.0
+gitdb==4.0.7
+gitpython==3.1.18
+google-auth-oauthlib==0.4.4
+google-auth==1.32.1
+google-pasta==0.2.0
+grpcio==1.34.1
+h5py==3.1.0
+huggingface-hub==0.0.12
+idna==2.10
+jax==0.2.16
+jaxlib==0.1.68
+joblib==1.0.1
+keras-nightly==2.5.0.dev2021032900
+keras-preprocessing==1.1.2
+kiwisolver==1.3.1
+libtpu-nightly==0.1.dev20210615
+markdown==3.3.4
+matplotlib==3.4.2
+msgpack==1.0.2
+multidict==5.1.0
+multiprocess==0.70.12.2
+numpy==1.19.5
+oauthlib==3.1.1
+opt-einsum==3.3.0
+optax==0.0.9
+packaging==21.0
+pandas==1.3.0
+pathtools==0.1.2
+pillow==8.3.1
+pip==20.0.2
+pkg-resources==0.0.0
+promise==2.3
+protobuf==3.17.3
+psutil==5.8.0
+pyarrow==4.0.1
+pyasn1-modules==0.2.8
+pyasn1==0.4.8
+pyparsing==2.4.7
+python-dateutil==2.8.1
+pytz==2021.1
+pyyaml==5.4.1
+regex==2021.7.6
+requests-oauthlib==1.3.0
+requests==2.25.1
+rsa==4.7.2
+sacremoses==0.0.45
+scipy==1.7.0
+sentry-sdk==1.3.0
+setuptools==44.0.0
+shortuuid==1.0.1
+six==1.15.0
+smmap==4.0.0
+subprocess32==3.5.4
+tensorboard-data-server==0.6.1
+tensorboard-plugin-wit==1.8.0
+tensorboard==2.5.0
+tensorflow-estimator==2.5.0
+tensorflow==2.5.0
+termcolor==1.1.0
+tokenizers==0.10.3
+toolz==0.11.1
+tqdm==4.61.2
+transformers==4.9.0.dev0
+typing-extensions==3.7.4.3
+urllib3==1.26.6
+wandb==0.10.33
+werkzeug==2.0.1
+wheel==0.36.2
+wrapt==1.12.1
+xxhash==2.0.2
+yarl==1.6.3
wandb/run-20210713_004910-3mu9pog5/files/wandb-metadata.json
ADDED
@@ -0,0 +1,46 @@
+{
+    "os": "Linux-5.4.0-1043-gcp-x86_64-with-glibc2.29",
+    "python": "3.8.10",
+    "heartbeatAt": "2021-07-13T00:49:12.868844",
+    "startedAt": "2021-07-13T00:49:10.806043",
+    "docker": null,
+    "cpu_count": 96,
+    "cuda": null,
+    "args": [
+        "--push_to_hub",
+        "--output_dir=./",
+        "--model_type=big_bird",
+        "--config_name=./",
+        "--tokenizer_name=./",
+        "--max_seq_length=4096",
+        "--weight_decay=0.0095",
+        "--warmup_steps=5000",
+        "--overwrite_output_dir",
+        "--adam_beta1=0.9",
+        "--adam_beta2=0.98",
+        "--logging_steps=500",
+        "--eval_steps=92768",
+        "--num_train_epochs=5",
+        "--preprocessing_num_workers=64",
+        "--save_steps=20000",
+        "--adafactor",
+        "--learning_rate=5e-5",
+        "--per_device_train_batch_size=4",
+        "--per_device_eval_batch_size=4",
+        "--save_total_limit=5",
+        "--dtype=bfloat16",
+        "--gradient_accumulation_steps=8"
+    ],
+    "state": "running",
+    "program": "./run_mlm_flax.py",
+    "codePath": "run_mlm_flax.py",
+    "git": {
+        "remote": "https://huggingface.co/flax-community/pino-roberta-base",
+        "commit": "4229c91b780cf07115cc6d04c16e393b0d2f508c"
+    },
+    "email": null,
+    "root": "/home/dat/pino-roberta-base",
+    "host": "t1v-n-f5c06ea1-w-0",
+    "username": "dat",
+    "executable": "/home/dat/pino/bin/python"
+}
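
Everything needed to replay the launch is in this metadata: the interpreter, the entry point, and the full argument list. A minimal sketch that reassembles the original command from the JSON above (the relative path assumes the repo root recorded under "root"):

import json

# Rebuild the exact launch command recorded in wandb-metadata.json.
with open("wandb/run-20210713_004910-3mu9pog5/files/wandb-metadata.json") as f:
    meta = json.load(f)

command = [meta["executable"], meta["program"], *meta["args"]]
print(" \\\n  ".join(command))  # print shell-style, one flag per line
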
wandb/run-20210713_004910-3mu9pog5/files/wandb-summary.json
ADDED
@@ -0,0 +1 @@
+{}
wandb/run-20210713_004910-3mu9pog5/logs/debug-internal.log
ADDED
@@ -0,0 +1,166 @@
+2021-07-13 00:49:11,524 INFO MainThread:325318 [internal.py:wandb_internal():88] W&B internal server running at pid: 325318, started at: 2021-07-13 00:49:11.523864
+2021-07-13 00:49:11,526 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: check_version
+2021-07-13 00:49:11,526 INFO WriterThread:325318 [datastore.py:open_for_write():80] open: /home/dat/pino-roberta-base/wandb/run-20210713_004910-3mu9pog5/run-3mu9pog5.wandb
+2021-07-13 00:49:11,527 DEBUG SenderThread:325318 [sender.py:send():179] send: header
+2021-07-13 00:49:11,527 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: check_version
+2021-07-13 00:49:11,564 DEBUG SenderThread:325318 [sender.py:send():179] send: run
+2021-07-13 00:49:11,738 INFO SenderThread:325318 [dir_watcher.py:__init__():168] watching files in: /home/dat/pino-roberta-base/wandb/run-20210713_004910-3mu9pog5/files
+2021-07-13 00:49:11,739 INFO SenderThread:325318 [sender.py:_start_run_threads():716] run started: 3mu9pog5 with start time 1626137350
+2021-07-13 00:49:11,739 DEBUG SenderThread:325318 [sender.py:send():179] send: summary
+2021-07-13 00:49:11,739 INFO SenderThread:325318 [sender.py:_save_file():841] saving file wandb-summary.json with policy end
+2021-07-13 00:49:11,739 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: run_start
+2021-07-13 00:49:12,741 INFO Thread-8 :325318 [dir_watcher.py:_on_file_created():216] file/dir created: /home/dat/pino-roberta-base/wandb/run-20210713_004910-3mu9pog5/files/wandb-summary.json
+2021-07-13 00:49:12,868 DEBUG HandlerThread:325318 [meta.py:__init__():39] meta init
+2021-07-13 00:49:12,868 DEBUG HandlerThread:325318 [meta.py:__init__():53] meta init done
+2021-07-13 00:49:12,868 DEBUG HandlerThread:325318 [meta.py:probe():210] probe
+2021-07-13 00:49:12,870 DEBUG HandlerThread:325318 [meta.py:_setup_git():200] setup git
+2021-07-13 00:49:12,899 DEBUG HandlerThread:325318 [meta.py:_setup_git():207] setup git done
+2021-07-13 00:49:12,899 DEBUG HandlerThread:325318 [meta.py:_save_pip():57] save pip
+2021-07-13 00:49:12,899 DEBUG HandlerThread:325318 [meta.py:_save_pip():71] save pip done
+2021-07-13 00:49:12,899 DEBUG HandlerThread:325318 [meta.py:probe():252] probe done
+2021-07-13 00:49:12,903 DEBUG SenderThread:325318 [sender.py:send():179] send: files
+2021-07-13 00:49:12,903 INFO SenderThread:325318 [sender.py:_save_file():841] saving file wandb-metadata.json with policy now
+2021-07-13 00:49:12,910 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: stop_status
+2021-07-13 00:49:12,911 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: stop_status
+2021-07-13 00:49:13,042 DEBUG SenderThread:325318 [sender.py:send():179] send: config
+2021-07-13 00:49:13,043 DEBUG SenderThread:325318 [sender.py:send():179] send: config
+2021-07-13 00:49:13,043 DEBUG SenderThread:325318 [sender.py:send():179] send: config
+2021-07-13 00:49:13,348 INFO Thread-11 :325318 [upload_job.py:push():137] Uploaded file /tmp/tmpkvnk9e30wandb/65yetzns-wandb-metadata.json
+2021-07-13 00:49:13,741 INFO Thread-8 :325318 [dir_watcher.py:_on_file_created():216] file/dir created: /home/dat/pino-roberta-base/wandb/run-20210713_004910-3mu9pog5/files/output.log
+2021-07-13 00:49:13,741 INFO Thread-8 :325318 [dir_watcher.py:_on_file_created():216] file/dir created: /home/dat/pino-roberta-base/wandb/run-20210713_004910-3mu9pog5/files/requirements.txt
+2021-07-13 00:49:13,741 INFO Thread-8 :325318 [dir_watcher.py:_on_file_created():216] file/dir created: /home/dat/pino-roberta-base/wandb/run-20210713_004910-3mu9pog5/files/wandb-metadata.json
+2021-07-13 00:49:28,044 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: stop_status
+2021-07-13 00:49:28,044 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: stop_status
+2021-07-13 00:49:29,748 INFO Thread-8 :325318 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_004910-3mu9pog5/files/output.log
+2021-07-13 00:49:31,749 INFO Thread-8 :325318 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_004910-3mu9pog5/files/output.log
+2021-07-13 00:49:40,952 DEBUG SenderThread:325318 [sender.py:send():179] send: stats
+2021-07-13 00:49:42,754 INFO Thread-8 :325318 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_004910-3mu9pog5/files/config.yaml
+2021-07-13 00:49:43,176 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: stop_status
+2021-07-13 00:49:43,177 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: stop_status
+2021-07-13 00:49:58,307 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: stop_status
+2021-07-13 00:49:58,307 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: stop_status
+2021-07-13 00:50:11,029 DEBUG SenderThread:325318 [sender.py:send():179] send: stats
+2021-07-13 00:50:13,441 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: stop_status
+2021-07-13 00:50:13,442 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: stop_status
+2021-07-13 00:50:21,769 INFO Thread-8 :325318 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_004910-3mu9pog5/files/output.log
+2021-07-13 00:50:28,590 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: stop_status
+2021-07-13 00:50:28,590 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: stop_status
+2021-07-13 00:50:41,106 DEBUG SenderThread:325318 [sender.py:send():179] send: stats
+2021-07-13 00:50:43,758 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: stop_status
+2021-07-13 00:50:43,759 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: stop_status
+2021-07-13 00:50:58,908 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: stop_status
+2021-07-13 00:50:58,909 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: stop_status
+2021-07-13 00:51:11,187 DEBUG SenderThread:325318 [sender.py:send():179] send: stats
+2021-07-13 00:51:14,040 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: stop_status
+2021-07-13 00:51:14,041 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: stop_status
+2021-07-13 00:51:29,172 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: stop_status
+2021-07-13 00:51:29,173 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: stop_status
+2021-07-13 00:51:41,267 DEBUG SenderThread:325318 [sender.py:send():179] send: stats
+2021-07-13 00:51:44,303 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: stop_status
+2021-07-13 00:51:44,304 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: stop_status
+2021-07-13 00:51:53,809 INFO Thread-8 :325318 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_004910-3mu9pog5/files/output.log
+2021-07-13 00:51:54,323 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: poll_exit
+2021-07-13 00:51:54,323 DEBUG SenderThread:325318 [sender.py:send():179] send: telemetry
+2021-07-13 00:51:54,323 DEBUG SenderThread:325318 [sender.py:send():179] send: exit
+2021-07-13 00:51:54,323 INFO SenderThread:325318 [sender.py:send_exit():287] handling exit code: 1
+2021-07-13 00:51:54,323 INFO SenderThread:325318 [sender.py:send_exit():295] send defer
+2021-07-13 00:51:54,323 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: poll_exit
+2021-07-13 00:51:54,324 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: defer
+2021-07-13 00:51:54,324 INFO HandlerThread:325318 [handler.py:handle_request_defer():141] handle defer: 0
+2021-07-13 00:51:54,324 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: defer
+2021-07-13 00:51:54,324 INFO SenderThread:325318 [sender.py:send_request_defer():304] handle sender defer: 0
+2021-07-13 00:51:54,324 INFO SenderThread:325318 [sender.py:transition_state():308] send defer: 1
+2021-07-13 00:51:54,325 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: defer
+2021-07-13 00:51:54,325 INFO HandlerThread:325318 [handler.py:handle_request_defer():141] handle defer: 1
+2021-07-13 00:51:54,400 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: defer
+2021-07-13 00:51:54,400 INFO SenderThread:325318 [sender.py:send_request_defer():304] handle sender defer: 1
+2021-07-13 00:51:54,400 INFO SenderThread:325318 [sender.py:transition_state():308] send defer: 2
+2021-07-13 00:51:54,401 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: defer
+2021-07-13 00:51:54,401 INFO HandlerThread:325318 [handler.py:handle_request_defer():141] handle defer: 2
+2021-07-13 00:51:54,401 DEBUG SenderThread:325318 [sender.py:send():179] send: stats
+2021-07-13 00:51:54,401 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: defer
+2021-07-13 00:51:54,401 INFO SenderThread:325318 [sender.py:send_request_defer():304] handle sender defer: 2
+2021-07-13 00:51:54,401 INFO SenderThread:325318 [sender.py:transition_state():308] send defer: 3
+2021-07-13 00:51:54,402 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: defer
+2021-07-13 00:51:54,402 INFO HandlerThread:325318 [handler.py:handle_request_defer():141] handle defer: 3
+2021-07-13 00:51:54,402 DEBUG SenderThread:325318 [sender.py:send():179] send: summary
+2021-07-13 00:51:54,402 INFO SenderThread:325318 [sender.py:_save_file():841] saving file wandb-summary.json with policy end
+2021-07-13 00:51:54,403 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: defer
+2021-07-13 00:51:54,403 INFO SenderThread:325318 [sender.py:send_request_defer():304] handle sender defer: 3
+2021-07-13 00:51:54,403 INFO SenderThread:325318 [sender.py:transition_state():308] send defer: 4
+2021-07-13 00:51:54,403 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: defer
+2021-07-13 00:51:54,403 INFO HandlerThread:325318 [handler.py:handle_request_defer():141] handle defer: 4
+2021-07-13 00:51:54,403 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: defer
+2021-07-13 00:51:54,403 INFO SenderThread:325318 [sender.py:send_request_defer():304] handle sender defer: 4
+2021-07-13 00:51:54,426 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: poll_exit
+2021-07-13 00:51:54,590 INFO SenderThread:325318 [sender.py:transition_state():308] send defer: 5
+2021-07-13 00:51:54,590 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: poll_exit
+2021-07-13 00:51:54,591 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: defer
+2021-07-13 00:51:54,591 INFO HandlerThread:325318 [handler.py:handle_request_defer():141] handle defer: 5
+2021-07-13 00:51:54,591 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: defer
+2021-07-13 00:51:54,591 INFO SenderThread:325318 [sender.py:send_request_defer():304] handle sender defer: 5
+2021-07-13 00:51:54,591 INFO SenderThread:325318 [dir_watcher.py:finish():282] shutting down directory watcher
+2021-07-13 00:51:54,693 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: poll_exit
+2021-07-13 00:51:54,809 INFO Thread-8 :325318 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_004910-3mu9pog5/files/output.log
+2021-07-13 00:51:54,810 INFO SenderThread:325318 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_004910-3mu9pog5/files/config.yaml
+2021-07-13 00:51:54,810 INFO SenderThread:325318 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_004910-3mu9pog5/files/wandb-summary.json
+2021-07-13 00:51:54,810 INFO SenderThread:325318 [dir_watcher.py:finish():312] scan: /home/dat/pino-roberta-base/wandb/run-20210713_004910-3mu9pog5/files
+2021-07-13 00:51:54,810 INFO SenderThread:325318 [dir_watcher.py:finish():318] scan save: /home/dat/pino-roberta-base/wandb/run-20210713_004910-3mu9pog5/files/requirements.txt requirements.txt
+2021-07-13 00:51:54,810 INFO SenderThread:325318 [dir_watcher.py:finish():318] scan save: /home/dat/pino-roberta-base/wandb/run-20210713_004910-3mu9pog5/files/output.log output.log
+2021-07-13 00:51:54,811 INFO SenderThread:325318 [dir_watcher.py:finish():318] scan save: /home/dat/pino-roberta-base/wandb/run-20210713_004910-3mu9pog5/files/wandb-metadata.json wandb-metadata.json
+2021-07-13 00:51:54,811 INFO SenderThread:325318 [dir_watcher.py:finish():318] scan save: /home/dat/pino-roberta-base/wandb/run-20210713_004910-3mu9pog5/files/config.yaml config.yaml
+2021-07-13 00:51:54,811 INFO SenderThread:325318 [dir_watcher.py:finish():318] scan save: /home/dat/pino-roberta-base/wandb/run-20210713_004910-3mu9pog5/files/wandb-summary.json wandb-summary.json
+2021-07-13 00:51:54,811 INFO SenderThread:325318 [sender.py:transition_state():308] send defer: 6
+2021-07-13 00:51:54,811 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: poll_exit
+2021-07-13 00:51:54,812 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: defer
+2021-07-13 00:51:54,812 INFO HandlerThread:325318 [handler.py:handle_request_defer():141] handle defer: 6
+2021-07-13 00:51:54,812 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: defer
+2021-07-13 00:51:54,814 INFO SenderThread:325318 [sender.py:send_request_defer():304] handle sender defer: 6
+2021-07-13 00:51:54,814 INFO SenderThread:325318 [file_pusher.py:finish():177] shutting down file pusher
+2021-07-13 00:51:54,913 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: poll_exit
+2021-07-13 00:51:54,914 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: poll_exit
+2021-07-13 00:51:55,016 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: poll_exit
+2021-07-13 00:51:55,016 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: poll_exit
+2021-07-13 00:51:55,118 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: poll_exit
+2021-07-13 00:51:55,118 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: poll_exit
+2021-07-13 00:51:55,220 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: poll_exit
+2021-07-13 00:51:55,220 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: poll_exit
+2021-07-13 00:51:55,257 INFO Thread-14 :325318 [upload_job.py:push():137] Uploaded file /home/dat/pino-roberta-base/wandb/run-20210713_004910-3mu9pog5/files/config.yaml
+2021-07-13 00:51:55,266 INFO Thread-12 :325318 [upload_job.py:push():137] Uploaded file /home/dat/pino-roberta-base/wandb/run-20210713_004910-3mu9pog5/files/requirements.txt
+2021-07-13 00:51:55,277 INFO Thread-13 :325318 [upload_job.py:push():137] Uploaded file /home/dat/pino-roberta-base/wandb/run-20210713_004910-3mu9pog5/files/output.log
+2021-07-13 00:51:55,288 INFO Thread-15 :325318 [upload_job.py:push():137] Uploaded file /home/dat/pino-roberta-base/wandb/run-20210713_004910-3mu9pog5/files/wandb-summary.json
+2021-07-13 00:51:55,322 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: poll_exit
+2021-07-13 00:51:55,322 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: poll_exit
+2021-07-13 00:51:55,424 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: poll_exit
+2021-07-13 00:51:55,425 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: poll_exit
+2021-07-13 00:51:55,489 INFO Thread-7 :325318 [sender.py:transition_state():308] send defer: 7
+2021-07-13 00:51:55,489 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: defer
+2021-07-13 00:51:55,489 INFO HandlerThread:325318 [handler.py:handle_request_defer():141] handle defer: 7
+2021-07-13 00:51:55,489 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: defer
+2021-07-13 00:51:55,490 INFO SenderThread:325318 [sender.py:send_request_defer():304] handle sender defer: 7
+2021-07-13 00:51:55,526 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: poll_exit
+2021-07-13 00:51:55,771 INFO SenderThread:325318 [sender.py:transition_state():308] send defer: 8
+2021-07-13 00:51:55,772 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: poll_exit
+2021-07-13 00:51:55,772 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: defer
+2021-07-13 00:51:55,772 INFO HandlerThread:325318 [handler.py:handle_request_defer():141] handle defer: 8
+2021-07-13 00:51:55,772 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: defer
+2021-07-13 00:51:55,772 INFO SenderThread:325318 [sender.py:send_request_defer():304] handle sender defer: 8
+2021-07-13 00:51:55,773 INFO SenderThread:325318 [sender.py:transition_state():308] send defer: 9
+2021-07-13 00:51:55,773 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: defer
+2021-07-13 00:51:55,773 INFO HandlerThread:325318 [handler.py:handle_request_defer():141] handle defer: 9
+2021-07-13 00:51:55,773 DEBUG SenderThread:325318 [sender.py:send():179] send: final
+2021-07-13 00:51:55,773 DEBUG SenderThread:325318 [sender.py:send():179] send: footer
+2021-07-13 00:51:55,773 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: defer
+2021-07-13 00:51:55,773 INFO SenderThread:325318 [sender.py:send_request_defer():304] handle sender defer: 9
+2021-07-13 00:51:55,874 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: poll_exit
+2021-07-13 00:51:55,874 DEBUG SenderThread:325318 [sender.py:send_request():193] send_request: poll_exit
+2021-07-13 00:51:55,874 INFO SenderThread:325318 [file_pusher.py:join():182] waiting for file pusher
+2021-07-13 00:51:55,876 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: get_summary
+2021-07-13 00:51:55,877 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: sampled_history
+2021-07-13 00:51:55,877 DEBUG HandlerThread:325318 [handler.py:handle_request():124] handle_request: shutdown
+2021-07-13 00:51:55,877 INFO HandlerThread:325318 [handler.py:finish():638] shutting down handler
+2021-07-13 00:51:56,774 INFO WriterThread:325318 [datastore.py:close():288] close: /home/dat/pino-roberta-base/wandb/run-20210713_004910-3mu9pog5/run-3mu9pog5.wandb
+2021-07-13 00:51:56,875 INFO SenderThread:325318 [sender.py:finish():945] shutting down sender
+2021-07-13 00:51:56,875 INFO SenderThread:325318 [file_pusher.py:finish():177] shutting down file pusher
+2021-07-13 00:51:56,875 INFO SenderThread:325318 [file_pusher.py:join():182] waiting for file pusher
+2021-07-13 00:51:56,877 INFO MainThread:325318 [internal.py:handle_exit():78] Internal process exited
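
The trace above is the full lifecycle of the wandb internal process: init and metadata/pip capture, periodic stop_status heartbeats while training runs, then a staged "defer" shutdown and file sync after the script exits with code 1. A minimal sketch of user code that would produce the same shape of trace (the project name is illustrative, and finish(exit_code=...) is assumed to match the wandb 0.10.x API):

import wandb

# Start a run: triggers the init / meta probe / file-watcher lines above.
run = wandb.init(project="pino-roberta-base")  # illustrative project name
run.config.update({"per_device_train_batch_size": 4})

# ... training would happen here; the script above died in its first
# train step, so the run ended with a non-zero exit code.
run.finish(exit_code=1)  # mirrors "[sender.py:send_exit():287] handling exit code: 1"
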
wandb/run-20210713_004910-3mu9pog5/logs/debug.log
ADDED
@@ -0,0 +1,119 @@
+2021-07-13 00:49:10,807 INFO MainThread:323744 [wandb_setup.py:_flush():69] setting env: {}
+2021-07-13 00:49:10,807 INFO MainThread:323744 [wandb_setup.py:_flush():69] setting login settings: {}
+2021-07-13 00:49:10,807 INFO MainThread:323744 [wandb_init.py:_log_setup():337] Logging user logs to /home/dat/pino-roberta-base/wandb/run-20210713_004910-3mu9pog5/logs/debug.log
+2021-07-13 00:49:10,807 INFO MainThread:323744 [wandb_init.py:_log_setup():338] Logging internal logs to /home/dat/pino-roberta-base/wandb/run-20210713_004910-3mu9pog5/logs/debug-internal.log
+2021-07-13 00:49:10,808 INFO MainThread:323744 [wandb_init.py:init():370] calling init triggers
+2021-07-13 00:49:10,808 INFO MainThread:323744 [wandb_init.py:init():375] wandb.init called with sweep_config: {}
+config: {}
+2021-07-13 00:49:10,808 INFO MainThread:323744 [wandb_init.py:init():419] starting backend
+2021-07-13 00:49:10,808 INFO MainThread:323744 [backend.py:_multiprocessing_setup():70] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2021-07-13 00:49:10,863 INFO MainThread:323744 [backend.py:ensure_launched():135] starting backend process...
+2021-07-13 00:49:10,917 INFO MainThread:323744 [backend.py:ensure_launched():139] started backend process with pid: 325318
+2021-07-13 00:49:10,919 INFO MainThread:323744 [wandb_init.py:init():424] backend started and connected
+2021-07-13 00:49:10,923 INFO MainThread:323744 [wandb_init.py:init():472] updated telemetry
+2021-07-13 00:49:10,924 INFO MainThread:323744 [wandb_init.py:init():491] communicating current version
+2021-07-13 00:49:11,562 INFO MainThread:323744 [wandb_init.py:init():496] got version response
+2021-07-13 00:49:11,563 INFO MainThread:323744 [wandb_init.py:init():504] communicating run to backend with 30 second timeout
+2021-07-13 00:49:11,739 INFO MainThread:323744 [wandb_init.py:init():529] starting run threads in backend
+2021-07-13 00:49:12,907 INFO MainThread:323744 [wandb_run.py:_console_start():1623] atexit reg
+2021-07-13 00:49:12,907 INFO MainThread:323744 [wandb_run.py:_redirect():1497] redirect: SettingsConsole.REDIRECT
+2021-07-13 00:49:12,908 INFO MainThread:323744 [wandb_run.py:_redirect():1502] Redirecting console.
+2021-07-13 00:49:12,910 INFO MainThread:323744 [wandb_run.py:_redirect():1558] Redirects installed.
+2021-07-13 00:49:12,910 INFO MainThread:323744 [wandb_init.py:init():554] run started, returning control to user process
+2021-07-13 00:49:12,916 INFO MainThread:323744 [wandb_run.py:_config_callback():872] config_cb None None {'output_dir': './', 'overwrite_output_dir': True, 'do_train': False, 'do_eval': False, 'do_predict': False, 'evaluation_strategy': 'IntervalStrategy.NO', 'prediction_loss_only': False, 'per_device_train_batch_size': 4, 'per_device_eval_batch_size': 4, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 8, 'eval_accumulation_steps': None, 'learning_rate': 5e-05, 'weight_decay': 0.0095, 'adam_beta1': 0.9, 'adam_beta2': 0.98, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 5.0, 'max_steps': -1, 'lr_scheduler_type': 'SchedulerType.LINEAR', 'warmup_ratio': 0.0, 'warmup_steps': 5000, 'log_level': -1, 'log_level_replica': -1, 'log_on_each_node': True, 'logging_dir': './runs/Jul13_00-48-19_t1v-n-f5c06ea1-w-0', 'logging_strategy': 'IntervalStrategy.STEPS', 'logging_first_step': False, 'logging_steps': 500, 'save_strategy': 'IntervalStrategy.STEPS', 'save_steps': 20000, 'save_total_limit': 5, 'save_on_each_node': False, 'no_cuda': False, 'seed': 42, 'fp16': False, 'fp16_opt_level': 'O1', 'fp16_backend': 'auto', 'fp16_full_eval': False, 'local_rank': -1, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 92768, 'dataloader_num_workers': 0, 'past_index': -1, 'run_name': './', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'sharded_ddp': [], 'deepspeed': None, 'label_smoothing_factor': 0.0, 'adafactor': True, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['tensorboard', 'wandb'], 'ddp_find_unused_parameters': None, 'dataloader_pin_memory': True, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': True, 'resume_from_checkpoint': None, 'push_to_hub_model_id': '', 'push_to_hub_organization': None, 'push_to_hub_token': None, 'mp_parameters': ''}
+2021-07-13 00:49:12,917 INFO MainThread:323744 [wandb_run.py:_config_callback():872] config_cb None None {'model_name_or_path': None, 'model_type': 'big_bird', 'config_name': './', 'tokenizer_name': './', 'cache_dir': None, 'use_fast_tokenizer': True, 'dtype': 'bfloat16'}
+2021-07-13 00:49:12,919 INFO MainThread:323744 [wandb_run.py:_config_callback():872] config_cb None None {'dataset_name': None, 'dataset_config_name': None, 'train_file': None, 'validation_file': None, 'train_ref_file': None, 'validation_ref_file': None, 'overwrite_cache': False, 'validation_split_percentage': 5, 'max_seq_length': 4096, 'preprocessing_num_workers': 64, 'mlm_probability': 0.15, 'pad_to_max_length': False, 'line_by_line': False}
+2021-07-13 00:51:51,794 INFO MainThread:323744 [wandb_run.py:_atexit_cleanup():1593] got exitcode: 1
+2021-07-13 00:51:51,796 INFO MainThread:323744 [wandb_run.py:_restore():1565] restore
+2021-07-13 00:51:54,324 INFO MainThread:323744 [wandb_run.py:_wait_for_finish():1715] got exit ret: file_counts {
+  wandb_count: 1
+}
+pusher_stats {
+  uploaded_bytes: 1417
+  total_bytes: 1417
+}
+
+2021-07-13 00:51:54,591 INFO MainThread:323744 [wandb_run.py:_wait_for_finish():1715] got exit ret: file_counts {
+  wandb_count: 1
+}
+pusher_stats {
+  uploaded_bytes: 1417
+  total_bytes: 1417
+}
+
+2021-07-13 00:51:54,812 INFO MainThread:323744 [wandb_run.py:_wait_for_finish():1715] got exit ret: file_counts {
+  wandb_count: 4
+}
+pusher_stats {
+  uploaded_bytes: 1417
+  total_bytes: 40394
+}
+
+2021-07-13 00:51:54,915 INFO MainThread:323744 [wandb_run.py:_wait_for_finish():1715] got exit ret: file_counts {
+  wandb_count: 5
+}
+pusher_stats {
+  uploaded_bytes: 1417
+  total_bytes: 40396
+}
+
+2021-07-13 00:51:55,017 INFO MainThread:323744 [wandb_run.py:_wait_for_finish():1715] got exit ret: file_counts {
+  wandb_count: 5
+}
+pusher_stats {
+  uploaded_bytes: 40396
+  total_bytes: 40396
+}
+
+2021-07-13 00:51:55,119 INFO MainThread:323744 [wandb_run.py:_wait_for_finish():1715] got exit ret: file_counts {
+  wandb_count: 5
+}
+pusher_stats {
+  uploaded_bytes: 40396
+  total_bytes: 40396
+}
+
+2021-07-13 00:51:55,221 INFO MainThread:323744 [wandb_run.py:_wait_for_finish():1715] got exit ret: file_counts {
+  wandb_count: 5
+}
+pusher_stats {
+  uploaded_bytes: 40396
+  total_bytes: 40396
+}
+
+2021-07-13 00:51:55,323 INFO MainThread:323744 [wandb_run.py:_wait_for_finish():1715] got exit ret: file_counts {
+  wandb_count: 5
+}
+pusher_stats {
+  uploaded_bytes: 40396
+  total_bytes: 40396
+}
+
+2021-07-13 00:51:55,425 INFO MainThread:323744 [wandb_run.py:_wait_for_finish():1715] got exit ret: file_counts {
+  wandb_count: 5
+}
+pusher_stats {
+  uploaded_bytes: 40396
+  total_bytes: 40396
+}
+
+2021-07-13 00:51:55,772 INFO MainThread:323744 [wandb_run.py:_wait_for_finish():1715] got exit ret: file_counts {
+  wandb_count: 5
+}
+pusher_stats {
+  uploaded_bytes: 40396
+  total_bytes: 40396
+}
+
+2021-07-13 00:51:55,875 INFO MainThread:323744 [wandb_run.py:_wait_for_finish():1715] got exit ret: done: true
+exit_result {
+}
+file_counts {
+  wandb_count: 5
+}
+pusher_stats {
+  uploaded_bytes: 40396
+  total_bytes: 40396
+}
+
+2021-07-13 00:51:57,265 INFO MainThread:323744 [wandb_run.py:_show_files():1937] logging synced files
wandb/run-20210713_004910-3mu9pog5/run-3mu9pog5.wandb
ADDED
Binary file (37.4 kB)
wandb/run-20210713_005301-2ilkub1o/files/config.yaml
ADDED
@@ -0,0 +1,307 @@
+wandb_version: 1
+
+_wandb:
+  desc: null
+  value:
+    cli_version: 0.10.33
+    framework: huggingface
+    huggingface_version: 4.9.0.dev0
+    is_jupyter_run: false
+    is_kaggle_kernel: false
+    python_version: 3.8.10
+    t:
+      1:
+      - 3
+      - 11
+      2:
+      - 3
+      - 11
+      4: 3.8.10
+      5: 0.10.33
+      6: 4.9.0.dev0
+      8:
+      - 5
+adafactor:
+  desc: null
+  value: true
+adam_beta1:
+  desc: null
+  value: 0.9
+adam_beta2:
+  desc: null
+  value: 0.98
+adam_epsilon:
+  desc: null
+  value: 1.0e-08
+cache_dir:
+  desc: null
+  value: null
+config_name:
+  desc: null
+  value: ./
+dataloader_drop_last:
+  desc: null
+  value: false
+dataloader_num_workers:
+  desc: null
+  value: 0
+dataloader_pin_memory:
+  desc: null
+  value: true
+dataset_config_name:
+  desc: null
+  value: null
+dataset_name:
+  desc: null
+  value: null
+ddp_find_unused_parameters:
+  desc: null
+  value: null
+debug:
+  desc: null
+  value: []
+deepspeed:
+  desc: null
+  value: null
+disable_tqdm:
+  desc: null
+  value: false
+do_eval:
+  desc: null
+  value: false
+do_predict:
+  desc: null
+  value: false
+do_train:
+  desc: null
+  value: false
+dtype:
+  desc: null
+  value: bfloat16
+eval_accumulation_steps:
+  desc: null
+  value: null
+eval_steps:
+  desc: null
+  value: 92768
+evaluation_strategy:
+  desc: null
+  value: IntervalStrategy.NO
+fp16:
+  desc: null
+  value: false
+fp16_backend:
+  desc: null
+  value: auto
+fp16_full_eval:
+  desc: null
+  value: false
+fp16_opt_level:
+  desc: null
+  value: O1
+gradient_accumulation_steps:
+  desc: null
+  value: 8
+greater_is_better:
+  desc: null
+  value: null
+group_by_length:
+  desc: null
+  value: false
+ignore_data_skip:
+  desc: null
+  value: false
+label_names:
+  desc: null
+  value: null
+label_smoothing_factor:
+  desc: null
+  value: 0.0
+learning_rate:
+  desc: null
+  value: 5.0e-05
+length_column_name:
+  desc: null
+  value: length
+line_by_line:
+  desc: null
+  value: false
+load_best_model_at_end:
+  desc: null
+  value: false
+local_rank:
+  desc: null
+  value: -1
+log_level:
+  desc: null
+  value: -1
+log_level_replica:
+  desc: null
+  value: -1
+log_on_each_node:
+  desc: null
+  value: true
+logging_dir:
+  desc: null
+  value: ./runs/Jul13_00-52-13_t1v-n-f5c06ea1-w-0
+logging_first_step:
+  desc: null
+  value: false
+logging_steps:
+  desc: null
+  value: 500
+logging_strategy:
+  desc: null
+  value: IntervalStrategy.STEPS
+lr_scheduler_type:
+  desc: null
+  value: SchedulerType.LINEAR
+max_grad_norm:
+  desc: null
+  value: 1.0
+max_seq_length:
+  desc: null
+  value: 4096
+max_steps:
+  desc: null
+  value: -1
+metric_for_best_model:
+  desc: null
+  value: null
+mlm_probability:
+  desc: null
+  value: 0.15
+model_name_or_path:
+  desc: null
+  value: null
+model_type:
+  desc: null
+  value: big_bird
+mp_parameters:
+  desc: null
+  value: ''
+no_cuda:
+  desc: null
+  value: false
+num_train_epochs:
+  desc: null
+  value: 5.0
+output_dir:
+  desc: null
+  value: ./
+overwrite_cache:
+  desc: null
+  value: false
+overwrite_output_dir:
+  desc: null
+  value: true
+pad_to_max_length:
+  desc: null
+  value: false
+past_index:
+  desc: null
+  value: -1
+per_device_eval_batch_size:
+  desc: null
+  value: 4
+per_device_train_batch_size:
+  desc: null
+  value: 4
+per_gpu_eval_batch_size:
+  desc: null
+  value: null
+per_gpu_train_batch_size:
+  desc: null
+  value: null
+prediction_loss_only:
+  desc: null
+  value: false
+preprocessing_num_workers:
+  desc: null
+  value: 64
+push_to_hub:
+  desc: null
+  value: true
+push_to_hub_model_id:
+  desc: null
+  value: ''
+push_to_hub_organization:
+  desc: null
+  value: null
+push_to_hub_token:
+  desc: null
+  value: null
+remove_unused_columns:
+  desc: null
+  value: true
+report_to:
+  desc: null
+  value:
+  - tensorboard
+  - wandb
+resume_from_checkpoint:
+  desc: null
+  value: null
+run_name:
+  desc: null
+  value: ./
+save_on_each_node:
+  desc: null
+  value: false
+save_steps:
+  desc: null
+  value: 20000
+save_strategy:
+  desc: null
+  value: IntervalStrategy.STEPS
+save_total_limit:
+  desc: null
+  value: 5
+seed:
+  desc: null
+  value: 42
+sharded_ddp:
+  desc: null
+  value: []
+skip_memory_metrics:
+  desc: null
+  value: true
+tokenizer_name:
+  desc: null
+  value: ./
+tpu_metrics_debug:
+  desc: null
+  value: false
+tpu_num_cores:
+  desc: null
+  value: null
+train_file:
+  desc: null
+  value: null
+train_ref_file:
+  desc: null
+  value: null
+use_fast_tokenizer:
+  desc: null
+  value: true
+use_legacy_prediction_loop:
+  desc: null
+  value: false
+validation_file:
+  desc: null
+  value: null
+validation_ref_file:
+  desc: null
+  value: null
+validation_split_percentage:
+  desc: null
+  value: 5
+warmup_ratio:
+  desc: null
+  value: 0.0
+warmup_steps:
+  desc: null
+  value: 5000
+weight_decay:
+  desc: null
+  value: 0.0095
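
The output.log that follows records the TPU running out of HBM during the first train step under this config. Two numbers in it can be sanity-checked directly against the values above: the effective batch size, and the 1.54G top allocation, which is just the per-device bf16 MLM logits tensor. A minimal sketch of the arithmetic (the device count of 8 is an assumption for a single TPU v3-8 host; it is not stated in the logs):

# Sanity-check the numbers in the OOM report that follows.
per_device_batch = 4   # per_device_train_batch_size (config above)
grad_accum = 8         # gradient_accumulation_steps
devices = 8            # assumed: one TPU v3-8 host, 8 cores
seq_len = 4096         # max_seq_length
vocab = 50358          # from the bf16[4,4096,50358] logits shape below

effective_batch = per_device_batch * grad_accum * devices
logits_bytes = per_device_batch * seq_len * vocab * 2  # bf16 = 2 bytes/element
print(f"effective batch size: {effective_batch} sequences/update")   # 256
print(f"per-device logits:    {logits_bytes / 2**30:.2f} GiB")       # ~1.54 GiB
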
wandb/run-20210713_005301-2ilkub1o/files/output.log
ADDED
@@ -0,0 +1,376 @@
+/home/dat/pino/lib/python3.8/site-packages/jax/_src/numpy/lax_numpy.py:3114: UserWarning: Explicitly requested dtype <class 'jax._src.numpy.lax_numpy.int64'> requested in zeros is not available, and will be truncated to dtype int32. To enable more dtypes, set the jax_enable_x64 configuration option or the JAX_ENABLE_X64 shell environment variable. See https://github.com/google/jax#current-gotchas for more.
+  lax._check_user_dtype_supported(dtype, "zeros")
+/home/dat/pino/lib/python3.8/site-packages/jax/lib/xla_bridge.py:382: UserWarning: jax.host_count has been renamed to jax.process_count. This alias will eventually be removed; please update your code.
+  warnings.warn(
+/home/dat/pino/lib/python3.8/site-packages/jax/lib/xla_bridge.py:369: UserWarning: jax.host_id has been renamed to jax.process_index. This alias will eventually be removed; please update your code.
+  warnings.warn(
+Epoch ... (1/5): 0%| | 0/5 [00:00<?, ?it/s]
+Epoch ... (1/5): 0%| | 0/5 [02:23<?, ?it/s]
+Traceback (most recent call last):
+  File "./run_mlm_flax.py", line 709, in <module>
+    state, train_metric, dropout_rngs = p_train_step(state, model_inputs, dropout_rngs)
+  File "/home/dat/pino/lib/python3.8/site-packages/jax/_src/traceback_util.py", line 183, in reraise_with_filtered_traceback
+    return fun(*args, **kwargs)
+  File "/home/dat/pino/lib/python3.8/site-packages/jax/_src/api.py", line 1647, in f_pmapped
+    out = pxla.xla_pmap(
+  File "/home/dat/pino/lib/python3.8/site-packages/jax/core.py", line 1620, in bind
+    return call_bind(self, fun, *args, **params)
+  File "/home/dat/pino/lib/python3.8/site-packages/jax/core.py", line 1551, in call_bind
+    outs = primitive.process(top_trace, fun, tracers, params)
+  File "/home/dat/pino/lib/python3.8/site-packages/jax/core.py", line 1623, in process
+    return trace.process_map(self, fun, tracers, params)
+  File "/home/dat/pino/lib/python3.8/site-packages/jax/core.py", line 606, in process_call
+    return primitive.impl(f, *tracers, **params)
+  File "/home/dat/pino/lib/python3.8/site-packages/jax/interpreters/pxla.py", line 624, in xla_pmap_impl
+    compiled_fun, fingerprint = parallel_callable(fun, backend, axis_name, axis_size,
+  File "/home/dat/pino/lib/python3.8/site-packages/jax/linear_util.py", line 262, in memoized_fun
+    ans = call(fun, *args)
+  File "/home/dat/pino/lib/python3.8/site-packages/jax/interpreters/pxla.py", line 899, in parallel_callable
+    compiled = xla.backend_compile(backend, built, compile_options)
+  File "/home/dat/pino/lib/python3.8/site-packages/jax/interpreters/xla.py", line 360, in backend_compile
+    return backend.compile(built_c, compile_options=options)
+jax._src.traceback_util.UnfilteredStackTrace: RuntimeError: Resource exhausted: Ran out of memory in memory space hbm. Used 20.61G of 15.48G hbm. Exceeded hbm capacity by 5.13G.
+Total hbm usage >= 21.13G:
+    reserved 530.00M
+    program 20.61G
+    arguments 0B
+Output size 0B; shares 0B with arguments.
+Program hbm requirement 20.61G:
+    global 900.0K
+    scoped 924.0K
+    HLO temp 20.61G (63.0% utilization: Unpadded (12.43G) Padded (19.71G), 4.4% fragmentation (918.84M))
+Largest program allocations in hbm:
+1. Size: 1.54G
+   Operator: op_type="dot_general" op_name="pmap(train_step)/dot_general[ dimension_numbers=(((2,), (0,)), ((), ()))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/pino/lib/python3.8/site-packages/flax/linen/linear.py" source_line=175
+   Shape: bf16[4,4096,50358]{1,2,0:T(8,128)(2,1)}
+   Unpadded size: 1.54G
+   Extra memory due to padding: 64.0K (1.0x expansion)
+   XLA label: %fusion.3615.remat4 = bf16[4,4096,50358]{1,2,0:T(8,128)(2,1)} fusion(bf16[50358,768]{1,0:T(8,128)(2,1)} %get-tuple-element.22628, f32[768]{0:T(1024)} %fusion.10158, f32[768]{0:T(1024)} %fusion.10159, f32[4,4096]{1,0:T(4,128)} %get-tuple-element.20129, f32[...
+   Allocation type: HLO temp
+   ==========================
+2. Size: 360.00M
+   Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
+   Unpadded size: 180.00M
+   Extra memory due to padding: 180.00M (2.0x expansion)
+   XLA label: %fusion.2444.remat_uncompressed = bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)} copy(bf16[4,12,60,64,512]{4,3,2,1,0:T(8,128)(2,1)} %fusion.2444.remat_compressed)
+   Allocation type: HLO temp
+   ==========================
+3. Size: 360.00M
+   Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
+   Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
+   Unpadded size: 180.00M
+   Extra memory due to padding: 180.00M (2.0x expansion)
+   XLA label: %fusion.2454.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2804, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7916, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
+   Allocation type: HLO temp
+   ==========================
+4. Size: 360.00M
+   Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
+   Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
+   Unpadded size: 180.00M
+   Extra memory due to padding: 180.00M (2.0x expansion)
+   XLA label: %fusion.2453.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2803, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7915, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
+   Allocation type: HLO temp
+   ==========================
+5. Size: 360.00M
+   Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
+   Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
+   Unpadded size: 180.00M
+   Extra memory due to padding: 180.00M (2.0x expansion)
+   XLA label: %fusion.2452.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2802, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7914, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
+   Allocation type: HLO temp
+   ==========================
+6. Size: 360.00M
+   Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
+   Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
+   Unpadded size: 180.00M
+   Extra memory due to padding: 180.00M (2.0x expansion)
+   XLA label: %fusion.2451.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2801, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7913, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
+   Allocation type: HLO temp
+   ==========================
+7. Size: 360.00M
+   Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
+   Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
+   Unpadded size: 180.00M
+   Extra memory due to padding: 180.00M (2.0x expansion)
+   XLA label: %fusion.2445 = bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)} fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2795, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7907, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)} %get-tuple-element.20342, f32[4,12,60,64,192]{3,4,2,1...
+   Allocation type: HLO temp
+   ==========================
+8. Size: 360.00M
+   Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
+   Unpadded size: 180.00M
+   Extra memory due to padding: 180.00M (2.0x expansion)
+   XLA label: %fusion.2443.remat_uncompressed = bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)} copy(bf16[4,12,60,64,512]{4,3,2,1,0:T(8,128)(2,1)} %fusion.2443.remat_compressed)
+   Allocation type: HLO temp
+   ==========================
+9. Size: 360.00M
+   Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
+   Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
+   Unpadded size: 180.00M
+   Extra memory due to padding: 180.00M (2.0x expansion)
+   XLA label: %fusion.2450.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2800, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7912, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
+   Allocation type: HLO temp
+   ==========================
+10. Size: 360.00M
+   Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
+   Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
+   Unpadded size: 180.00M
+   Extra memory due to padding: 180.00M (2.0x expansion)
+   XLA label: %fusion.2449.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2799, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7911, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
+   Allocation type: HLO temp
+   ==========================
+11. Size: 360.00M
+   Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
+   Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
+   Unpadded size: 180.00M
+   Extra memory due to padding: 180.00M (2.0x expansion)
+   XLA label: %fusion.2448.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2798, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7910, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
+   Allocation type: HLO temp
+   ==========================
+12. Size: 360.00M
+   Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
+   Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
+   Unpadded size: 180.00M
+   Extra memory due to padding: 180.00M (2.0x expansion)
+   XLA label: %fusion.2447.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2797, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7909, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
+   Allocation type: HLO temp
+   ==========================
+13. Size: 360.00M
+   Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
+   Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
+   Unpadded size: 180.00M
+   Extra memory due to padding: 180.00M (2.0x expansion)
+   XLA label: %fusion.2446.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2796, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7908, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
+   Allocation type: HLO temp
+   ==========================
+14. Size: 270.00M
+   Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=584
+   Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
+   Unpadded size: 135.00M
+   Extra memory due to padding: 135.00M (2.0x expansion)
+   XLA label: %fusion.2689.remat3 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} fusion(f32[4,60,64,192]{2,3,1,0:T(8,128)} %get-tuple-element.20556, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.14362, bf16[4,12,60,192,64]{3,2,4,1,0:T(8,128)(2,1)} %fusion.2964), kind=kO...
+   Allocation type: HLO temp
+   ==========================
+15. Size: 270.00M
+   Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=584
+   Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
+   Unpadded size: 135.00M
+   Extra memory due to padding: 135.00M (2.0x expansion)
+   XLA label: %fusion.2690.remat3 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} fusion(f32[4,60,64,192]{2,3,1,0:T(8,128)} %get-tuple-element.20556, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.14296, bf16[4,12,60,192,64]{3,2,4,1,0:T(8,128)(2,1)} %fusion.2962), kind=kO...
+   Allocation type: HLO temp
+   ==========================
+16. Size: 270.00M
+   Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=584
+   Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
+   Unpadded size: 135.00M
+   Extra memory due to padding: 135.00M (2.0x expansion)
+   XLA label: %fusion.2688.remat3 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} fusion(f32[4,60,64,192]{2,3,1,0:T(8,128)} %get-tuple-element.20556, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.14428, bf16[4,12,60,192,64]{3,2,4,1,0:T(8,128)(2,1)} %fusion.2966), kind=kO...
+   Allocation type: HLO temp
+   ==========================
+17. Size: 270.00M
+   Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=584
+   Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
+   Unpadded size: 135.00M
+   Extra memory due to padding: 135.00M (2.0x expansion)
+   XLA label: %fusion.2691.remat3 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} fusion(f32[4,60,64,192]{2,3,1,0:T(8,128)} %get-tuple-element.20556, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.14230, bf16[4,12,60,192,64]{3,2,4,1,0:T(8,128)(2,1)} %fusion.2960), kind=kO...
+   Allocation type: HLO temp
+   ==========================
+18. Size: 270.00M
+   Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=584
+   Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
+   Unpadded size: 135.00M
+   Extra memory due to padding: 135.00M (2.0x expansion)
+   XLA label: %fusion.2692.remat3 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} fusion(f32[4,60,64,192]{2,3,1,0:T(8,128)} %get-tuple-element.20556, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.14164, bf16[4,12,60,192,64]{3,2,4,1,0:T(8,128)(2,1)} %fusion.2958), kind=kO...
+   Allocation type: HLO temp
+   ==========================
+19. Size: 270.00M
+   Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=584
+   Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
+   Unpadded size: 135.00M
+   Extra memory due to padding: 135.00M (2.0x expansion)
+   XLA label: %fusion.2693.remat3 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} fusion(f32[4,60,64,192]{2,3,1,0:T(8,128)} %get-tuple-element.20556, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.14098, bf16[4,12,60,192,64]{3,2,4,1,0:T(8,128)(2,1)} %fusion.2956), kind=kO...
+   Allocation type: HLO temp
+   ==========================
+20. Size: 270.00M
+   Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
+   Unpadded size: 135.00M
+   Extra memory due to padding: 135.00M (2.0x expansion)
+   XLA label: %fusion.2616.remat_uncompressed = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} copy(f32[4,12,60,64,192]{4,3,2,1,0:T(8,128)} %fusion.2616.remat_compressed)
+   Allocation type: HLO temp
+   ==========================
+The stack trace below excludes JAX-internal frames.
+The preceding is the original exception that occurred, unmodified.
+--------------------
+The above exception was the direct cause of the following exception:
+Traceback (most recent call last):
+  File "./run_mlm_flax.py", line 709, in <module>
+    state, train_metric, dropout_rngs = p_train_step(state, model_inputs, dropout_rngs)
+  File "/home/dat/pino/lib/python3.8/site-packages/jax/interpreters/xla.py", line 360, in backend_compile
+    return backend.compile(built_c, compile_options=options)
|
209 |
+
RuntimeError: Resource exhausted: Ran out of memory in memory space hbm. Used 20.61G of 15.48G hbm. Exceeded hbm capacity by 5.13G.
|
210 |
+
Total hbm usage >= 21.13G:
|
211 |
+
reserved 530.00M
|
212 |
+
program 20.61G
|
213 |
+
arguments 0B
|
214 |
+
Output size 0B; shares 0B with arguments.
|
215 |
+
Program hbm requirement 20.61G:
|
216 |
+
global 900.0K
|
217 |
+
scoped 924.0K
|
218 |
+
HLO temp 20.61G (63.0% utilization: Unpadded (12.43G) Padded (19.71G), 4.4% fragmentation (918.84M))
|
219 |
+
Largest program allocations in hbm:
|
220 |
+
1. Size: 1.54G
|
221 |
+
Operator: op_type="dot_general" op_name="pmap(train_step)/dot_general[ dimension_numbers=(((2,), (0,)), ((), ()))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/pino/lib/python3.8/site-packages/flax/linen/linear.py" source_line=175
|
222 |
+
Shape: bf16[4,4096,50358]{1,2,0:T(8,128)(2,1)}
|
223 |
+
Unpadded size: 1.54G
|
224 |
+
Extra memory due to padding: 64.0K (1.0x expansion)
|
225 |
+
XLA label: %fusion.3615.remat4 = bf16[4,4096,50358]{1,2,0:T(8,128)(2,1)} fusion(bf16[50358,768]{1,0:T(8,128)(2,1)} %get-tuple-element.22628, f32[768]{0:T(1024)} %fusion.10158, f32[768]{0:T(1024)} %fusion.10159, f32[4,4096]{1,0:T(4,128)} %get-tuple-element.20129, f32[...
|
226 |
+
Allocation type: HLO temp
|
227 |
+
==========================
|
228 |
+
2. Size: 360.00M
|
229 |
+
Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
|
230 |
+
Unpadded size: 180.00M
|
231 |
+
Extra memory due to padding: 180.00M (2.0x expansion)
|
232 |
+
XLA label: %fusion.2444.remat_uncompressed = bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)} copy(bf16[4,12,60,64,512]{4,3,2,1,0:T(8,128)(2,1)} %fusion.2444.remat_compressed)
|
233 |
+
Allocation type: HLO temp
|
234 |
+
==========================
|
235 |
+
3. Size: 360.00M
|
236 |
+
Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
|
237 |
+
Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
|
238 |
+
Unpadded size: 180.00M
|
239 |
+
Extra memory due to padding: 180.00M (2.0x expansion)
|
240 |
+
XLA label: %fusion.2454.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2804, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7916, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
|
241 |
+
Allocation type: HLO temp
|
242 |
+
==========================
|
243 |
+
4. Size: 360.00M
|
244 |
+
Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
|
245 |
+
Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
|
246 |
+
Unpadded size: 180.00M
|
247 |
+
Extra memory due to padding: 180.00M (2.0x expansion)
|
248 |
+
XLA label: %fusion.2453.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2803, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7915, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
|
249 |
+
Allocation type: HLO temp
|
250 |
+
==========================
|
251 |
+
5. Size: 360.00M
|
252 |
+
Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
|
253 |
+
Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
|
254 |
+
Unpadded size: 180.00M
|
255 |
+
Extra memory due to padding: 180.00M (2.0x expansion)
|
256 |
+
XLA label: %fusion.2452.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2802, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7914, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
|
257 |
+
Allocation type: HLO temp
|
258 |
+
==========================
|
259 |
+
6. Size: 360.00M
|
260 |
+
Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
|
261 |
+
Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
|
262 |
+
Unpadded size: 180.00M
|
263 |
+
Extra memory due to padding: 180.00M (2.0x expansion)
|
264 |
+
XLA label: %fusion.2451.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2801, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7913, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
|
265 |
+
Allocation type: HLO temp
|
266 |
+
==========================
|
267 |
+
7. Size: 360.00M
|
268 |
+
Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
|
269 |
+
Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
|
270 |
+
Unpadded size: 180.00M
|
271 |
+
Extra memory due to padding: 180.00M (2.0x expansion)
|
272 |
+
XLA label: %fusion.2445 = bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)} fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2795, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7907, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)} %get-tuple-element.20342, f32[4,12,60,64,192]{3,4,2,1...
|
273 |
+
Allocation type: HLO temp
|
274 |
+
==========================
|
275 |
+
8. Size: 360.00M
|
276 |
+
Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
|
277 |
+
Unpadded size: 180.00M
|
278 |
+
Extra memory due to padding: 180.00M (2.0x expansion)
|
279 |
+
XLA label: %fusion.2443.remat_uncompressed = bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)} copy(bf16[4,12,60,64,512]{4,3,2,1,0:T(8,128)(2,1)} %fusion.2443.remat_compressed)
|
280 |
+
Allocation type: HLO temp
|
281 |
+
==========================
|
282 |
+
9. Size: 360.00M
|
283 |
+
Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
|
284 |
+
Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
|
285 |
+
Unpadded size: 180.00M
|
286 |
+
Extra memory due to padding: 180.00M (2.0x expansion)
|
287 |
+
XLA label: %fusion.2450.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2800, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7912, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
|
288 |
+
Allocation type: HLO temp
|
289 |
+
==========================
|
290 |
+
10. Size: 360.00M
|
291 |
+
Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
|
292 |
+
Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
|
293 |
+
Unpadded size: 180.00M
|
294 |
+
Extra memory due to padding: 180.00M (2.0x expansion)
|
295 |
+
XLA label: %fusion.2449.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2799, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7911, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
|
296 |
+
Allocation type: HLO temp
|
297 |
+
==========================
|
298 |
+
11. Size: 360.00M
|
299 |
+
Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
|
300 |
+
Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
|
301 |
+
Unpadded size: 180.00M
|
302 |
+
Extra memory due to padding: 180.00M (2.0x expansion)
|
303 |
+
XLA label: %fusion.2448.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2798, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7910, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
|
304 |
+
Allocation type: HLO temp
|
305 |
+
==========================
|
306 |
+
12. Size: 360.00M
|
307 |
+
Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
|
308 |
+
Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
|
309 |
+
Unpadded size: 180.00M
|
310 |
+
Extra memory due to padding: 180.00M (2.0x expansion)
|
311 |
+
XLA label: %fusion.2447.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2797, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7909, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
|
312 |
+
Allocation type: HLO temp
|
313 |
+
==========================
|
314 |
+
13. Size: 360.00M
|
315 |
+
Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
|
316 |
+
Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
|
317 |
+
Unpadded size: 180.00M
|
318 |
+
Extra memory due to padding: 180.00M (2.0x expansion)
|
319 |
+
XLA label: %fusion.2446.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.2796, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.7908, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)...
|
320 |
+
Allocation type: HLO temp
|
321 |
+
==========================
|
322 |
+
14. Size: 270.00M
|
323 |
+
Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=584
|
324 |
+
Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
|
325 |
+
Unpadded size: 135.00M
|
326 |
+
Extra memory due to padding: 135.00M (2.0x expansion)
|
327 |
+
XLA label: %fusion.2689.remat3 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} fusion(f32[4,60,64,192]{2,3,1,0:T(8,128)} %get-tuple-element.20556, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.14362, bf16[4,12,60,192,64]{3,2,4,1,0:T(8,128)(2,1)} %fusion.2964), kind=kO...
|
328 |
+
Allocation type: HLO temp
|
329 |
+
==========================
|
330 |
+
15. Size: 270.00M
|
331 |
+
Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=584
|
332 |
+
Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
|
333 |
+
Unpadded size: 135.00M
|
334 |
+
Extra memory due to padding: 135.00M (2.0x expansion)
|
335 |
+
XLA label: %fusion.2690.remat3 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} fusion(f32[4,60,64,192]{2,3,1,0:T(8,128)} %get-tuple-element.20556, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.14296, bf16[4,12,60,192,64]{3,2,4,1,0:T(8,128)(2,1)} %fusion.2962), kind=kO...
|
336 |
+
Allocation type: HLO temp
|
337 |
+
==========================
|
338 |
+
16. Size: 270.00M
|
339 |
+
Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=584
|
340 |
+
Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
|
341 |
+
Unpadded size: 135.00M
|
342 |
+
Extra memory due to padding: 135.00M (2.0x expansion)
|
343 |
+
XLA label: %fusion.2688.remat3 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} fusion(f32[4,60,64,192]{2,3,1,0:T(8,128)} %get-tuple-element.20556, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.14428, bf16[4,12,60,192,64]{3,2,4,1,0:T(8,128)(2,1)} %fusion.2966), kind=kO...
|
344 |
+
Allocation type: HLO temp
|
345 |
+
==========================
|
346 |
+
17. Size: 270.00M
|
347 |
+
Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=584
|
348 |
+
Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
|
349 |
+
Unpadded size: 135.00M
|
350 |
+
Extra memory due to padding: 135.00M (2.0x expansion)
|
351 |
+
XLA label: %fusion.2691.remat3 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} fusion(f32[4,60,64,192]{2,3,1,0:T(8,128)} %get-tuple-element.20556, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.14230, bf16[4,12,60,192,64]{3,2,4,1,0:T(8,128)(2,1)} %fusion.2960), kind=kO...
|
352 |
+
Allocation type: HLO temp
|
353 |
+
==========================
|
354 |
+
18. Size: 270.00M
|
355 |
+
Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=584
|
356 |
+
Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
|
357 |
+
Unpadded size: 135.00M
|
358 |
+
Extra memory due to padding: 135.00M (2.0x expansion)
|
359 |
+
XLA label: %fusion.2692.remat3 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} fusion(f32[4,60,64,192]{2,3,1,0:T(8,128)} %get-tuple-element.20556, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.14164, bf16[4,12,60,192,64]{3,2,4,1,0:T(8,128)(2,1)} %fusion.2958), kind=kO...
|
360 |
+
Allocation type: HLO temp
|
361 |
+
==========================
|
362 |
+
19. Size: 270.00M
|
363 |
+
Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=584
|
364 |
+
Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
|
365 |
+
Unpadded size: 135.00M
|
366 |
+
Extra memory due to padding: 135.00M (2.0x expansion)
|
367 |
+
XLA label: %fusion.2693.remat3 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} fusion(f32[4,60,64,192]{2,3,1,0:T(8,128)} %get-tuple-element.20556, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.14098, bf16[4,12,60,192,64]{3,2,4,1,0:T(8,128)(2,1)} %fusion.2956), kind=kO...
|
368 |
+
Allocation type: HLO temp
|
369 |
+
==========================
|
370 |
+
20. Size: 270.00M
|
371 |
+
Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
|
372 |
+
Unpadded size: 135.00M
|
373 |
+
Extra memory due to padding: 135.00M (2.0x expansion)
|
374 |
+
XLA label: %fusion.2616.remat_uncompressed = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} copy(f32[4,12,60,64,192]{4,3,2,1,0:T(8,128)} %fusion.2616.remat_compressed)
|
375 |
+
Allocation type: HLO temp
|
376 |
+
==========================
|
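The allocation report above is internally consistent: each "Unpadded size" is just the element count of the logged shape times the element width (2 bytes for bf16, 4 bytes for f32), and the "2.0x expansion" entries match padding the minor 64-element dimension up to the 128-wide lane of the T(8,128) TPU tile. A minimal sketch to check the arithmetic (the helper name is ours, not part of the run):

import math

def unpadded_bytes(shape, bytes_per_elem):
    # Element count of the logged XLA shape times the element width.
    return math.prod(shape) * bytes_per_elem

# Shapes copied verbatim from the allocation report above.
print(unpadded_bytes((4, 4096, 50358), 2) / 2**30)      # ~1.54 GiB, bf16 logits buffer
print(unpadded_bytes((4, 12, 60, 64, 512), 2) / 2**20)  # 180.0 MiB, bf16 temp (360M padded)
print(unpadded_bytes((4, 12, 60, 64, 192), 4) / 2**20)  # 135.0 MiB, f32 temp (270M padded)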
wandb/run-20210713_005301-2ilkub1o/files/requirements.txt
ADDED
@@ -0,0 +1,92 @@
+absl-py==0.13.0
+aiohttp==3.7.4.post0
+astunparse==1.6.3
+async-timeout==3.0.1
+attrs==21.2.0
+cachetools==4.2.2
+certifi==2021.5.30
+chardet==4.0.0
+chex==0.0.8
+click==8.0.1
+configparser==5.0.2
+cycler==0.10.0
+datasets==1.9.1.dev0
+dill==0.3.4
+dm-tree==0.1.6
+docker-pycreds==0.4.0
+filelock==3.0.12
+flatbuffers==1.12
+flax==0.3.4
+fsspec==2021.6.1
+gast==0.4.0
+gitdb==4.0.7
+gitpython==3.1.18
+google-auth-oauthlib==0.4.4
+google-auth==1.32.1
+google-pasta==0.2.0
+grpcio==1.34.1
+h5py==3.1.0
+huggingface-hub==0.0.12
+idna==2.10
+jax==0.2.16
+jaxlib==0.1.68
+joblib==1.0.1
+keras-nightly==2.5.0.dev2021032900
+keras-preprocessing==1.1.2
+kiwisolver==1.3.1
+libtpu-nightly==0.1.dev20210615
+markdown==3.3.4
+matplotlib==3.4.2
+msgpack==1.0.2
+multidict==5.1.0
+multiprocess==0.70.12.2
+numpy==1.19.5
+oauthlib==3.1.1
+opt-einsum==3.3.0
+optax==0.0.9
+packaging==21.0
+pandas==1.3.0
+pathtools==0.1.2
+pillow==8.3.1
+pip==20.0.2
+pkg-resources==0.0.0
+promise==2.3
+protobuf==3.17.3
+psutil==5.8.0
+pyarrow==4.0.1
+pyasn1-modules==0.2.8
+pyasn1==0.4.8
+pyparsing==2.4.7
+python-dateutil==2.8.1
+pytz==2021.1
+pyyaml==5.4.1
+regex==2021.7.6
+requests-oauthlib==1.3.0
+requests==2.25.1
+rsa==4.7.2
+sacremoses==0.0.45
+scipy==1.7.0
+sentry-sdk==1.3.0
+setuptools==44.0.0
+shortuuid==1.0.1
+six==1.15.0
+smmap==4.0.0
+subprocess32==3.5.4
+tensorboard-data-server==0.6.1
+tensorboard-plugin-wit==1.8.0
+tensorboard==2.5.0
+tensorflow-estimator==2.5.0
+tensorflow==2.5.0
+termcolor==1.1.0
+tokenizers==0.10.3
+toolz==0.11.1
+tqdm==4.61.2
+transformers==4.9.0.dev0
+typing-extensions==3.7.4.3
+urllib3==1.26.6
+wandb==0.10.33
+werkzeug==2.0.1
+wheel==0.36.2
+wrapt==1.12.1
+xxhash==2.0.2
+yarl==1.6.3
wandb/run-20210713_005301-2ilkub1o/files/wandb-metadata.json
ADDED
@@ -0,0 +1,46 @@
+{
+    "os": "Linux-5.4.0-1043-gcp-x86_64-with-glibc2.29",
+    "python": "3.8.10",
+    "heartbeatAt": "2021-07-13T00:53:03.462705",
+    "startedAt": "2021-07-13T00:53:01.400550",
+    "docker": null,
+    "cpu_count": 96,
+    "cuda": null,
+    "args": [
+        "--push_to_hub",
+        "--output_dir=./",
+        "--model_type=big_bird",
+        "--config_name=./",
+        "--tokenizer_name=./",
+        "--max_seq_length=4096",
+        "--weight_decay=0.0095",
+        "--warmup_steps=5000",
+        "--overwrite_output_dir",
+        "--adam_beta1=0.9",
+        "--adam_beta2=0.98",
+        "--logging_steps=500",
+        "--eval_steps=92768",
+        "--num_train_epochs=5",
+        "--preprocessing_num_workers=64",
+        "--save_steps=20000",
+        "--adafactor",
+        "--learning_rate=5e-5",
+        "--per_device_train_batch_size=4",
+        "--per_device_eval_batch_size=4",
+        "--save_total_limit=5",
+        "--dtype=bfloat16",
+        "--gradient_accumulation_steps=8"
+    ],
+    "state": "running",
+    "program": "./run_mlm_flax.py",
+    "codePath": "run_mlm_flax.py",
+    "git": {
+        "remote": "https://huggingface.co/flax-community/pino-roberta-base",
+        "commit": "4229c91b780cf07115cc6d04c16e393b0d2f508c"
+    },
+    "email": null,
+    "root": "/home/dat/pino-roberta-base",
+    "host": "t1v-n-f5c06ea1-w-0",
+    "username": "dat",
+    "executable": "/home/dat/pino/bin/python"
+}
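wandb-metadata.json records the exact invocation of the crashing run, so its configuration can be reconstructed directly from this file. A small sketch (path as added in this commit; the reassembly is our own, not a wandb API):

import json

with open("wandb/run-20210713_005301-2ilkub1o/files/wandb-metadata.json") as f:
    meta = json.load(f)

# Rebuild the recorded command line: executable, program, then the args list.
print(" ".join([meta["executable"], meta["program"], *meta["args"]]))

For this run that yields /home/dat/pino/bin/python ./run_mlm_flax.py --push_to_hub ... --per_device_train_batch_size=4 --gradient_accumulation_steps=8 ..., i.e. the batch settings that produced the HBM exhaustion in the output.log above.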
wandb/run-20210713_005301-2ilkub1o/files/wandb-summary.json
ADDED
@@ -0,0 +1 @@
+{}
wandb/run-20210713_005301-2ilkub1o/logs/debug-internal.log
ADDED
@@ -0,0 +1,168 @@
+2021-07-13 00:53:02,112 INFO MainThread:327506 [internal.py:wandb_internal():88] W&B internal server running at pid: 327506, started at: 2021-07-13 00:53:02.112234
+2021-07-13 00:53:02,114 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: check_version
+2021-07-13 00:53:02,114 INFO WriterThread:327506 [datastore.py:open_for_write():80] open: /home/dat/pino-roberta-base/wandb/run-20210713_005301-2ilkub1o/run-2ilkub1o.wandb
+2021-07-13 00:53:02,115 DEBUG SenderThread:327506 [sender.py:send():179] send: header
+2021-07-13 00:53:02,116 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: check_version
+2021-07-13 00:53:02,154 DEBUG SenderThread:327506 [sender.py:send():179] send: run
+2021-07-13 00:53:02,328 INFO SenderThread:327506 [dir_watcher.py:__init__():168] watching files in: /home/dat/pino-roberta-base/wandb/run-20210713_005301-2ilkub1o/files
+2021-07-13 00:53:02,329 INFO SenderThread:327506 [sender.py:_start_run_threads():716] run started: 2ilkub1o with start time 1626137581
+2021-07-13 00:53:02,345 DEBUG SenderThread:327506 [sender.py:send():179] send: summary
+2021-07-13 00:53:02,345 INFO SenderThread:327506 [sender.py:_save_file():841] saving file wandb-summary.json with policy end
+2021-07-13 00:53:02,346 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: run_start
+2021-07-13 00:53:03,330 INFO Thread-8 :327506 [dir_watcher.py:_on_file_created():216] file/dir created: /home/dat/pino-roberta-base/wandb/run-20210713_005301-2ilkub1o/files/wandb-summary.json
+2021-07-13 00:53:03,462 DEBUG HandlerThread:327506 [meta.py:__init__():39] meta init
+2021-07-13 00:53:03,462 DEBUG HandlerThread:327506 [meta.py:__init__():53] meta init done
+2021-07-13 00:53:03,462 DEBUG HandlerThread:327506 [meta.py:probe():210] probe
+2021-07-13 00:53:03,463 DEBUG HandlerThread:327506 [meta.py:_setup_git():200] setup git
+2021-07-13 00:53:03,492 DEBUG HandlerThread:327506 [meta.py:_setup_git():207] setup git done
+2021-07-13 00:53:03,492 DEBUG HandlerThread:327506 [meta.py:_save_pip():57] save pip
+2021-07-13 00:53:03,493 DEBUG HandlerThread:327506 [meta.py:_save_pip():71] save pip done
+2021-07-13 00:53:03,493 DEBUG HandlerThread:327506 [meta.py:probe():252] probe done
+2021-07-13 00:53:03,496 DEBUG SenderThread:327506 [sender.py:send():179] send: files
+2021-07-13 00:53:03,496 INFO SenderThread:327506 [sender.py:_save_file():841] saving file wandb-metadata.json with policy now
+2021-07-13 00:53:03,504 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: stop_status
+2021-07-13 00:53:03,504 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: stop_status
+2021-07-13 00:53:03,635 DEBUG SenderThread:327506 [sender.py:send():179] send: config
+2021-07-13 00:53:03,636 DEBUG SenderThread:327506 [sender.py:send():179] send: config
+2021-07-13 00:53:03,636 DEBUG SenderThread:327506 [sender.py:send():179] send: config
+2021-07-13 00:53:03,952 INFO Thread-11 :327506 [upload_job.py:push():137] Uploaded file /tmp/tmpi8r4kiyhwandb/3l6ji67i-wandb-metadata.json
+2021-07-13 00:53:04,330 INFO Thread-8 :327506 [dir_watcher.py:_on_file_created():216] file/dir created: /home/dat/pino-roberta-base/wandb/run-20210713_005301-2ilkub1o/files/wandb-metadata.json
+2021-07-13 00:53:04,330 INFO Thread-8 :327506 [dir_watcher.py:_on_file_created():216] file/dir created: /home/dat/pino-roberta-base/wandb/run-20210713_005301-2ilkub1o/files/requirements.txt
+2021-07-13 00:53:04,330 INFO Thread-8 :327506 [dir_watcher.py:_on_file_created():216] file/dir created: /home/dat/pino-roberta-base/wandb/run-20210713_005301-2ilkub1o/files/output.log
+2021-07-13 00:53:18,637 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: stop_status
+2021-07-13 00:53:18,637 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: stop_status
+2021-07-13 00:53:20,336 INFO Thread-8 :327506 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_005301-2ilkub1o/files/output.log
+2021-07-13 00:53:22,336 INFO Thread-8 :327506 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_005301-2ilkub1o/files/output.log
+2021-07-13 00:53:31,548 DEBUG SenderThread:327506 [sender.py:send():179] send: stats
+2021-07-13 00:53:33,340 INFO Thread-8 :327506 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_005301-2ilkub1o/files/config.yaml
+2021-07-13 00:53:33,769 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: stop_status
+2021-07-13 00:53:33,769 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: stop_status
+2021-07-13 00:53:48,899 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: stop_status
+2021-07-13 00:53:48,899 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: stop_status
+2021-07-13 00:54:01,629 DEBUG SenderThread:327506 [sender.py:send():179] send: stats
+2021-07-13 00:54:04,032 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: stop_status
+2021-07-13 00:54:04,032 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: stop_status
+2021-07-13 00:54:12,355 INFO Thread-8 :327506 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_005301-2ilkub1o/files/output.log
+2021-07-13 00:54:19,178 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: stop_status
+2021-07-13 00:54:19,179 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: stop_status
+2021-07-13 00:54:31,708 DEBUG SenderThread:327506 [sender.py:send():179] send: stats
+2021-07-13 00:54:34,599 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: stop_status
+2021-07-13 00:54:34,599 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: stop_status
+2021-07-13 00:54:49,798 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: stop_status
+2021-07-13 00:54:49,798 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: stop_status
+2021-07-13 00:55:01,792 DEBUG SenderThread:327506 [sender.py:send():179] send: stats
+2021-07-13 00:55:04,931 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: stop_status
+2021-07-13 00:55:04,931 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: stop_status
+2021-07-13 00:55:20,062 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: stop_status
+2021-07-13 00:55:20,062 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: stop_status
+2021-07-13 00:55:31,868 DEBUG SenderThread:327506 [sender.py:send():179] send: stats
+2021-07-13 00:55:35,203 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: stop_status
+2021-07-13 00:55:35,204 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: stop_status
+2021-07-13 00:55:44,391 INFO Thread-8 :327506 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_005301-2ilkub1o/files/output.log
+2021-07-13 00:55:45,566 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: poll_exit
+2021-07-13 00:55:45,567 DEBUG SenderThread:327506 [sender.py:send():179] send: telemetry
+2021-07-13 00:55:45,567 DEBUG SenderThread:327506 [sender.py:send():179] send: exit
+2021-07-13 00:55:45,567 INFO SenderThread:327506 [sender.py:send_exit():287] handling exit code: 1
+2021-07-13 00:55:45,567 INFO SenderThread:327506 [sender.py:send_exit():295] send defer
+2021-07-13 00:55:45,567 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: poll_exit
+2021-07-13 00:55:45,568 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: defer
+2021-07-13 00:55:45,568 INFO HandlerThread:327506 [handler.py:handle_request_defer():141] handle defer: 0
+2021-07-13 00:55:45,568 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: defer
+2021-07-13 00:55:45,568 INFO SenderThread:327506 [sender.py:send_request_defer():304] handle sender defer: 0
+2021-07-13 00:55:45,569 INFO SenderThread:327506 [sender.py:transition_state():308] send defer: 1
+2021-07-13 00:55:45,569 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: defer
+2021-07-13 00:55:45,569 INFO HandlerThread:327506 [handler.py:handle_request_defer():141] handle defer: 1
+2021-07-13 00:55:45,601 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: defer
+2021-07-13 00:55:45,601 INFO SenderThread:327506 [sender.py:send_request_defer():304] handle sender defer: 1
+2021-07-13 00:55:45,601 INFO SenderThread:327506 [sender.py:transition_state():308] send defer: 2
+2021-07-13 00:55:45,602 DEBUG SenderThread:327506 [sender.py:send():179] send: stats
+2021-07-13 00:55:45,602 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: defer
+2021-07-13 00:55:45,602 INFO HandlerThread:327506 [handler.py:handle_request_defer():141] handle defer: 2
+2021-07-13 00:55:45,602 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: defer
+2021-07-13 00:55:45,602 INFO SenderThread:327506 [sender.py:send_request_defer():304] handle sender defer: 2
+2021-07-13 00:55:45,602 INFO SenderThread:327506 [sender.py:transition_state():308] send defer: 3
+2021-07-13 00:55:45,603 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: defer
+2021-07-13 00:55:45,603 INFO HandlerThread:327506 [handler.py:handle_request_defer():141] handle defer: 3
+2021-07-13 00:55:45,603 DEBUG SenderThread:327506 [sender.py:send():179] send: summary
+2021-07-13 00:55:45,603 INFO SenderThread:327506 [sender.py:_save_file():841] saving file wandb-summary.json with policy end
+2021-07-13 00:55:45,603 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: defer
+2021-07-13 00:55:45,604 INFO SenderThread:327506 [sender.py:send_request_defer():304] handle sender defer: 3
+2021-07-13 00:55:45,604 INFO SenderThread:327506 [sender.py:transition_state():308] send defer: 4
+2021-07-13 00:55:45,604 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: defer
+2021-07-13 00:55:45,604 INFO HandlerThread:327506 [handler.py:handle_request_defer():141] handle defer: 4
+2021-07-13 00:55:45,604 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: defer
+2021-07-13 00:55:45,604 INFO SenderThread:327506 [sender.py:send_request_defer():304] handle sender defer: 4
+2021-07-13 00:55:45,670 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: poll_exit
+2021-07-13 00:55:45,784 INFO SenderThread:327506 [sender.py:transition_state():308] send defer: 5
+2021-07-13 00:55:45,784 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: poll_exit
+2021-07-13 00:55:45,785 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: defer
+2021-07-13 00:55:45,785 INFO HandlerThread:327506 [handler.py:handle_request_defer():141] handle defer: 5
+2021-07-13 00:55:45,785 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: defer
+2021-07-13 00:55:45,785 INFO SenderThread:327506 [sender.py:send_request_defer():304] handle sender defer: 5
+2021-07-13 00:55:45,786 INFO SenderThread:327506 [dir_watcher.py:finish():282] shutting down directory watcher
+2021-07-13 00:55:45,887 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: poll_exit
+2021-07-13 00:55:46,391 INFO Thread-8 :327506 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_005301-2ilkub1o/files/wandb-summary.json
+2021-07-13 00:55:46,392 INFO SenderThread:327506 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_005301-2ilkub1o/files/config.yaml
+2021-07-13 00:55:46,392 INFO SenderThread:327506 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_005301-2ilkub1o/files/output.log
+2021-07-13 00:55:46,392 INFO SenderThread:327506 [dir_watcher.py:finish():312] scan: /home/dat/pino-roberta-base/wandb/run-20210713_005301-2ilkub1o/files
+2021-07-13 00:55:46,392 INFO SenderThread:327506 [dir_watcher.py:finish():318] scan save: /home/dat/pino-roberta-base/wandb/run-20210713_005301-2ilkub1o/files/requirements.txt requirements.txt
+2021-07-13 00:55:46,392 INFO SenderThread:327506 [dir_watcher.py:finish():318] scan save: /home/dat/pino-roberta-base/wandb/run-20210713_005301-2ilkub1o/files/output.log output.log
+2021-07-13 00:55:46,392 INFO SenderThread:327506 [dir_watcher.py:finish():318] scan save: /home/dat/pino-roberta-base/wandb/run-20210713_005301-2ilkub1o/files/wandb-metadata.json wandb-metadata.json
+2021-07-13 00:55:46,392 INFO SenderThread:327506 [dir_watcher.py:finish():318] scan save: /home/dat/pino-roberta-base/wandb/run-20210713_005301-2ilkub1o/files/config.yaml config.yaml
+2021-07-13 00:55:46,392 INFO SenderThread:327506 [dir_watcher.py:finish():318] scan save: /home/dat/pino-roberta-base/wandb/run-20210713_005301-2ilkub1o/files/wandb-summary.json wandb-summary.json
+2021-07-13 00:55:46,393 INFO SenderThread:327506 [sender.py:transition_state():308] send defer: 6
+2021-07-13 00:55:46,393 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: poll_exit
+2021-07-13 00:55:46,403 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: defer
+2021-07-13 00:55:46,403 INFO HandlerThread:327506 [handler.py:handle_request_defer():141] handle defer: 6
+2021-07-13 00:55:46,405 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: defer
+2021-07-13 00:55:46,405 INFO SenderThread:327506 [sender.py:send_request_defer():304] handle sender defer: 6
+2021-07-13 00:55:46,405 INFO SenderThread:327506 [file_pusher.py:finish():177] shutting down file pusher
+2021-07-13 00:55:46,495 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: poll_exit
+2021-07-13 00:55:46,496 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: poll_exit
+2021-07-13 00:55:46,598 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: poll_exit
+2021-07-13 00:55:46,598 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: poll_exit
+2021-07-13 00:55:46,700 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: poll_exit
+2021-07-13 00:55:46,700 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: poll_exit
+2021-07-13 00:55:46,802 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: poll_exit
+2021-07-13 00:55:46,802 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: poll_exit
+2021-07-13 00:55:46,867 INFO Thread-14 :327506 [upload_job.py:push():137] Uploaded file /home/dat/pino-roberta-base/wandb/run-20210713_005301-2ilkub1o/files/config.yaml
+2021-07-13 00:55:46,874 INFO Thread-15 :327506 [upload_job.py:push():137] Uploaded file /home/dat/pino-roberta-base/wandb/run-20210713_005301-2ilkub1o/files/wandb-summary.json
+2021-07-13 00:55:46,876 INFO Thread-13 :327506 [upload_job.py:push():137] Uploaded file /home/dat/pino-roberta-base/wandb/run-20210713_005301-2ilkub1o/files/output.log
+2021-07-13 00:55:46,904 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: poll_exit
+2021-07-13 00:55:46,905 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: poll_exit
+2021-07-13 00:55:46,935 INFO Thread-12 :327506 [upload_job.py:push():137] Uploaded file /home/dat/pino-roberta-base/wandb/run-20210713_005301-2ilkub1o/files/requirements.txt
+2021-07-13 00:55:47,007 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: poll_exit
+2021-07-13 00:55:47,007 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: poll_exit
+2021-07-13 00:55:47,109 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: poll_exit
+2021-07-13 00:55:47,109 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: poll_exit
+2021-07-13 00:55:47,135 INFO Thread-7 :327506 [sender.py:transition_state():308] send defer: 7
+2021-07-13 00:55:47,136 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: defer
+2021-07-13 00:55:47,136 INFO HandlerThread:327506 [handler.py:handle_request_defer():141] handle defer: 7
+2021-07-13 00:55:47,136 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: defer
+2021-07-13 00:55:47,136 INFO SenderThread:327506 [sender.py:send_request_defer():304] handle sender defer: 7
+2021-07-13 00:55:47,211 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: poll_exit
+2021-07-13 00:55:47,415 INFO SenderThread:327506 [sender.py:transition_state():308] send defer: 8
+2021-07-13 00:55:47,416 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: poll_exit
+2021-07-13 00:55:47,416 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: defer
+2021-07-13 00:55:47,416 INFO HandlerThread:327506 [handler.py:handle_request_defer():141] handle defer: 8
+2021-07-13 00:55:47,416 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: defer
+2021-07-13 00:55:47,417 INFO SenderThread:327506 [sender.py:send_request_defer():304] handle sender defer: 8
+2021-07-13 00:55:47,417 INFO SenderThread:327506 [sender.py:transition_state():308] send defer: 9
+2021-07-13 00:55:47,417 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: defer
+2021-07-13 00:55:47,417 INFO HandlerThread:327506 [handler.py:handle_request_defer():141] handle defer: 9
+2021-07-13 00:55:47,417 DEBUG SenderThread:327506 [sender.py:send():179] send: final
+2021-07-13 00:55:47,417 DEBUG SenderThread:327506 [sender.py:send():179] send: footer
+2021-07-13 00:55:47,417 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: defer
+2021-07-13 00:55:47,418 INFO SenderThread:327506 [sender.py:send_request_defer():304] handle sender defer: 9
+2021-07-13 00:55:47,518 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: poll_exit
+2021-07-13 00:55:47,518 DEBUG SenderThread:327506 [sender.py:send_request():193] send_request: poll_exit
+2021-07-13 00:55:47,518 INFO SenderThread:327506 [file_pusher.py:join():182] waiting for file pusher
+2021-07-13 00:55:47,520 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: get_summary
+2021-07-13 00:55:47,521 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: sampled_history
+2021-07-13 00:55:47,521 DEBUG HandlerThread:327506 [handler.py:handle_request():124] handle_request: shutdown
+2021-07-13 00:55:47,521 INFO HandlerThread:327506 [handler.py:finish():638] shutting down handler
+2021-07-13 00:55:48,418 INFO WriterThread:327506 [datastore.py:close():288] close: /home/dat/pino-roberta-base/wandb/run-20210713_005301-2ilkub1o/run-2ilkub1o.wandb
+2021-07-13 00:55:48,518 INFO SenderThread:327506 [sender.py:finish():945] shutting down sender
+2021-07-13 00:55:48,519 INFO SenderThread:327506 [file_pusher.py:finish():177] shutting down file pusher
+2021-07-13 00:55:48,519 INFO SenderThread:327506 [file_pusher.py:join():182] waiting for file pusher
+2021-07-13 00:55:48,521 INFO MainThread:327506 [internal.py:handle_exit():78] Internal process exited
wandb/run-20210713_005301-2ilkub1o/logs/debug.log
ADDED
@@ -0,0 +1,127 @@
+2021-07-13 00:53:01,402 INFO MainThread:325900 [wandb_setup.py:_flush():69] setting env: {}
+2021-07-13 00:53:01,402 INFO MainThread:325900 [wandb_setup.py:_flush():69] setting login settings: {}
+2021-07-13 00:53:01,402 INFO MainThread:325900 [wandb_init.py:_log_setup():337] Logging user logs to /home/dat/pino-roberta-base/wandb/run-20210713_005301-2ilkub1o/logs/debug.log
+2021-07-13 00:53:01,402 INFO MainThread:325900 [wandb_init.py:_log_setup():338] Logging internal logs to /home/dat/pino-roberta-base/wandb/run-20210713_005301-2ilkub1o/logs/debug-internal.log
+2021-07-13 00:53:01,402 INFO MainThread:325900 [wandb_init.py:init():370] calling init triggers
+2021-07-13 00:53:01,402 INFO MainThread:325900 [wandb_init.py:init():375] wandb.init called with sweep_config: {}
+config: {}
+2021-07-13 00:53:01,402 INFO MainThread:325900 [wandb_init.py:init():419] starting backend
+2021-07-13 00:53:01,402 INFO MainThread:325900 [backend.py:_multiprocessing_setup():70] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2021-07-13 00:53:01,457 INFO MainThread:325900 [backend.py:ensure_launched():135] starting backend process...
+2021-07-13 00:53:01,509 INFO MainThread:325900 [backend.py:ensure_launched():139] started backend process with pid: 327506
+2021-07-13 00:53:01,511 INFO MainThread:325900 [wandb_init.py:init():424] backend started and connected
+2021-07-13 00:53:01,514 INFO MainThread:325900 [wandb_init.py:init():472] updated telemetry
+2021-07-13 00:53:01,515 INFO MainThread:325900 [wandb_init.py:init():491] communicating current version
+2021-07-13 00:53:02,153 INFO MainThread:325900 [wandb_init.py:init():496] got version response
+2021-07-13 00:53:02,153 INFO MainThread:325900 [wandb_init.py:init():504] communicating run to backend with 30 second timeout
+2021-07-13 00:53:02,345 INFO MainThread:325900 [wandb_init.py:init():529] starting run threads in backend
+2021-07-13 00:53:03,501 INFO MainThread:325900 [wandb_run.py:_console_start():1623] atexit reg
+2021-07-13 00:53:03,501 INFO MainThread:325900 [wandb_run.py:_redirect():1497] redirect: SettingsConsole.REDIRECT
+2021-07-13 00:53:03,502 INFO MainThread:325900 [wandb_run.py:_redirect():1502] Redirecting console.
+2021-07-13 00:53:03,504 INFO MainThread:325900 [wandb_run.py:_redirect():1558] Redirects installed.
+2021-07-13 00:53:03,504 INFO MainThread:325900 [wandb_init.py:init():554] run started, returning control to user process
+2021-07-13 00:53:03,510 INFO MainThread:325900 [wandb_run.py:_config_callback():872] config_cb None None {'output_dir': './', 'overwrite_output_dir': True, 'do_train': False, 'do_eval': False, 'do_predict': False, 'evaluation_strategy': 'IntervalStrategy.NO', 'prediction_loss_only': False, 'per_device_train_batch_size': 4, 'per_device_eval_batch_size': 4, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 8, 'eval_accumulation_steps': None, 'learning_rate': 5e-05, 'weight_decay': 0.0095, 'adam_beta1': 0.9, 'adam_beta2': 0.98, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 5.0, 'max_steps': -1, 'lr_scheduler_type': 'SchedulerType.LINEAR', 'warmup_ratio': 0.0, 'warmup_steps': 5000, 'log_level': -1, 'log_level_replica': -1, 'log_on_each_node': True, 'logging_dir': './runs/Jul13_00-52-13_t1v-n-f5c06ea1-w-0', 'logging_strategy': 'IntervalStrategy.STEPS', 'logging_first_step': False, 'logging_steps': 500, 'save_strategy': 'IntervalStrategy.STEPS', 'save_steps': 20000, 'save_total_limit': 5, 'save_on_each_node': False, 'no_cuda': False, 'seed': 42, 'fp16': False, 'fp16_opt_level': 'O1', 'fp16_backend': 'auto', 'fp16_full_eval': False, 'local_rank': -1, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 92768, 'dataloader_num_workers': 0, 'past_index': -1, 'run_name': './', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'sharded_ddp': [], 'deepspeed': None, 'label_smoothing_factor': 0.0, 'adafactor': True, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['tensorboard', 'wandb'], 'ddp_find_unused_parameters': None, 'dataloader_pin_memory': True, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': True, 'resume_from_checkpoint': None, 'push_to_hub_model_id': '', 'push_to_hub_organization': None, 'push_to_hub_token': None, 'mp_parameters': ''}
+2021-07-13 00:53:03,512 INFO MainThread:325900 [wandb_run.py:_config_callback():872] config_cb None None {'model_name_or_path': None, 'model_type': 'big_bird', 'config_name': './', 'tokenizer_name': './', 'cache_dir': None, 'use_fast_tokenizer': True, 'dtype': 'bfloat16'}
+2021-07-13 00:53:03,513 INFO MainThread:325900 [wandb_run.py:_config_callback():872] config_cb None None {'dataset_name': None, 'dataset_config_name': None, 'train_file': None, 'validation_file': None, 'train_ref_file': None, 'validation_ref_file': None, 'overwrite_cache': False, 'validation_split_percentage': 5, 'max_seq_length': 4096, 'preprocessing_num_workers': 64, 'mlm_probability': 0.15, 'pad_to_max_length': False, 'line_by_line': False}
+2021-07-13 00:55:43,384 INFO MainThread:325900 [wandb_run.py:_atexit_cleanup():1593] got exitcode: 1
+2021-07-13 00:55:43,385 INFO MainThread:325900 [wandb_run.py:_restore():1565] restore
+2021-07-13 00:55:45,569 INFO MainThread:325900 [wandb_run.py:_wait_for_finish():1715] got exit ret: file_counts {
+  wandb_count: 1
+}
+pusher_stats {
+  uploaded_bytes: 1417
+  total_bytes: 1417
+}
+
+2021-07-13 00:55:45,785 INFO MainThread:325900 [wandb_run.py:_wait_for_finish():1715] got exit ret: file_counts {
+  wandb_count: 1
+}
+pusher_stats {
+  uploaded_bytes: 1417
+  total_bytes: 1417
+}
+
+2021-07-13 00:55:46,394 INFO MainThread:325900 [wandb_run.py:_wait_for_finish():1715] got exit ret: file_counts {
+  wandb_count: 4
+}
+pusher_stats {
+  uploaded_bytes: 1417
+  total_bytes: 40394
+}
+
+2021-07-13 00:55:46,496 INFO MainThread:325900 [wandb_run.py:_wait_for_finish():1715] got exit ret: file_counts {
+  wandb_count: 5
+}
+pusher_stats {
+  uploaded_bytes: 1417
+  total_bytes: 40396
+}
+
+2021-07-13 00:55:46,598 INFO MainThread:325900 [wandb_run.py:_wait_for_finish():1715] got exit ret: file_counts {
+  wandb_count: 5
+}
+pusher_stats {
+  uploaded_bytes: 40396
+  total_bytes: 40396
+}
+
+2021-07-13 00:55:46,701 INFO MainThread:325900 [wandb_run.py:_wait_for_finish():1715] got exit ret: file_counts {
+  wandb_count: 5
+}
+pusher_stats {
+  uploaded_bytes: 40396
+  total_bytes: 40396
+}
+
+2021-07-13 00:55:46,803 INFO MainThread:325900 [wandb_run.py:_wait_for_finish():1715] got exit ret: file_counts {
+  wandb_count: 5
+}
+pusher_stats {
+  uploaded_bytes: 40396
+  total_bytes: 40396
+}
+
+2021-07-13 00:55:46,905 INFO MainThread:325900 [wandb_run.py:_wait_for_finish():1715] got exit ret: file_counts {
+  wandb_count: 5
+}
+pusher_stats {
+  uploaded_bytes: 40396
+  total_bytes: 40396
+}
+
+2021-07-13 00:55:47,008 INFO MainThread:325900 [wandb_run.py:_wait_for_finish():1715] got exit ret: file_counts {
+  wandb_count: 5
+}
+pusher_stats {
+  uploaded_bytes: 40396
+  total_bytes: 40396
+}
+
+2021-07-13 00:55:47,109 INFO MainThread:325900 [wandb_run.py:_wait_for_finish():1715] got exit ret: file_counts {
+  wandb_count: 5
+}
+pusher_stats {
+  uploaded_bytes: 40396
+  total_bytes: 40396
+}
+
+2021-07-13 00:55:47,416 INFO MainThread:325900 [wandb_run.py:_wait_for_finish():1715] got exit ret: file_counts {
+  wandb_count: 5
+}
+pusher_stats {
+  uploaded_bytes: 40396
+  total_bytes: 40396
+}
+
+2021-07-13 00:55:47,519 INFO MainThread:325900 [wandb_run.py:_wait_for_finish():1715] got exit ret: done: true
+exit_result {
+}
+file_counts {
+  wandb_count: 5
+}
+pusher_stats {
+  uploaded_bytes: 40396
+  total_bytes: 40396
+}
+
+2021-07-13 00:55:48,779 INFO MainThread:325900 [wandb_run.py:_show_files():1937] logging synced files
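debug.log above polls the file pusher until uploaded_bytes catches up with total_bytes (40396/40396 across 5 files) before reporting done: true. A minimal sketch that extracts the final upload state from such a log (path as added in this commit; the parsing is our own, not a wandb API):

import re

log = open("wandb/run-20210713_005301-2ilkub1o/logs/debug.log").read()
# The last uploaded/total pair in the log is the state at shutdown.
uploaded, total = map(int, re.findall(
    r"uploaded_bytes: (\d+)\s+total_bytes: (\d+)", log)[-1])
print(uploaded, total, "complete" if uploaded == total else "incomplete")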
wandb/run-20210713_005301-2ilkub1o/run-2ilkub1o.wandb
ADDED
Binary file (37.4 kB).
wandb/run-20210713_005751-1wnn0lyf/files/config.yaml
ADDED
@@ -0,0 +1,304 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
wandb_version: 1

_wandb:
  desc: null
  value:
    cli_version: 0.10.33
    framework: huggingface
    huggingface_version: 4.9.0.dev0
    is_jupyter_run: false
    is_kaggle_kernel: false
    python_version: 3.8.10
    t:
      1:
      - 3
      - 11
      4: 3.8.10
      5: 0.10.33
      6: 4.9.0.dev0
      8:
      - 5
adafactor:
  desc: null
  value: false
adam_beta1:
  desc: null
  value: 0.9
adam_beta2:
  desc: null
  value: 0.98
adam_epsilon:
  desc: null
  value: 1.0e-08
cache_dir:
  desc: null
  value: null
config_name:
  desc: null
  value: ./
dataloader_drop_last:
  desc: null
  value: false
dataloader_num_workers:
  desc: null
  value: 0
dataloader_pin_memory:
  desc: null
  value: true
dataset_config_name:
  desc: null
  value: null
dataset_name:
  desc: null
  value: null
ddp_find_unused_parameters:
  desc: null
  value: null
debug:
  desc: null
  value: []
deepspeed:
  desc: null
  value: null
disable_tqdm:
  desc: null
  value: false
do_eval:
  desc: null
  value: false
do_predict:
  desc: null
  value: false
do_train:
  desc: null
  value: false
dtype:
  desc: null
  value: bfloat16
eval_accumulation_steps:
  desc: null
  value: null
eval_steps:
  desc: null
  value: 92768
evaluation_strategy:
  desc: null
  value: IntervalStrategy.NO
fp16:
  desc: null
  value: false
fp16_backend:
  desc: null
  value: auto
fp16_full_eval:
  desc: null
  value: false
fp16_opt_level:
  desc: null
  value: O1
gradient_accumulation_steps:
  desc: null
  value: 1
greater_is_better:
  desc: null
  value: null
group_by_length:
  desc: null
  value: false
ignore_data_skip:
  desc: null
  value: false
label_names:
  desc: null
  value: null
label_smoothing_factor:
  desc: null
  value: 0.0
learning_rate:
  desc: null
  value: 5.0e-05
length_column_name:
  desc: null
  value: length
line_by_line:
  desc: null
  value: false
load_best_model_at_end:
  desc: null
  value: false
local_rank:
  desc: null
  value: -1
log_level:
  desc: null
  value: -1
log_level_replica:
  desc: null
  value: -1
log_on_each_node:
  desc: null
  value: true
logging_dir:
  desc: null
  value: ./runs/Jul13_00-57-01_t1v-n-f5c06ea1-w-0
logging_first_step:
  desc: null
  value: false
logging_steps:
  desc: null
  value: 500
logging_strategy:
  desc: null
  value: IntervalStrategy.STEPS
lr_scheduler_type:
  desc: null
  value: SchedulerType.LINEAR
max_grad_norm:
  desc: null
  value: 1.0
max_seq_length:
  desc: null
  value: 4096
max_steps:
  desc: null
  value: -1
metric_for_best_model:
  desc: null
  value: null
mlm_probability:
  desc: null
  value: 0.15
model_name_or_path:
  desc: null
  value: null
model_type:
  desc: null
  value: big_bird
mp_parameters:
  desc: null
  value: ''
no_cuda:
  desc: null
  value: false
num_train_epochs:
  desc: null
  value: 5.0
output_dir:
  desc: null
  value: ./
overwrite_cache:
  desc: null
  value: false
overwrite_output_dir:
  desc: null
  value: true
pad_to_max_length:
  desc: null
  value: false
past_index:
  desc: null
  value: -1
per_device_eval_batch_size:
  desc: null
  value: 4
per_device_train_batch_size:
  desc: null
  value: 4
per_gpu_eval_batch_size:
  desc: null
  value: null
per_gpu_train_batch_size:
  desc: null
  value: null
prediction_loss_only:
  desc: null
  value: false
preprocessing_num_workers:
  desc: null
  value: 64
push_to_hub:
  desc: null
  value: true
push_to_hub_model_id:
  desc: null
  value: ''
push_to_hub_organization:
  desc: null
  value: null
push_to_hub_token:
  desc: null
  value: null
remove_unused_columns:
  desc: null
  value: true
report_to:
  desc: null
  value:
  - tensorboard
  - wandb
resume_from_checkpoint:
  desc: null
  value: null
run_name:
  desc: null
  value: ./
save_on_each_node:
  desc: null
  value: false
save_steps:
  desc: null
  value: 20000
save_strategy:
  desc: null
  value: IntervalStrategy.STEPS
save_total_limit:
  desc: null
  value: 5
seed:
  desc: null
  value: 42
sharded_ddp:
  desc: null
  value: []
skip_memory_metrics:
  desc: null
  value: true
tokenizer_name:
  desc: null
  value: ./
tpu_metrics_debug:
  desc: null
  value: false
tpu_num_cores:
  desc: null
  value: null
train_file:
  desc: null
  value: null
train_ref_file:
  desc: null
  value: null
use_fast_tokenizer:
  desc: null
  value: true
use_legacy_prediction_loop:
  desc: null
  value: false
validation_file:
  desc: null
  value: null
validation_ref_file:
  desc: null
  value: null
validation_split_percentage:
  desc: null
  value: 5
warmup_ratio:
  desc: null
  value: 0.0
warmup_steps:
  desc: null
  value: 5000
weight_decay:
  desc: null
  value: 0.0095
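Aside (not part of the committed files): the step counts that appear in output.log below can be cross-checked against this config with a few lines of Python. This is a minimal sketch assuming the run used a single TPU VM host with 8 local devices; the device count is an assumption, not something recorded in these logs.

    per_device_train_batch_size = 4   # from config.yaml above
    num_devices = 8                   # assumption: jax.local_device_count() on one TPU host
    num_train_epochs = 5              # from config.yaml above
    steps_per_epoch = 46383           # progress-bar total in output.log below

    effective_batch = per_device_train_batch_size * num_devices  # 32 sequences per step
    sequences_per_epoch = steps_per_epoch * effective_batch      # 1,484,256 sequences
    total_steps = steps_per_epoch * num_train_epochs             # 231,915 optimizer steps
    print(effective_batch, sequences_per_epoch, total_steps)

Under those assumptions, warmup_steps: 5000 covers roughly the first 2% of training.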
wandb/run-20210713_005751-1wnn0lyf/files/output.log
ADDED
@@ -0,0 +1,216 @@
/home/dat/pino/lib/python3.8/site-packages/jax/lib/xla_bridge.py:382: UserWarning: jax.host_count has been renamed to jax.process_count. This alias will eventually be removed; please update your code.
  warnings.warn(
/home/dat/pino/lib/python3.8/site-packages/jax/lib/xla_bridge.py:369: UserWarning: jax.host_id has been renamed to jax.process_index. This alias will eventually be removed; please update your code.
  warnings.warn(
Epoch ... (1/5): 0%| | 0/5 [00:00<?, ?it/s]
Traceback (most recent call last): | 0/46383 [00:00<?, ?it/s]
  File "/usr/lib/python3.8/threading.py", line 932, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.8/threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "/home/dat/pino/lib/python3.8/site-packages/wandb/sdk/wandb_run.py", line 183, in check_network_status
    status_response = self._interface.communicate_network_status()
  File "/home/dat/pino/lib/python3.8/site-packages/wandb/sdk/interface/interface.py", line 755, in communicate_network_status
    resp = self._communicate(req, timeout=timeout, local=True)
  File "/home/dat/pino/lib/python3.8/site-packages/wandb/sdk/interface/interface.py", line 545, in _communicate
    return self._communicate_async(rec, local=local).get(timeout=timeout)
  File "/home/dat/pino/lib/python3.8/site-packages/wandb/sdk/interface/interface.py", line 550, in _communicate_async
    raise Exception("The wandb backend process has shutdown")
Exception: The wandb backend process has shutdown
Training...: 0%| | 0/46383 [01:25<?, ?it/s]
Epoch ... (1/5): 0%| | 0/5 [02:13<?, ?it/s]
Traceback (most recent call last):
  File "/home/dat/pino/lib/python3.8/site-packages/jax/_src/traceback_util.py", line 183, in reraise_with_filtered_traceback
    return fun(*args, **kwargs)
  File "/home/dat/pino/lib/python3.8/site-packages/jax/_src/api.py", line 1647, in f_pmapped
    out = pxla.xla_pmap(
  File "/home/dat/pino/lib/python3.8/site-packages/jax/core.py", line 1620, in bind
    return call_bind(self, fun, *args, **params)
  File "/home/dat/pino/lib/python3.8/site-packages/jax/core.py", line 1551, in call_bind
    outs = primitive.process(top_trace, fun, tracers, params)
  File "/home/dat/pino/lib/python3.8/site-packages/jax/core.py", line 1623, in process
    return trace.process_map(self, fun, tracers, params)
  File "/home/dat/pino/lib/python3.8/site-packages/jax/core.py", line 606, in process_call
    return primitive.impl(f, *tracers, **params)
  File "/home/dat/pino/lib/python3.8/site-packages/jax/interpreters/pxla.py", line 624, in xla_pmap_impl
    compiled_fun, fingerprint = parallel_callable(fun, backend, axis_name, axis_size,
  File "/home/dat/pino/lib/python3.8/site-packages/jax/linear_util.py", line 262, in memoized_fun
    ans = call(fun, *args)
  File "/home/dat/pino/lib/python3.8/site-packages/jax/interpreters/pxla.py", line 899, in parallel_callable
    compiled = xla.backend_compile(backend, built, compile_options)
  File "/home/dat/pino/lib/python3.8/site-packages/jax/interpreters/xla.py", line 360, in backend_compile
    return backend.compile(built_c, compile_options=options)
RuntimeError: Resource exhausted: Ran out of memory in memory space hbm. Used 20.30G of 15.48G hbm. Exceeded hbm capacity by 4.82G.
Total hbm usage >= 20.82G:
    reserved   530.00M
    program     20.30G
    arguments       0B
Output size 0B; shares 0B with arguments.
Program hbm requirement 20.30G:
    global     660.0K
    scoped     125.0K
    HLO temp   20.30G (63.5% utilization: Unpadded (12.44G) Padded (19.60G), 3.5% fragmentation (717.54M))
  Largest program allocations in hbm:
  1. Size: 1.54G
     Operator: op_type="dot_general" op_name="pmap(train_step)/dot_general[ dimension_numbers=(((2,), (0,)), ((), ()))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/pino/lib/python3.8/site-packages/flax/linen/linear.py" source_line=175
     Shape: bf16[4,4096,50358]{1,2,0:T(8,128)(2,1)}
     Unpadded size: 1.54G
     Extra memory due to padding: 64.0K (1.0x expansion)
     XLA label: %fusion.1304.remat4 = bf16[4,4096,50358]{1,2,0:T(8,128)(2,1)} fusion(bf16[50358,768]{1,0:T(8,128)(2,1)} %copy.16213, f32[768]{0:T(1024)} %fusion.8859, f32[768]{0:T(1024)} %fusion.8860, f32[4,4096]{1,0:T(4,128)} %get-tuple-element.16597, f32[4,4096]{1,0:T(4...
     Allocation type: HLO temp
     ==========================
  2. Size: 360.00M
     Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
     Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
     Unpadded size: 180.00M
     Extra memory due to padding: 180.00M (2.0x expansion)
     XLA label: %fusion.135 = bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)} fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.485, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.5710, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)} %get-tuple-element.16812, f32[4,12,60,64,192]{3,4,2,1,0...
     Allocation type: HLO temp
     ==========================
  3. Size: 360.00M
     Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
     Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
     Unpadded size: 180.00M
     Extra memory due to padding: 180.00M (2.0x expansion)
     XLA label: %fusion.144.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.494, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.5719, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)} ...
     Allocation type: HLO temp
     ==========================
  4. Size: 360.00M
     Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
     Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
     Unpadded size: 180.00M
     Extra memory due to padding: 180.00M (2.0x expansion)
     XLA label: %fusion.143.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.493, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.5718, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)} ...
     Allocation type: HLO temp
     ==========================
  5. Size: 360.00M
     Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
     Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
     Unpadded size: 180.00M
     Extra memory due to padding: 180.00M (2.0x expansion)
     XLA label: %fusion.142.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.492, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.5717, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)} ...
     Allocation type: HLO temp
     ==========================
  6. Size: 360.00M
     Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
     Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
     Unpadded size: 180.00M
     Extra memory due to padding: 180.00M (2.0x expansion)
     XLA label: %fusion.141.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.491, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.5716, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)} ...
     Allocation type: HLO temp
     ==========================
  7. Size: 360.00M
     Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
     Unpadded size: 180.00M
     Extra memory due to padding: 180.00M (2.0x expansion)
     XLA label: %fusion.134.remat_uncompressed = bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)} copy(bf16[4,12,60,64,512]{4,3,2,1,0:T(8,128)(2,1)} %fusion.134.remat_compressed)
     Allocation type: HLO temp
     ==========================
  8. Size: 360.00M
     Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
     Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
     Unpadded size: 180.00M
     Extra memory due to padding: 180.00M (2.0x expansion)
     XLA label: %fusion.140.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.490, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.5715, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)} ...
     Allocation type: HLO temp
     ==========================
  9. Size: 360.00M
     Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
     Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
     Unpadded size: 180.00M
     Extra memory due to padding: 180.00M (2.0x expansion)
     XLA label: %fusion.139.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.489, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.5714, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)} ...
     Allocation type: HLO temp
     ==========================
  10. Size: 360.00M
      Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
      Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
      Unpadded size: 180.00M
      Extra memory due to padding: 180.00M (2.0x expansion)
      XLA label: %fusion.138.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.488, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.5713, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)} ...
      Allocation type: HLO temp
      ==========================
  11. Size: 360.00M
      Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
      Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
      Unpadded size: 180.00M
      Extra memory due to padding: 180.00M (2.0x expansion)
      XLA label: %fusion.137.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.487, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.5712, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)} ...
      Allocation type: HLO temp
      ==========================
  12. Size: 360.00M
      Operator: op_type="div" op_name="pmap(train_step)/div" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=619
      Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
      Unpadded size: 180.00M
      Extra memory due to padding: 180.00M (2.0x expansion)
      XLA label: %fusion.136.remat = (bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}, bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}) fusion(f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.486, f32[4,12,60,64]{3,2,1,0:T(8,128)} %fusion.5711, f32[4,12,60,64,64]{3,4,2,1,0:T(8,128)} ...
      Allocation type: HLO temp
      ==========================
  13. Size: 360.00M
      Shape: bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)}
      Unpadded size: 180.00M
      Extra memory due to padding: 180.00M (2.0x expansion)
      XLA label: %fusion.133.remat_uncompressed = bf16[4,12,60,64,512]{3,4,2,1,0:T(8,128)(2,1)} copy(bf16[4,12,60,64,512]{4,3,2,1,0:T(8,128)(2,1)} %fusion.133.remat_compressed)
      Allocation type: HLO temp
      ==========================
  14. Size: 270.00M
      Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=584
      Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
      Unpadded size: 135.00M
      Extra memory due to padding: 135.00M (2.0x expansion)
      XLA label: %fusion.378.remat5 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} fusion(f32[4,60,64,192]{2,3,1,0:T(8,128)} %get-tuple-element.17038, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.14428, bf16[4,12,60,192,64]{3,2,4,1,0:T(8,128)(2,1)} %fusion.655), kind=kOut...
      Allocation type: HLO temp
      ==========================
  15. Size: 270.00M
      Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
      Unpadded size: 135.00M
      Extra memory due to padding: 135.00M (2.0x expansion)
      XLA label: %fusion.310.remat_uncompressed = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} copy(f32[4,12,60,64,192]{4,3,2,1,0:T(8,128)} %fusion.310.remat_compressed)
      Allocation type: HLO temp
      ==========================
  16. Size: 270.00M
      Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=584
      Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
      Unpadded size: 135.00M
      Extra memory due to padding: 135.00M (2.0x expansion)
      XLA label: %fusion.386.remat6 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} fusion(f32[4,60,64,192]{2,3,1,0:T(8,128)} %get-tuple-element.17038, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.13900, bf16[4,12,60,192,64]{3,2,4,1,0:T(8,128)(2,1)} %fusion.639), kind=kOut...
      Allocation type: HLO temp
      ==========================
  17. Size: 270.00M
      Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
      Unpadded size: 135.00M
      Extra memory due to padding: 135.00M (2.0x expansion)
      XLA label: %fusion.326.remat_uncompressed.remat2 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} copy(f32[4,12,60,64,192]{4,3,2,1,0:T(8,128)} %fusion.326.remat_compressed)
      Allocation type: HLO temp
      ==========================
  18. Size: 270.00M
      Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=591
      Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
      Unpadded size: 135.00M
      Extra memory due to padding: 135.00M (2.0x expansion)
      XLA label: %fusion.10361 = (f32[4,12,60,64]{3,2,1,0:T(8,128)}, f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}) fusion(s32[4,12,62,64,192]{3,4,2,1,0:T(8,128)} %get-tuple-element.18295, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.14494, bf16[4,12,60,192,64]{3,2,1,0,4:T...
      Allocation type: HLO temp
      ==========================
  19. Size: 270.00M
      Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=584
      Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
      Unpadded size: 135.00M
      Extra memory due to padding: 135.00M (2.0x expansion)
      XLA label: %fusion.380.remat5 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} fusion(f32[4,60,64,192]{2,3,1,0:T(8,128)} %get-tuple-element.17038, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.14296, bf16[4,12,60,192,64]{3,2,4,1,0:T(8,128)(2,1)} %fusion.651), kind=kOut...
      Allocation type: HLO temp
      ==========================
  20. Size: 270.00M
      Operator: op_type="dot_general" op_name="pmap(train_step)/jit(jvp(_einsum))/dot_general[ dimension_numbers=(((4,), (4,)), ((0, 1, 2), (0, 1, 2)))\n precision=None\n preferred_element_type=None ]" source_file="/home/dat/transformers/src/transformers/models/big_bird/modeling_flax_big_bird.py" source_line=584
      Shape: f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)}
      Unpadded size: 135.00M
      Extra memory due to padding: 135.00M (2.0x expansion)
      XLA label: %fusion.379.remat3 = f32[4,12,60,64,192]{3,4,2,1,0:T(8,128)} fusion(f32[4,60,64,192]{2,3,1,0:T(8,128)} %get-tuple-element.17038, bf16[4,12,64,64,64]{4,3,2,1,0:T(8,128)(2,1)} %copy.14362, bf16[4,12,60,192,64]{3,2,4,1,0:T(8,128)(2,1)} %fusion.653), kind=kOut...
      Allocation type: HLO temp
      ==========================
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
  File "./run_mlm_flax.py", line 709, in <module>
    state, train_metric, dropout_rngs = p_train_step(state, model_inputs, dropout_rngs)
  File "/home/dat/pino/lib/python3.8/site-packages/jax/_src/traceback_util.py", line 183, in reraise_with_filtered_traceback
    return fun(*args, **kwargs)
KeyboardInterrupt
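Aside (not part of the committed files): the allocation sizes in the OOM report above follow directly from the logged tensor shapes, since bf16 holds 2 bytes per element. A minimal Python sketch of the arithmetic:

    from math import prod

    def mib(shape, bytes_per_elem=2):      # bf16 = 2 bytes per element
        return prod(shape) * bytes_per_elem / 2**20

    print(mib((4, 4096, 50358)) / 1024)    # ~1.54 GiB: allocation 1, the [batch, seq_len, vocab] logits
    attn = mib((4, 12, 60, 64, 512))       # 180.0 MiB unpadded: block-sparse attention temporary
    print(attn, 2 * attn)                  # the reported 2.0x padding expansion gives 360.00M

With max_seq_length 4096 and 4 sequences per device, these temporaries exceed the ~15.5G of per-core hbm reported above.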
wandb/run-20210713_005751-1wnn0lyf/files/requirements.txt
ADDED
@@ -0,0 +1,92 @@
absl-py==0.13.0
aiohttp==3.7.4.post0
astunparse==1.6.3
async-timeout==3.0.1
attrs==21.2.0
cachetools==4.2.2
certifi==2021.5.30
chardet==4.0.0
chex==0.0.8
click==8.0.1
configparser==5.0.2
cycler==0.10.0
datasets==1.9.1.dev0
dill==0.3.4
dm-tree==0.1.6
docker-pycreds==0.4.0
filelock==3.0.12
flatbuffers==1.12
flax==0.3.4
fsspec==2021.6.1
gast==0.4.0
gitdb==4.0.7
gitpython==3.1.18
google-auth-oauthlib==0.4.4
google-auth==1.32.1
google-pasta==0.2.0
grpcio==1.34.1
h5py==3.1.0
huggingface-hub==0.0.12
idna==2.10
jax==0.2.16
jaxlib==0.1.68
joblib==1.0.1
keras-nightly==2.5.0.dev2021032900
keras-preprocessing==1.1.2
kiwisolver==1.3.1
libtpu-nightly==0.1.dev20210615
markdown==3.3.4
matplotlib==3.4.2
msgpack==1.0.2
multidict==5.1.0
multiprocess==0.70.12.2
numpy==1.19.5
oauthlib==3.1.1
opt-einsum==3.3.0
optax==0.0.9
packaging==21.0
pandas==1.3.0
pathtools==0.1.2
pillow==8.3.1
pip==20.0.2
pkg-resources==0.0.0
promise==2.3
protobuf==3.17.3
psutil==5.8.0
pyarrow==4.0.1
pyasn1-modules==0.2.8
pyasn1==0.4.8
pyparsing==2.4.7
python-dateutil==2.8.1
pytz==2021.1
pyyaml==5.4.1
regex==2021.7.6
requests-oauthlib==1.3.0
requests==2.25.1
rsa==4.7.2
sacremoses==0.0.45
scipy==1.7.0
sentry-sdk==1.3.0
setuptools==44.0.0
shortuuid==1.0.1
six==1.15.0
smmap==4.0.0
subprocess32==3.5.4
tensorboard-data-server==0.6.1
tensorboard-plugin-wit==1.8.0
tensorboard==2.5.0
tensorflow-estimator==2.5.0
tensorflow==2.5.0
termcolor==1.1.0
tokenizers==0.10.3
toolz==0.11.1
tqdm==4.61.2
transformers==4.9.0.dev0
typing-extensions==3.7.4.3
urllib3==1.26.6
wandb==0.10.33
werkzeug==2.0.1
wheel==0.36.2
wrapt==1.12.1
xxhash==2.0.2
yarl==1.6.3
wandb/run-20210713_005751-1wnn0lyf/files/wandb-metadata.json
ADDED
@@ -0,0 +1,44 @@
{
    "os": "Linux-5.4.0-1043-gcp-x86_64-with-glibc2.29",
    "python": "3.8.10",
    "heartbeatAt": "2021-07-13T00:57:53.965536",
    "startedAt": "2021-07-13T00:57:51.918634",
    "docker": null,
    "cpu_count": 96,
    "cuda": null,
    "args": [
        "--push_to_hub",
        "--output_dir=./",
        "--model_type=big_bird",
        "--config_name=./",
        "--tokenizer_name=./",
        "--max_seq_length=4096",
        "--weight_decay=0.0095",
        "--warmup_steps=5000",
        "--overwrite_output_dir",
        "--adam_beta1=0.9",
        "--adam_beta2=0.98",
        "--logging_steps=500",
        "--eval_steps=92768",
        "--num_train_epochs=5",
        "--preprocessing_num_workers=64",
        "--save_steps=20000",
        "--learning_rate=5e-5",
        "--per_device_train_batch_size=4",
        "--per_device_eval_batch_size=4",
        "--save_total_limit=5",
        "--dtype=bfloat16"
    ],
    "state": "running",
    "program": "./run_mlm_flax.py",
    "codePath": "run_mlm_flax.py",
    "git": {
        "remote": "https://huggingface.co/flax-community/pino-roberta-base",
        "commit": "4229c91b780cf07115cc6d04c16e393b0d2f508c"
    },
    "email": null,
    "root": "/home/dat/pino-roberta-base",
    "host": "t1v-n-f5c06ea1-w-0",
    "username": "dat",
    "executable": "/home/dat/pino/bin/python"
}
wandb/run-20210713_005751-1wnn0lyf/files/wandb-summary.json
ADDED
@@ -0,0 +1 @@
{}
wandb/run-20210713_005751-1wnn0lyf/logs/debug-internal.log
ADDED
@@ -0,0 +1,61 @@
2021-07-13 00:57:52,645 INFO MainThread:329334 [internal.py:wandb_internal():88] W&B internal server running at pid: 329334, started at: 2021-07-13 00:57:52.644860
2021-07-13 00:57:52,647 DEBUG HandlerThread:329334 [handler.py:handle_request():124] handle_request: check_version
2021-07-13 00:57:52,647 INFO WriterThread:329334 [datastore.py:open_for_write():80] open: /home/dat/pino-roberta-base/wandb/run-20210713_005751-1wnn0lyf/run-1wnn0lyf.wandb
2021-07-13 00:57:52,648 DEBUG SenderThread:329334 [sender.py:send():179] send: header
2021-07-13 00:57:52,648 DEBUG SenderThread:329334 [sender.py:send_request():193] send_request: check_version
2021-07-13 00:57:52,687 DEBUG SenderThread:329334 [sender.py:send():179] send: run
2021-07-13 00:57:52,862 INFO SenderThread:329334 [dir_watcher.py:__init__():168] watching files in: /home/dat/pino-roberta-base/wandb/run-20210713_005751-1wnn0lyf/files
2021-07-13 00:57:52,862 INFO SenderThread:329334 [sender.py:_start_run_threads():716] run started: 1wnn0lyf with start time 1626137872
2021-07-13 00:57:52,862 DEBUG SenderThread:329334 [sender.py:send():179] send: summary
2021-07-13 00:57:52,862 DEBUG HandlerThread:329334 [handler.py:handle_request():124] handle_request: run_start
2021-07-13 00:57:52,863 INFO SenderThread:329334 [sender.py:_save_file():841] saving file wandb-summary.json with policy end
2021-07-13 00:57:53,865 INFO Thread-8 :329334 [dir_watcher.py:_on_file_created():216] file/dir created: /home/dat/pino-roberta-base/wandb/run-20210713_005751-1wnn0lyf/files/wandb-summary.json
2021-07-13 00:57:53,965 DEBUG HandlerThread:329334 [meta.py:__init__():39] meta init
2021-07-13 00:57:53,965 DEBUG HandlerThread:329334 [meta.py:__init__():53] meta init done
2021-07-13 00:57:53,965 DEBUG HandlerThread:329334 [meta.py:probe():210] probe
2021-07-13 00:57:53,966 DEBUG HandlerThread:329334 [meta.py:_setup_git():200] setup git
2021-07-13 00:57:53,996 DEBUG HandlerThread:329334 [meta.py:_setup_git():207] setup git done
2021-07-13 00:57:53,996 DEBUG HandlerThread:329334 [meta.py:_save_pip():57] save pip
2021-07-13 00:57:53,996 DEBUG HandlerThread:329334 [meta.py:_save_pip():71] save pip done
2021-07-13 00:57:53,996 DEBUG HandlerThread:329334 [meta.py:probe():252] probe done
2021-07-13 00:57:53,999 DEBUG SenderThread:329334 [sender.py:send():179] send: files
2021-07-13 00:57:53,999 INFO SenderThread:329334 [sender.py:_save_file():841] saving file wandb-metadata.json with policy now
2021-07-13 00:57:54,007 DEBUG HandlerThread:329334 [handler.py:handle_request():124] handle_request: stop_status
2021-07-13 00:57:54,007 DEBUG SenderThread:329334 [sender.py:send_request():193] send_request: stop_status
2021-07-13 00:57:54,134 DEBUG SenderThread:329334 [sender.py:send():179] send: config
2021-07-13 00:57:54,135 DEBUG SenderThread:329334 [sender.py:send():179] send: config
2021-07-13 00:57:54,135 DEBUG SenderThread:329334 [sender.py:send():179] send: config
2021-07-13 00:57:54,460 INFO Thread-11 :329334 [upload_job.py:push():137] Uploaded file /tmp/tmpbiuftyldwandb/b3fet9y4-wandb-metadata.json
2021-07-13 00:57:54,864 INFO Thread-8 :329334 [dir_watcher.py:_on_file_created():216] file/dir created: /home/dat/pino-roberta-base/wandb/run-20210713_005751-1wnn0lyf/files/output.log
2021-07-13 00:57:54,864 INFO Thread-8 :329334 [dir_watcher.py:_on_file_created():216] file/dir created: /home/dat/pino-roberta-base/wandb/run-20210713_005751-1wnn0lyf/files/wandb-metadata.json
2021-07-13 00:57:54,864 INFO Thread-8 :329334 [dir_watcher.py:_on_file_created():216] file/dir created: /home/dat/pino-roberta-base/wandb/run-20210713_005751-1wnn0lyf/files/requirements.txt
2021-07-13 00:58:09,136 DEBUG HandlerThread:329334 [handler.py:handle_request():124] handle_request: stop_status
2021-07-13 00:58:09,136 DEBUG SenderThread:329334 [sender.py:send_request():193] send_request: stop_status
2021-07-13 00:58:10,870 INFO Thread-8 :329334 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_005751-1wnn0lyf/files/output.log
2021-07-13 00:58:22,050 DEBUG SenderThread:329334 [sender.py:send():179] send: stats
2021-07-13 00:58:23,875 INFO Thread-8 :329334 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_005751-1wnn0lyf/files/config.yaml
2021-07-13 00:58:24,269 DEBUG HandlerThread:329334 [handler.py:handle_request():124] handle_request: stop_status
2021-07-13 00:58:24,269 DEBUG SenderThread:329334 [sender.py:send_request():193] send_request: stop_status
2021-07-13 00:58:39,402 DEBUG HandlerThread:329334 [handler.py:handle_request():124] handle_request: stop_status
2021-07-13 00:58:39,403 DEBUG SenderThread:329334 [sender.py:send_request():193] send_request: stop_status
2021-07-13 00:58:52,130 DEBUG SenderThread:329334 [sender.py:send():179] send: stats
2021-07-13 00:58:54,537 DEBUG HandlerThread:329334 [handler.py:handle_request():124] handle_request: stop_status
2021-07-13 00:58:54,537 DEBUG SenderThread:329334 [sender.py:send_request():193] send_request: stop_status
2021-07-13 00:59:00,888 INFO Thread-8 :329334 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210713_005751-1wnn0lyf/files/output.log
2021-07-13 00:59:09,682 DEBUG HandlerThread:329334 [handler.py:handle_request():124] handle_request: stop_status
2021-07-13 00:59:09,683 DEBUG SenderThread:329334 [sender.py:send_request():193] send_request: stop_status
2021-07-13 00:59:22,209 DEBUG SenderThread:329334 [sender.py:send():179] send: stats
2021-07-13 00:59:24,837 DEBUG HandlerThread:329334 [handler.py:handle_request():124] handle_request: stop_status
2021-07-13 00:59:24,837 DEBUG SenderThread:329334 [sender.py:send_request():193] send_request: stop_status
2021-07-13 00:59:39,971 DEBUG HandlerThread:329334 [handler.py:handle_request():124] handle_request: stop_status
2021-07-13 00:59:39,971 DEBUG SenderThread:329334 [sender.py:send_request():193] send_request: stop_status
2021-07-13 00:59:52,289 DEBUG SenderThread:329334 [sender.py:send():179] send: stats
2021-07-13 00:59:55,105 DEBUG HandlerThread:329334 [handler.py:handle_request():124] handle_request: stop_status
2021-07-13 00:59:55,105 DEBUG SenderThread:329334 [sender.py:send_request():193] send_request: stop_status
2021-07-13 01:00:10,158 WARNING MainThread:329334 [internal.py:wandb_internal():147] Internal process interrupt: 1
2021-07-13 01:00:10,246 DEBUG HandlerThread:329334 [handler.py:handle_request():124] handle_request: stop_status
2021-07-13 01:00:10,246 DEBUG SenderThread:329334 [sender.py:send_request():193] send_request: stop_status
2021-07-13 01:00:11,893 WARNING MainThread:329334 [internal.py:wandb_internal():147] Internal process interrupt: 2
2021-07-13 01:00:11,893 ERROR MainThread:329334 [internal.py:wandb_internal():150] Internal process interrupted.
2021-07-13 01:00:12,253 INFO HandlerThread:329334 [handler.py:finish():638] shutting down handler
2021-07-13 01:00:12,281 INFO MainThread:329334 [internal.py:handle_exit():78] Internal process exited
wandb/run-20210713_005751-1wnn0lyf/logs/debug.log
ADDED
@@ -0,0 +1,28 @@
2021-07-13 00:57:51,920 INFO MainThread:327810 [wandb_setup.py:_flush():69] setting env: {}
2021-07-13 00:57:51,920 INFO MainThread:327810 [wandb_setup.py:_flush():69] setting login settings: {}
2021-07-13 00:57:51,920 INFO MainThread:327810 [wandb_init.py:_log_setup():337] Logging user logs to /home/dat/pino-roberta-base/wandb/run-20210713_005751-1wnn0lyf/logs/debug.log
2021-07-13 00:57:51,920 INFO MainThread:327810 [wandb_init.py:_log_setup():338] Logging internal logs to /home/dat/pino-roberta-base/wandb/run-20210713_005751-1wnn0lyf/logs/debug-internal.log
2021-07-13 00:57:51,920 INFO MainThread:327810 [wandb_init.py:init():370] calling init triggers
2021-07-13 00:57:51,920 INFO MainThread:327810 [wandb_init.py:init():375] wandb.init called with sweep_config: {}
config: {}
2021-07-13 00:57:51,920 INFO MainThread:327810 [wandb_init.py:init():419] starting backend
2021-07-13 00:57:51,920 INFO MainThread:327810 [backend.py:_multiprocessing_setup():70] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
2021-07-13 00:57:51,997 INFO MainThread:327810 [backend.py:ensure_launched():135] starting backend process...
2021-07-13 00:57:52,047 INFO MainThread:327810 [backend.py:ensure_launched():139] started backend process with pid: 329334
2021-07-13 00:57:52,050 INFO MainThread:327810 [wandb_init.py:init():424] backend started and connected
2021-07-13 00:57:52,053 INFO MainThread:327810 [wandb_init.py:init():472] updated telemetry
2021-07-13 00:57:52,054 INFO MainThread:327810 [wandb_init.py:init():491] communicating current version
2021-07-13 00:57:52,686 INFO MainThread:327810 [wandb_init.py:init():496] got version response
2021-07-13 00:57:52,686 INFO MainThread:327810 [wandb_init.py:init():504] communicating run to backend with 30 second timeout
2021-07-13 00:57:52,861 INFO MainThread:327810 [wandb_init.py:init():529] starting run threads in backend
2021-07-13 00:57:54,003 INFO MainThread:327810 [wandb_run.py:_console_start():1623] atexit reg
2021-07-13 00:57:54,004 INFO MainThread:327810 [wandb_run.py:_redirect():1497] redirect: SettingsConsole.REDIRECT
2021-07-13 00:57:54,004 INFO MainThread:327810 [wandb_run.py:_redirect():1502] Redirecting console.
2021-07-13 00:57:54,006 INFO MainThread:327810 [wandb_run.py:_redirect():1558] Redirects installed.
2021-07-13 00:57:54,006 INFO MainThread:327810 [wandb_init.py:init():554] run started, returning control to user process
2021-07-13 00:57:54,012 INFO MainThread:327810 [wandb_run.py:_config_callback():872] config_cb None None {'output_dir': './', 'overwrite_output_dir': True, 'do_train': False, 'do_eval': False, 'do_predict': False, 'evaluation_strategy': 'IntervalStrategy.NO', 'prediction_loss_only': False, 'per_device_train_batch_size': 4, 'per_device_eval_batch_size': 4, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'learning_rate': 5e-05, 'weight_decay': 0.0095, 'adam_beta1': 0.9, 'adam_beta2': 0.98, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 5.0, 'max_steps': -1, 'lr_scheduler_type': 'SchedulerType.LINEAR', 'warmup_ratio': 0.0, 'warmup_steps': 5000, 'log_level': -1, 'log_level_replica': -1, 'log_on_each_node': True, 'logging_dir': './runs/Jul13_00-57-01_t1v-n-f5c06ea1-w-0', 'logging_strategy': 'IntervalStrategy.STEPS', 'logging_first_step': False, 'logging_steps': 500, 'save_strategy': 'IntervalStrategy.STEPS', 'save_steps': 20000, 'save_total_limit': 5, 'save_on_each_node': False, 'no_cuda': False, 'seed': 42, 'fp16': False, 'fp16_opt_level': 'O1', 'fp16_backend': 'auto', 'fp16_full_eval': False, 'local_rank': -1, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 92768, 'dataloader_num_workers': 0, 'past_index': -1, 'run_name': './', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'sharded_ddp': [], 'deepspeed': None, 'label_smoothing_factor': 0.0, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['tensorboard', 'wandb'], 'ddp_find_unused_parameters': None, 'dataloader_pin_memory': True, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': True, 'resume_from_checkpoint': None, 'push_to_hub_model_id': '', 'push_to_hub_organization': None, 'push_to_hub_token': None, 'mp_parameters': ''}
2021-07-13 00:57:54,014 INFO MainThread:327810 [wandb_run.py:_config_callback():872] config_cb None None {'model_name_or_path': None, 'model_type': 'big_bird', 'config_name': './', 'tokenizer_name': './', 'cache_dir': None, 'use_fast_tokenizer': True, 'dtype': 'bfloat16'}
2021-07-13 00:57:54,016 INFO MainThread:327810 [wandb_run.py:_config_callback():872] config_cb None None {'dataset_name': None, 'dataset_config_name': None, 'train_file': None, 'validation_file': None, 'train_ref_file': None, 'validation_ref_file': None, 'overwrite_cache': False, 'validation_split_percentage': 5, 'max_seq_length': 4096, 'preprocessing_num_workers': 64, 'mlm_probability': 0.15, 'pad_to_max_length': False, 'line_by_line': False}
2021-07-13 01:00:22,944 INFO MainThread:327810 [wandb_run.py:_atexit_cleanup():1593] got exitcode: 255
2021-07-13 01:00:22,945 INFO MainThread:327810 [wandb_run.py:_restore():1565] restore
2021-07-13 01:00:25,397 INFO MainThread:327810 [wandb_run.py:_restore():1565] restore
wandb/run-20210713_005751-1wnn0lyf/run-1wnn0lyf.wandb
ADDED
Binary file (3.98 kB).