diff --git a/checkpoint_105000 b/checkpoint_105000 new file mode 100644 index 0000000000000000000000000000000000000000..01fba4e8bae60310487a71329f8e391f9d0541d0 --- /dev/null +++ b/checkpoint_105000 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd3729e2e9d09e233e2f4dfecac9dc1888f374b3614c8a092c1bc958fdab2ccf +size 1530270447 diff --git a/events.out.tfevents.1626362977.t1v-n-f5c06ea1-w-0.707091.3.v2 b/events.out.tfevents.1626362977.t1v-n-f5c06ea1-w-0.707091.3.v2 index 08728495c0aeeeead40b641231c8feb8ae8feac8..6d2a9d5024a48be5d568a1a27ddd5d5ab18449ac 100644 --- a/events.out.tfevents.1626362977.t1v-n-f5c06ea1-w-0.707091.3.v2 +++ b/events.out.tfevents.1626362977.t1v-n-f5c06ea1-w-0.707091.3.v2 @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:29705d778db21abb9e80a3472189629d86cd8247cacda98d58b20a30bd684e63 -size 1484145 +oid sha256:ff143b1f5efcedbe6cb99342667f8e6e2c855c4b038a867b719689a9052d49f9 +size 1491633 diff --git a/events.out.tfevents.1626368154.t1v-n-f5c06ea1-w-0.715071.3.v2 b/events.out.tfevents.1626368154.t1v-n-f5c06ea1-w-0.715071.3.v2 new file mode 100644 index 0000000000000000000000000000000000000000..00de2c98db411bac42ec3691f017a42acf2140c2 --- /dev/null +++ b/events.out.tfevents.1626368154.t1v-n-f5c06ea1-w-0.715071.3.v2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9278085ac62dbd79723fd51fc60c6044bcc9f3786ff81b54b11a5f5e9cdd4a8d +size 44508 diff --git a/events.out.tfevents.1626369005.t1v-n-f5c06ea1-w-0.717656.3.v2 b/events.out.tfevents.1626369005.t1v-n-f5c06ea1-w-0.717656.3.v2 new file mode 100644 index 0000000000000000000000000000000000000000..fd39b746a9949c39031121ec6b2298a079f2a8d3 --- /dev/null +++ b/events.out.tfevents.1626369005.t1v-n-f5c06ea1-w-0.717656.3.v2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:11918a96e9d622df1b0d2f1b7999d21bf87fd9b7b7c937dcecd680857aacdd68 +size 437715 diff --git a/events.out.tfevents.1626370906.t1v-n-f5c06ea1-w-0.721922.3.v2 b/events.out.tfevents.1626370906.t1v-n-f5c06ea1-w-0.721922.3.v2 new file mode 100644 index 0000000000000000000000000000000000000000..0fa16c46c7b109e1ba2c9d3e6301ebee3dc2153e --- /dev/null +++ b/events.out.tfevents.1626370906.t1v-n-f5c06ea1-w-0.721922.3.v2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5eebb8a50d7001a4d23af995647a2f73df34f476a7489826b8dee64940fa1597 +size 40 diff --git a/events.out.tfevents.1626371506.t1v-n-f5c06ea1-w-0.724375.3.v2 b/events.out.tfevents.1626371506.t1v-n-f5c06ea1-w-0.724375.3.v2 new file mode 100644 index 0000000000000000000000000000000000000000..0477e2071838f775fa80e3c17f09819fe013ff5a --- /dev/null +++ b/events.out.tfevents.1626371506.t1v-n-f5c06ea1-w-0.724375.3.v2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e90de36468e161351a0990822bb3945c86e791570c1c95ee000e62bcdd64e7d6 +size 14886 diff --git a/events.out.tfevents.1626372294.t1v-n-f5c06ea1-w-0.727475.3.v2 b/events.out.tfevents.1626372294.t1v-n-f5c06ea1-w-0.727475.3.v2 new file mode 100644 index 0000000000000000000000000000000000000000..0c27bb06f3e3618bbd50a86f7a2689d13c7989e7 --- /dev/null +++ b/events.out.tfevents.1626372294.t1v-n-f5c06ea1-w-0.727475.3.v2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b6b8f2fe86e2ccf16382846366ee00347b75c8b502be2e423a846260c6bae4f +size 40 diff --git a/events.out.tfevents.1626374797.t1v-n-f5c06ea1-w-0.731696.3.v2 b/events.out.tfevents.1626374797.t1v-n-f5c06ea1-w-0.731696.3.v2 new file mode 100644 index 0000000000000000000000000000000000000000..e066ee213706a29bedd599d3be78b5f351dcfae9 --- /dev/null +++ b/events.out.tfevents.1626374797.t1v-n-f5c06ea1-w-0.731696.3.v2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f583930fdc45029a3ba0585014533fd8ef9a4c87a21651e280f8759447faca8f +size 37561 diff --git a/events.out.tfevents.1626375524.t1v-n-f5c06ea1-w-0.734136.3.v2 b/events.out.tfevents.1626375524.t1v-n-f5c06ea1-w-0.734136.3.v2 new file mode 100644 index 0000000000000000000000000000000000000000..a666000720bbd33b4447fbf26ee7db00cbc381f2 --- /dev/null +++ b/events.out.tfevents.1626375524.t1v-n-f5c06ea1-w-0.734136.3.v2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:42934a5a9a64204b53646673742f1b60cb101307e03f942f13e21a364e9e5ac7 +size 752033 diff --git a/flax_model.msgpack b/flax_model.msgpack index e81476eddf42c43998dad1b1cdceeb60cc03d235..1d32bd8a3b8c7133b836a1b5b244ebba062e53f4 100644 --- a/flax_model.msgpack +++ b/flax_model.msgpack @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b89d2017401b78b9121ca751668fd0246d3593be3d565ee5e7b06d3829e4ec6a +oid sha256:e40ad9ce0279ffbd95b20a8774cb1469a1b0a3a27fe354529522addece034982 size 510090043 diff --git a/run.sh b/run.sh index 5da21a097d9113eed4b5d9cf2209b2e6566bc1ed..503c39dd8ba740aa733e0df7987c8fb61ad06012 100644 --- a/run.sh +++ b/run.sh @@ -15,14 +15,14 @@ python ./run_mlm_flax_no_accum.py \ --adam_beta1="0.9" \ --adam_beta2="0.98" \ --logging_steps="50" \ - --eval_steps="3000" \ + --eval_steps="6000" \ --num_train_epochs="2"\ --preprocessing_num_workers="96" \ - --save_steps="10000" \ + --save_steps="15000" \ --learning_rate="3e-5" \ --per_device_train_batch_size="1" \ --per_device_eval_batch_size="1" \ - --save_total_limit="5"\ + --save_total_limit="20"\ --max_eval_samples="4000"\ --resume_from_checkpoint="./"\ #--gradient_accumulation_steps="4"\ diff --git a/run_mlm_flax_no_accum.py b/run_mlm_flax_no_accum.py index 891defefe582ff8cd4b2bd653dfbe26ca6bca942..3ccadd8a883b70fea7a963a37ec7e8d604b16e94 100644 --- a/run_mlm_flax_no_accum.py +++ b/run_mlm_flax_no_accum.py @@ -689,9 +689,9 @@ if __name__ == "__main__": num_train_samples = len(tokenized_datasets["train"]) train_samples_idx = jax.random.permutation(input_rng, jnp.arange(num_train_samples)) train_batch_idx = generate_batch_splits(train_samples_idx, train_batch_size) - + hooks = [] # Gather the indexes for creating the batch and do a training step - for step, batch_idx in enumerate(tqdm(train_batch_idx, desc="Training...", position=1,initial=resume_step)): + for step, batch_idx in tqdm(enumerate(train_batch_idx,start=resume_step), desc="Training...", position=1): samples = [tokenized_datasets["train"][int(idx)] for idx in batch_idx] model_inputs = data_collator(samples, pad_to_multiple_of=16) @@ -699,8 +699,10 @@ if __name__ == "__main__": model_inputs = shard(model_inputs.data) state, train_metric, dropout_rngs = p_train_step(state, model_inputs, dropout_rngs) train_metrics.append(train_metric) - + cur_step = epoch * (num_train_samples // train_batch_size) + step + if cur_step == resume_step: + logging.info('Initial compilation completed.') #if cur_step < resume_step: # continue diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log index 2b4a7c7b8041e8793e52fae44e13fe45122b699c..adc3116f306257dd831935635e8f6247f15ccdbb 120000 --- a/wandb/debug-internal.log +++ b/wandb/debug-internal.log @@ -1 +1 @@ -run-20210715_152938-8qznp93p/logs/debug-internal.log \ No newline at end of file +run-20210715_185845-dq8uirtg/logs/debug-internal.log \ No newline at end of file diff --git a/wandb/debug.log b/wandb/debug.log index 939fd45558144d36e173523d86341743b912df53..ee4172bc4c6b684fbab100cc8961c856191d5238 120000 --- a/wandb/debug.log +++ b/wandb/debug.log @@ -1 +1 @@ -run-20210715_152938-8qznp93p/logs/debug.log \ No newline at end of file +run-20210715_185845-dq8uirtg/logs/debug.log \ No newline at end of file diff --git a/wandb/latest-run b/wandb/latest-run index 6b87fa6da51acea75a384a92553443ca3019c86d..b5d01191310b128b6efcf1c27fe5f8b30061fc97 120000 --- a/wandb/latest-run +++ b/wandb/latest-run @@ -1 +1 @@ -run-20210715_152938-8qznp93p \ No newline at end of file +run-20210715_185845-dq8uirtg \ No newline at end of file diff --git a/wandb/run-20210715_152938-8qznp93p/files/output.log b/wandb/run-20210715_152938-8qznp93p/files/output.log index 049e5ae1f92bf7634d6d877a7ba89b362e2597c8..d5fdbcc98a1246eb1305e709284424c170044652 100644 --- a/wandb/run-20210715_152938-8qznp93p/files/output.log +++ b/wandb/run-20210715_152938-8qznp93p/files/output.log @@ -1220,3 +1220,35 @@ tcmalloc: large alloc 1530273792 bytes == 0x31850a000 @ 0x7f3586844680 0x7f3586 [16:50:06] - INFO - absl - Saved checkpoint at checkpoint_10000 [16:50:07] - INFO - huggingface_hub.repository - git version 2.25.1 git-lfs/2.9.2 (GitHub; linux amd64; go 1.13.5) +[16:50:07] - DEBUG - huggingface_hub.repository - [Repository] is a valid git repo +[16:51:22] - INFO - huggingface_hub.repository - Uploading LFS objects: 100% (5/5), 2.0 GB | 43 MB/s, done. + + + +Training...: 28%|███████████████████████████████████████▉ | 100051/352766 [1:16:06<25:42:07, 2.73it/s] + + + + +Step... (9000 | Loss: 2.3799679279327393, Acc: 0.5589502453804016): 0%| | 0/2 [1:22:05 + write_train_metric(summary_writer, train_metrics, train_time, cur_step) + File "./run_mlm_flax_no_accum.py", line 263, in write_train_metric + train_metrics = get_metrics(train_metrics) + File "/home/dat/pino/lib/python3.8/site-packages/flax/training/common_utils.py", line 52, in get_metrics + metrics_np = jax.device_get(device_metrics) + File "/home/dat/pino/lib/python3.8/site-packages/jax/_src/api.py", line 2337, in device_get + return tree_map(_device_get, x) + File "/home/dat/pino/lib/python3.8/site-packages/jax/_src/tree_util.py", line 168, in tree_map + return treedef.unflatten(f(*xs) for xs in zip(*all_leaves)) + File "/home/dat/pino/lib/python3.8/site-packages/jax/_src/tree_util.py", line 168, in + return treedef.unflatten(f(*xs) for xs in zip(*all_leaves)) + File "/home/dat/pino/lib/python3.8/site-packages/jax/_src/api.py", line 2329, in _device_get + return copy() + File "/home/dat/pino/lib/python3.8/site-packages/jax/interpreters/xla.py", line 1221, in copy + return np.asarray(self) + File "/home/dat/pino/lib/python3.8/site-packages/numpy/core/_asarray.py", line 83, in asarray + return array(a, dtype, copy=False, order=order) + File "/home/dat/pino/lib/python3.8/site-packages/jax/interpreters/xla.py", line 1286, in __array__ + return np.asarray(self._value, dtype=dtype) diff --git a/wandb/run-20210715_152938-8qznp93p/files/wandb-summary.json b/wandb/run-20210715_152938-8qznp93p/files/wandb-summary.json index 83660311ced5e2f1332bb4a5609aab174f188d83..18c61e8612d881b6f9bda6eeb82c861cb72e2b24 100644 --- a/wandb/run-20210715_152938-8qznp93p/files/wandb-summary.json +++ b/wandb/run-20210715_152938-8qznp93p/files/wandb-summary.json @@ -1 +1 @@ -{"training_step": 10000, "learning_rate": 2.6118033929378726e-05, "train_loss": 2.5166258811950684, "_runtime": 4818, "_timestamp": 1626367796, "_step": 202, "eval_step": 9000, "eval_accuracy": 0.5589502453804016, "eval_loss": 2.3799679279327393} \ No newline at end of file +{"training_step": 10050, "learning_rate": 2.6115878426935524e-05, "train_loss": 2.5157060623168945, "_runtime": 4924, "_timestamp": 1626367902, "_step": 203, "eval_step": 9000, "eval_accuracy": 0.5589502453804016, "eval_loss": 2.3799679279327393} \ No newline at end of file diff --git a/wandb/run-20210715_152938-8qznp93p/logs/debug-internal.log b/wandb/run-20210715_152938-8qznp93p/logs/debug-internal.log index ef9603accc10bb20951bdb7666efdc88dda7fb7d..0841290a5b4d1e7f8477623c55c59347572bbcbf 100644 --- a/wandb/run-20210715_152938-8qznp93p/logs/debug-internal.log +++ b/wandb/run-20210715_152938-8qznp93p/logs/debug-internal.log @@ -2791,3 +2791,52 @@ 2021-07-15 16:50:16,636 DEBUG HandlerThread:708348 [handler.py:handle_request():124] handle_request: stop_status 2021-07-15 16:50:16,637 DEBUG SenderThread:708348 [sender.py:send_request():193] send_request: stop_status 2021-07-15 16:50:20,859 DEBUG SenderThread:708348 [sender.py:send():179] send: stats +2021-07-15 16:50:31,769 DEBUG HandlerThread:708348 [handler.py:handle_request():124] handle_request: stop_status +2021-07-15 16:50:31,770 DEBUG SenderThread:708348 [sender.py:send_request():193] send_request: stop_status +2021-07-15 16:50:46,904 DEBUG HandlerThread:708348 [handler.py:handle_request():124] handle_request: stop_status +2021-07-15 16:50:46,904 DEBUG SenderThread:708348 [sender.py:send_request():193] send_request: stop_status +2021-07-15 16:50:50,940 DEBUG SenderThread:708348 [sender.py:send():179] send: stats +2021-07-15 16:51:02,034 DEBUG HandlerThread:708348 [handler.py:handle_request():124] handle_request: stop_status +2021-07-15 16:51:02,035 DEBUG SenderThread:708348 [sender.py:send_request():193] send_request: stop_status +2021-07-15 16:51:17,167 DEBUG HandlerThread:708348 [handler.py:handle_request():124] handle_request: stop_status +2021-07-15 16:51:17,167 DEBUG SenderThread:708348 [sender.py:send_request():193] send_request: stop_status +2021-07-15 16:51:21,017 DEBUG SenderThread:708348 [sender.py:send():179] send: stats +2021-07-15 16:51:23,329 INFO Thread-8 :708348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210715_152938-8qznp93p/files/output.log +2021-07-15 16:51:25,330 INFO Thread-8 :708348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210715_152938-8qznp93p/files/output.log +2021-07-15 16:51:27,330 INFO Thread-8 :708348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210715_152938-8qznp93p/files/output.log +2021-07-15 16:51:29,331 INFO Thread-8 :708348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210715_152938-8qznp93p/files/output.log +2021-07-15 16:51:31,332 INFO Thread-8 :708348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210715_152938-8qznp93p/files/output.log +2021-07-15 16:51:32,298 DEBUG HandlerThread:708348 [handler.py:handle_request():124] handle_request: stop_status +2021-07-15 16:51:32,298 DEBUG SenderThread:708348 [sender.py:send_request():193] send_request: stop_status +2021-07-15 16:51:42,885 DEBUG SenderThread:708348 [sender.py:send():179] send: history +2021-07-15 16:51:42,886 DEBUG SenderThread:708348 [sender.py:send():179] send: summary +2021-07-15 16:51:42,886 INFO SenderThread:708348 [sender.py:_save_file():841] saving file wandb-summary.json with policy end +2021-07-15 16:51:43,337 INFO Thread-8 :708348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210715_152938-8qznp93p/files/wandb-summary.json +2021-07-15 16:51:45,338 INFO Thread-8 :708348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210715_152938-8qznp93p/files/output.log +2021-07-15 16:51:47,339 INFO Thread-8 :708348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210715_152938-8qznp93p/files/output.log +2021-07-15 16:51:47,437 DEBUG HandlerThread:708348 [handler.py:handle_request():124] handle_request: stop_status +2021-07-15 16:51:47,438 DEBUG SenderThread:708348 [sender.py:send_request():193] send_request: stop_status +2021-07-15 16:51:49,339 INFO Thread-8 :708348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210715_152938-8qznp93p/files/output.log +2021-07-15 16:51:51,095 DEBUG SenderThread:708348 [sender.py:send():179] send: stats +2021-07-15 16:51:51,340 INFO Thread-8 :708348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210715_152938-8qznp93p/files/output.log +2021-07-15 16:52:01,993 WARNING MainThread:708348 [internal.py:wandb_internal():147] Internal process interrupt: 1 +2021-07-15 16:52:03,090 WARNING MainThread:708348 [internal.py:wandb_internal():147] Internal process interrupt: 2 +2021-07-15 16:52:03,091 ERROR MainThread:708348 [internal.py:wandb_internal():150] Internal process interrupted. +2021-07-15 16:52:03,345 INFO Thread-8 :708348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210715_152938-8qznp93p/files/output.log +2021-07-15 16:52:03,763 INFO SenderThread:708348 [sender.py:finish():945] shutting down sender +2021-07-15 16:52:03,763 INFO WriterThread:708348 [datastore.py:close():288] close: /home/dat/pino-roberta-base/wandb/run-20210715_152938-8qznp93p/run-8qznp93p.wandb +2021-07-15 16:52:03,763 INFO SenderThread:708348 [dir_watcher.py:finish():282] shutting down directory watcher +2021-07-15 16:52:03,764 INFO HandlerThread:708348 [handler.py:finish():638] shutting down handler +2021-07-15 16:52:04,346 INFO SenderThread:708348 [dir_watcher.py:finish():312] scan: /home/dat/pino-roberta-base/wandb/run-20210715_152938-8qznp93p/files +2021-07-15 16:52:04,346 INFO SenderThread:708348 [dir_watcher.py:finish():318] scan save: /home/dat/pino-roberta-base/wandb/run-20210715_152938-8qznp93p/files/requirements.txt requirements.txt +2021-07-15 16:52:04,346 INFO SenderThread:708348 [dir_watcher.py:finish():318] scan save: /home/dat/pino-roberta-base/wandb/run-20210715_152938-8qznp93p/files/output.log output.log +2021-07-15 16:52:04,346 INFO SenderThread:708348 [dir_watcher.py:finish():318] scan save: /home/dat/pino-roberta-base/wandb/run-20210715_152938-8qznp93p/files/wandb-metadata.json wandb-metadata.json +2021-07-15 16:52:04,346 INFO SenderThread:708348 [dir_watcher.py:finish():318] scan save: /home/dat/pino-roberta-base/wandb/run-20210715_152938-8qznp93p/files/config.yaml config.yaml +2021-07-15 16:52:04,346 INFO SenderThread:708348 [dir_watcher.py:finish():318] scan save: /home/dat/pino-roberta-base/wandb/run-20210715_152938-8qznp93p/files/wandb-summary.json wandb-summary.json +2021-07-15 16:52:04,347 INFO SenderThread:708348 [file_pusher.py:finish():177] shutting down file pusher +2021-07-15 16:52:04,347 INFO SenderThread:708348 [file_pusher.py:join():182] waiting for file pusher +2021-07-15 16:52:04,799 INFO Thread-15 :708348 [upload_job.py:push():137] Uploaded file /home/dat/pino-roberta-base/wandb/run-20210715_152938-8qznp93p/files/wandb-summary.json +2021-07-15 16:52:04,811 INFO Thread-14 :708348 [upload_job.py:push():137] Uploaded file /home/dat/pino-roberta-base/wandb/run-20210715_152938-8qznp93p/files/config.yaml +2021-07-15 16:52:04,820 INFO Thread-13 :708348 [upload_job.py:push():137] Uploaded file /home/dat/pino-roberta-base/wandb/run-20210715_152938-8qznp93p/files/output.log +2021-07-15 16:52:04,835 INFO Thread-12 :708348 [upload_job.py:push():137] Uploaded file /home/dat/pino-roberta-base/wandb/run-20210715_152938-8qznp93p/files/requirements.txt +2021-07-15 16:52:05,617 INFO MainThread:708348 [internal.py:handle_exit():78] Internal process exited diff --git a/wandb/run-20210715_152938-8qznp93p/logs/debug.log b/wandb/run-20210715_152938-8qznp93p/logs/debug.log index f5cbdcd66e9e6ec60d8a13fc1a20079f1c4a9d73..5cd18f89aa0cdd228ce23982095fae8470986665 100644 --- a/wandb/run-20210715_152938-8qznp93p/logs/debug.log +++ b/wandb/run-20210715_152938-8qznp93p/logs/debug.log @@ -23,3 +23,5 @@ config: {} 2021-07-15 15:29:40,498 INFO MainThread:707091 [wandb_run.py:_config_callback():872] config_cb None None {'output_dir': './', 'overwrite_output_dir': True, 'do_train': False, 'do_eval': False, 'do_predict': False, 'evaluation_strategy': 'IntervalStrategy.NO', 'prediction_loss_only': False, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 1, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'learning_rate': 3e-05, 'weight_decay': 0.0095, 'adam_beta1': 0.9, 'adam_beta2': 0.98, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 2.0, 'max_steps': -1, 'lr_scheduler_type': 'SchedulerType.LINEAR', 'warmup_ratio': 0.0, 'warmup_steps': 10000, 'log_level': -1, 'log_level_replica': -1, 'log_on_each_node': True, 'logging_dir': './runs/Jul15_15-29-30_t1v-n-f5c06ea1-w-0', 'logging_strategy': 'IntervalStrategy.STEPS', 'logging_first_step': False, 'logging_steps': 50, 'save_strategy': 'IntervalStrategy.STEPS', 'save_steps': 10000, 'save_total_limit': 5, 'save_on_each_node': False, 'no_cuda': False, 'seed': 42, 'fp16': False, 'fp16_opt_level': 'O1', 'fp16_backend': 'auto', 'fp16_full_eval': False, 'local_rank': -1, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 3000, 'dataloader_num_workers': 0, 'past_index': -1, 'run_name': './', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'sharded_ddp': [], 'deepspeed': None, 'label_smoothing_factor': 0.0, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['tensorboard', 'wandb'], 'ddp_find_unused_parameters': None, 'dataloader_pin_memory': True, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': True, 'resume_from_checkpoint': './', 'push_to_hub_model_id': '', 'push_to_hub_organization': None, 'push_to_hub_token': None, 'mp_parameters': ''} 2021-07-15 15:29:40,500 INFO MainThread:707091 [wandb_run.py:_config_callback():872] config_cb None None {'model_name_or_path': None, 'model_type': 'big_bird', 'config_name': './', 'tokenizer_name': './', 'cache_dir': None, 'use_fast_tokenizer': True, 'dtype': 'float32'} 2021-07-15 15:29:40,501 INFO MainThread:707091 [wandb_run.py:_config_callback():872] config_cb None None {'dataset_name': None, 'dataset_config_name': None, 'train_ref_file': None, 'validation_ref_file': None, 'overwrite_cache': False, 'validation_split_percentage': 5, 'max_seq_length': 4096, 'preprocessing_num_workers': 96, 'mlm_probability': 0.15, 'pad_to_max_length': False, 'line_by_line': False, 'max_eval_samples': 4000} +2021-07-15 16:52:02,189 INFO MainThread:707091 [wandb_run.py:_atexit_cleanup():1593] got exitcode: 255 +2021-07-15 16:52:02,189 INFO MainThread:707091 [wandb_run.py:_restore():1565] restore diff --git a/wandb/run-20210715_152938-8qznp93p/run-8qznp93p.wandb b/wandb/run-20210715_152938-8qznp93p/run-8qznp93p.wandb index a2f89aa0e1aee829b3a70d7ef41e60939ef3d9e1..060e81d3712be349b7424beffd66cd541d211925 100644 Binary files a/wandb/run-20210715_152938-8qznp93p/run-8qznp93p.wandb and b/wandb/run-20210715_152938-8qznp93p/run-8qznp93p.wandb differ diff --git a/wandb/run-20210715_165555-25rtfw59/files/config.yaml b/wandb/run-20210715_165555-25rtfw59/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..313f768d6e7126e4428591d58005d38e18b5796f --- /dev/null +++ b/wandb/run-20210715_165555-25rtfw59/files/config.yaml @@ -0,0 +1,301 @@ +wandb_version: 1 + +_wandb: + desc: null + value: + cli_version: 0.10.33 + framework: huggingface + huggingface_version: 4.9.0.dev0 + is_jupyter_run: false + is_kaggle_kernel: false + python_version: 3.8.10 + t: + 1: + - 3 + - 11 + 4: 3.8.10 + 5: 0.10.33 + 6: 4.9.0.dev0 + 8: + - 5 +adafactor: + desc: null + value: false +adam_beta1: + desc: null + value: 0.9 +adam_beta2: + desc: null + value: 0.98 +adam_epsilon: + desc: null + value: 1.0e-08 +cache_dir: + desc: null + value: null +config_name: + desc: null + value: ./ +dataloader_drop_last: + desc: null + value: false +dataloader_num_workers: + desc: null + value: 0 +dataloader_pin_memory: + desc: null + value: true +dataset_config_name: + desc: null + value: null +dataset_name: + desc: null + value: null +ddp_find_unused_parameters: + desc: null + value: null +debug: + desc: null + value: [] +deepspeed: + desc: null + value: null +disable_tqdm: + desc: null + value: false +do_eval: + desc: null + value: false +do_predict: + desc: null + value: false +do_train: + desc: null + value: false +dtype: + desc: null + value: float32 +eval_accumulation_steps: + desc: null + value: null +eval_steps: + desc: null + value: 6000 +evaluation_strategy: + desc: null + value: IntervalStrategy.NO +fp16: + desc: null + value: false +fp16_backend: + desc: null + value: auto +fp16_full_eval: + desc: null + value: false +fp16_opt_level: + desc: null + value: O1 +gradient_accumulation_steps: + desc: null + value: 1 +greater_is_better: + desc: null + value: null +group_by_length: + desc: null + value: false +ignore_data_skip: + desc: null + value: false +label_names: + desc: null + value: null +label_smoothing_factor: + desc: null + value: 0.0 +learning_rate: + desc: null + value: 3.0e-05 +length_column_name: + desc: null + value: length +line_by_line: + desc: null + value: false +load_best_model_at_end: + desc: null + value: false +local_rank: + desc: null + value: -1 +log_level: + desc: null + value: -1 +log_level_replica: + desc: null + value: -1 +log_on_each_node: + desc: null + value: true +logging_dir: + desc: null + value: ./runs/Jul15_16-55-47_t1v-n-f5c06ea1-w-0 +logging_first_step: + desc: null + value: false +logging_steps: + desc: null + value: 50 +logging_strategy: + desc: null + value: IntervalStrategy.STEPS +lr_scheduler_type: + desc: null + value: SchedulerType.LINEAR +max_eval_samples: + desc: null + value: 4000 +max_grad_norm: + desc: null + value: 1.0 +max_seq_length: + desc: null + value: 4096 +max_steps: + desc: null + value: -1 +metric_for_best_model: + desc: null + value: null +mlm_probability: + desc: null + value: 0.15 +model_name_or_path: + desc: null + value: null +model_type: + desc: null + value: big_bird +mp_parameters: + desc: null + value: '' +no_cuda: + desc: null + value: false +num_train_epochs: + desc: null + value: 2.0 +output_dir: + desc: null + value: ./ +overwrite_cache: + desc: null + value: false +overwrite_output_dir: + desc: null + value: true +pad_to_max_length: + desc: null + value: false +past_index: + desc: null + value: -1 +per_device_eval_batch_size: + desc: null + value: 1 +per_device_train_batch_size: + desc: null + value: 1 +per_gpu_eval_batch_size: + desc: null + value: null +per_gpu_train_batch_size: + desc: null + value: null +prediction_loss_only: + desc: null + value: false +preprocessing_num_workers: + desc: null + value: 96 +push_to_hub: + desc: null + value: true +push_to_hub_model_id: + desc: null + value: '' +push_to_hub_organization: + desc: null + value: null +push_to_hub_token: + desc: null + value: null +remove_unused_columns: + desc: null + value: true +report_to: + desc: null + value: + - tensorboard + - wandb +resume_from_checkpoint: + desc: null + value: ./ +run_name: + desc: null + value: ./ +save_on_each_node: + desc: null + value: false +save_steps: + desc: null + value: 15000 +save_strategy: + desc: null + value: IntervalStrategy.STEPS +save_total_limit: + desc: null + value: 20 +seed: + desc: null + value: 42 +sharded_ddp: + desc: null + value: [] +skip_memory_metrics: + desc: null + value: true +tokenizer_name: + desc: null + value: ./ +tpu_metrics_debug: + desc: null + value: false +tpu_num_cores: + desc: null + value: null +train_ref_file: + desc: null + value: null +use_fast_tokenizer: + desc: null + value: true +use_legacy_prediction_loop: + desc: null + value: false +validation_ref_file: + desc: null + value: null +validation_split_percentage: + desc: null + value: 5 +warmup_ratio: + desc: null + value: 0.0 +warmup_steps: + desc: null + value: 10000 +weight_decay: + desc: null + value: 0.0095 diff --git a/wandb/run-20210715_165555-25rtfw59/files/output.log b/wandb/run-20210715_165555-25rtfw59/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..947f568734f22883a2079bc34804b8299a06ebb5 --- /dev/null +++ b/wandb/run-20210715_165555-25rtfw59/files/output.log @@ -0,0 +1,48 @@ +[16:56:11] - INFO - absl - Restoring checkpoint from ./checkpoint_10000 +tcmalloc: large alloc 1530273792 bytes == 0x9b046000 @ 0x7f018a0fa680 0x7f018a11b824 0x5b9a14 0x50b2ae 0x50cb1b 0x5a6f17 0x5f3010 0x56fd36 0x568d9a 0x5f5b33 0x56aadf 0x568d9a 0x68cdc7 0x67e161 0x67e1df 0x67e281 0x67e627 0x6b6e62 0x6b71ed 0x7f0189f0f0b3 0x5f96de +/home/dat/pino/lib/python3.8/site-packages/jax/lib/xla_bridge.py:386: UserWarning: jax.host_count has been renamed to jax.process_count. This alias will eventually be removed; please update your code. + warnings.warn( +/home/dat/pino/lib/python3.8/site-packages/jax/lib/xla_bridge.py:373: UserWarning: jax.host_id has been renamed to jax.process_index. This alias will eventually be removed; please update your code. + warnings.warn( +Epoch ... (1/2): 0%| | 0/2 [00:00 + for step, batch_idx in enumerate(tqdm(train_batch_idx, desc="Training...", position=1,start=resume_step)): + File "/home/dat/pino/lib/python3.8/site-packages/tqdm/std.py", line 1015, in __init__ + raise ( +tqdm.std.TqdmKeyError: "Unknown argument(s): {'start': 100002}" \ No newline at end of file diff --git a/wandb/run-20210715_174147-3nkn7hxg/files/requirements.txt b/wandb/run-20210715_174147-3nkn7hxg/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..554e3a34bc91714a8462f65295a12e9a04537637 --- /dev/null +++ b/wandb/run-20210715_174147-3nkn7hxg/files/requirements.txt @@ -0,0 +1,94 @@ +absl-py==0.13.0 +aiohttp==3.7.4.post0 +astunparse==1.6.3 +async-timeout==3.0.1 +attrs==21.2.0 +cachetools==4.2.2 +certifi==2021.5.30 +chardet==4.0.0 +charset-normalizer==2.0.1 +chex==0.0.8 +click==8.0.1 +configparser==5.0.2 +cycler==0.10.0 +datasets==1.9.1.dev0 +dill==0.3.4 +dm-tree==0.1.6 +docker-pycreds==0.4.0 +filelock==3.0.12 +flatbuffers==1.12 +flax==0.3.4 +fsspec==2021.7.0 +gast==0.4.0 +gitdb==4.0.7 +gitpython==3.1.18 +google-auth-oauthlib==0.4.4 +google-auth==1.32.1 +google-pasta==0.2.0 +grpcio==1.34.1 +h5py==3.1.0 +huggingface-hub==0.0.12 +idna==3.2 +install==1.3.4 +jax==0.2.17 +jaxlib==0.1.68 +joblib==1.0.1 +keras-nightly==2.5.0.dev2021032900 +keras-preprocessing==1.1.2 +kiwisolver==1.3.1 +libtpu-nightly==0.1.dev20210615 +markdown==3.3.4 +matplotlib==3.4.2 +msgpack==1.0.2 +multidict==5.1.0 +multiprocess==0.70.12.2 +numpy==1.19.5 +oauthlib==3.1.1 +opt-einsum==3.3.0 +optax==0.0.9 +packaging==21.0 +pandas==1.3.0 +pathtools==0.1.2 +pillow==8.3.1 +pip==20.0.2 +pkg-resources==0.0.0 +promise==2.3 +protobuf==3.17.3 +psutil==5.8.0 +pyarrow==4.0.1 +pyasn1-modules==0.2.8 +pyasn1==0.4.8 +pyparsing==2.4.7 +python-dateutil==2.8.1 +pytz==2021.1 +pyyaml==5.4.1 +regex==2021.7.6 +requests-oauthlib==1.3.0 +requests==2.26.0 +rsa==4.7.2 +sacremoses==0.0.45 +scipy==1.7.0 +sentry-sdk==1.3.0 +setuptools==44.0.0 +shortuuid==1.0.1 +six==1.15.0 +smmap==4.0.0 +subprocess32==3.5.4 +tensorboard-data-server==0.6.1 +tensorboard-plugin-wit==1.8.0 +tensorboard==2.5.0 +tensorflow-estimator==2.5.0 +tensorflow==2.5.0 +termcolor==1.1.0 +tokenizers==0.10.3 +toolz==0.11.1 +tqdm==4.61.2 +transformers==4.9.0.dev0 +typing-extensions==3.7.4.3 +urllib3==1.26.6 +wandb==0.10.33 +werkzeug==2.0.1 +wheel==0.36.2 +wrapt==1.12.1 +xxhash==2.0.2 +yarl==1.6.3 \ No newline at end of file diff --git a/wandb/run-20210715_174147-3nkn7hxg/files/wandb-metadata.json b/wandb/run-20210715_174147-3nkn7hxg/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..3eabaaf8bc762c514285acd99537bfa5a730ef86 --- /dev/null +++ b/wandb/run-20210715_174147-3nkn7hxg/files/wandb-metadata.json @@ -0,0 +1,45 @@ +{ + "os": "Linux-5.4.0-1043-gcp-x86_64-with-glibc2.29", + "python": "3.8.10", + "heartbeatAt": "2021-07-15T17:41:49.930191", + "startedAt": "2021-07-15T17:41:47.904489", + "docker": null, + "cpu_count": 96, + "cuda": null, + "args": [ + "--push_to_hub", + "--output_dir=./", + "--model_type=big_bird", + "--config_name=./", + "--tokenizer_name=./", + "--max_seq_length=4096", + "--weight_decay=0.0095", + "--warmup_steps=10000", + "--overwrite_output_dir", + "--adam_beta1=0.9", + "--adam_beta2=0.98", + "--logging_steps=50", + "--eval_steps=6000", + "--num_train_epochs=2", + "--preprocessing_num_workers=96", + "--save_steps=15000", + "--learning_rate=3e-5", + "--per_device_train_batch_size=1", + "--per_device_eval_batch_size=1", + "--save_total_limit=20", + "--max_eval_samples=4000", + "--resume_from_checkpoint=./" + ], + "state": "running", + "program": "./run_mlm_flax_no_accum.py", + "codePath": "run_mlm_flax_no_accum.py", + "git": { + "remote": "https://huggingface.co/flax-community/pino-roberta-base", + "commit": "cc569aecf5e26454416d7a13c7876ad9111120cf" + }, + "email": null, + "root": "/home/dat/pino-roberta-base", + "host": "t1v-n-f5c06ea1-w-0", + "username": "dat", + "executable": "/home/dat/pino/bin/python" +} diff --git a/wandb/run-20210715_174147-3nkn7hxg/files/wandb-summary.json b/wandb/run-20210715_174147-3nkn7hxg/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..9e26dfeeb6e641a33dae4961196235bdb965b21b --- /dev/null +++ b/wandb/run-20210715_174147-3nkn7hxg/files/wandb-summary.json @@ -0,0 +1 @@ +{} \ No newline at end of file diff --git a/wandb/run-20210715_174147-3nkn7hxg/logs/debug-internal.log b/wandb/run-20210715_174147-3nkn7hxg/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..f4278b8393f43119c7ff839f4219706bf71346cc --- /dev/null +++ b/wandb/run-20210715_174147-3nkn7hxg/logs/debug-internal.log @@ -0,0 +1,195 @@ +2021-07-15 17:41:48,594 INFO MainThread:723175 [internal.py:wandb_internal():88] W&B internal server running at pid: 723175, started at: 2021-07-15 17:41:48.594473 +2021-07-15 17:41:48,596 DEBUG HandlerThread:723175 [handler.py:handle_request():124] handle_request: check_version +2021-07-15 17:41:48,597 INFO WriterThread:723175 [datastore.py:open_for_write():80] open: /home/dat/pino-roberta-base/wandb/run-20210715_174147-3nkn7hxg/run-3nkn7hxg.wandb +2021-07-15 17:41:48,598 DEBUG SenderThread:723175 [sender.py:send():179] send: header +2021-07-15 17:41:48,598 DEBUG SenderThread:723175 [sender.py:send_request():193] send_request: check_version +2021-07-15 17:41:48,634 DEBUG SenderThread:723175 [sender.py:send():179] send: run +2021-07-15 17:41:48,804 INFO SenderThread:723175 [dir_watcher.py:__init__():168] watching files in: /home/dat/pino-roberta-base/wandb/run-20210715_174147-3nkn7hxg/files +2021-07-15 17:41:48,805 INFO SenderThread:723175 [sender.py:_start_run_threads():716] run started: 3nkn7hxg with start time 1626370908 +2021-07-15 17:41:48,805 DEBUG SenderThread:723175 [sender.py:send():179] send: summary +2021-07-15 17:41:48,805 INFO SenderThread:723175 [sender.py:_save_file():841] saving file wandb-summary.json with policy end +2021-07-15 17:41:48,805 DEBUG HandlerThread:723175 [handler.py:handle_request():124] handle_request: run_start +2021-07-15 17:41:49,817 INFO Thread-8 :723175 [dir_watcher.py:_on_file_created():216] file/dir created: /home/dat/pino-roberta-base/wandb/run-20210715_174147-3nkn7hxg/files/wandb-summary.json +2021-07-15 17:41:49,929 DEBUG HandlerThread:723175 [meta.py:__init__():39] meta init +2021-07-15 17:41:49,930 DEBUG HandlerThread:723175 [meta.py:__init__():53] meta init done +2021-07-15 17:41:49,930 DEBUG HandlerThread:723175 [meta.py:probe():210] probe +2021-07-15 17:41:49,931 DEBUG HandlerThread:723175 [meta.py:_setup_git():200] setup git +2021-07-15 17:41:49,960 DEBUG HandlerThread:723175 [meta.py:_setup_git():207] setup git done +2021-07-15 17:41:49,961 DEBUG HandlerThread:723175 [meta.py:_save_pip():57] save pip +2021-07-15 17:41:49,961 DEBUG HandlerThread:723175 [meta.py:_save_pip():71] save pip done +2021-07-15 17:41:49,961 DEBUG HandlerThread:723175 [meta.py:probe():252] probe done +2021-07-15 17:41:49,964 DEBUG SenderThread:723175 [sender.py:send():179] send: files +2021-07-15 17:41:49,964 INFO SenderThread:723175 [sender.py:_save_file():841] saving file wandb-metadata.json with policy now +2021-07-15 17:41:49,970 DEBUG HandlerThread:723175 [handler.py:handle_request():124] handle_request: stop_status +2021-07-15 17:41:49,971 DEBUG SenderThread:723175 [sender.py:send_request():193] send_request: stop_status +2021-07-15 17:41:50,099 DEBUG SenderThread:723175 [sender.py:send():179] send: config +2021-07-15 17:41:50,100 DEBUG SenderThread:723175 [sender.py:send():179] send: config +2021-07-15 17:41:50,100 DEBUG SenderThread:723175 [sender.py:send():179] send: config +2021-07-15 17:41:50,404 INFO Thread-11 :723175 [upload_job.py:push():137] Uploaded file /tmp/tmplksa3t0ywandb/5lvlwhj5-wandb-metadata.json +2021-07-15 17:41:50,815 INFO Thread-8 :723175 [dir_watcher.py:_on_file_created():216] file/dir created: /home/dat/pino-roberta-base/wandb/run-20210715_174147-3nkn7hxg/files/requirements.txt +2021-07-15 17:41:50,816 INFO Thread-8 :723175 [dir_watcher.py:_on_file_created():216] file/dir created: /home/dat/pino-roberta-base/wandb/run-20210715_174147-3nkn7hxg/files/output.log +2021-07-15 17:41:50,816 INFO Thread-8 :723175 [dir_watcher.py:_on_file_created():216] file/dir created: /home/dat/pino-roberta-base/wandb/run-20210715_174147-3nkn7hxg/files/wandb-metadata.json +2021-07-15 17:42:04,821 INFO Thread-8 :723175 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210715_174147-3nkn7hxg/files/output.log +2021-07-15 17:42:05,202 DEBUG HandlerThread:723175 [handler.py:handle_request():124] handle_request: stop_status +2021-07-15 17:42:05,203 DEBUG SenderThread:723175 [sender.py:send_request():193] send_request: stop_status +2021-07-15 17:42:06,821 INFO Thread-8 :723175 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210715_174147-3nkn7hxg/files/output.log +2021-07-15 17:42:08,822 INFO Thread-8 :723175 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210715_174147-3nkn7hxg/files/output.log +2021-07-15 17:42:18,016 DEBUG SenderThread:723175 [sender.py:send():179] send: stats +2021-07-15 17:42:19,827 INFO Thread-8 :723175 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210715_174147-3nkn7hxg/files/config.yaml +2021-07-15 17:42:20,347 DEBUG HandlerThread:723175 [handler.py:handle_request():124] handle_request: stop_status +2021-07-15 17:42:20,347 DEBUG SenderThread:723175 [sender.py:send_request():193] send_request: stop_status +2021-07-15 17:42:35,479 DEBUG HandlerThread:723175 [handler.py:handle_request():124] handle_request: stop_status +2021-07-15 17:42:35,480 DEBUG SenderThread:723175 [sender.py:send_request():193] send_request: stop_status +2021-07-15 17:42:48,096 DEBUG SenderThread:723175 [sender.py:send():179] send: stats +2021-07-15 17:42:50,612 DEBUG HandlerThread:723175 [handler.py:handle_request():124] handle_request: stop_status +2021-07-15 17:42:50,612 DEBUG SenderThread:723175 [sender.py:send_request():193] send_request: stop_status +2021-07-15 17:43:05,746 DEBUG HandlerThread:723175 [handler.py:handle_request():124] handle_request: stop_status +2021-07-15 17:43:05,747 DEBUG SenderThread:723175 [sender.py:send_request():193] send_request: stop_status +2021-07-15 17:43:18,173 DEBUG SenderThread:723175 [sender.py:send():179] send: stats +2021-07-15 17:43:20,877 DEBUG HandlerThread:723175 [handler.py:handle_request():124] handle_request: stop_status +2021-07-15 17:43:20,878 DEBUG SenderThread:723175 [sender.py:send_request():193] send_request: stop_status +2021-07-15 17:43:36,007 DEBUG HandlerThread:723175 [handler.py:handle_request():124] handle_request: stop_status +2021-07-15 17:43:36,008 DEBUG SenderThread:723175 [sender.py:send_request():193] send_request: stop_status +2021-07-15 17:43:48,239 DEBUG SenderThread:723175 [sender.py:send():179] send: stats +2021-07-15 17:43:51,139 DEBUG HandlerThread:723175 [handler.py:handle_request():124] handle_request: stop_status +2021-07-15 17:43:51,139 DEBUG SenderThread:723175 [sender.py:send_request():193] send_request: stop_status +2021-07-15 17:44:06,269 DEBUG HandlerThread:723175 [handler.py:handle_request():124] handle_request: stop_status +2021-07-15 17:44:06,269 DEBUG SenderThread:723175 [sender.py:send_request():193] send_request: stop_status +2021-07-15 17:44:18,311 DEBUG SenderThread:723175 [sender.py:send():179] send: stats +2021-07-15 17:44:21,400 DEBUG HandlerThread:723175 [handler.py:handle_request():124] handle_request: stop_status +2021-07-15 17:44:21,400 DEBUG SenderThread:723175 [sender.py:send_request():193] send_request: stop_status +2021-07-15 17:44:36,530 DEBUG HandlerThread:723175 [handler.py:handle_request():124] handle_request: stop_status +2021-07-15 17:44:36,530 DEBUG SenderThread:723175 [sender.py:send_request():193] send_request: stop_status +2021-07-15 17:44:48,385 DEBUG SenderThread:723175 [sender.py:send():179] send: stats +2021-07-15 17:44:51,661 DEBUG HandlerThread:723175 [handler.py:handle_request():124] handle_request: stop_status +2021-07-15 17:44:51,662 DEBUG SenderThread:723175 [sender.py:send_request():193] send_request: stop_status +2021-07-15 17:45:06,796 DEBUG HandlerThread:723175 [handler.py:handle_request():124] handle_request: stop_status +2021-07-15 17:45:06,796 DEBUG SenderThread:723175 [sender.py:send_request():193] send_request: stop_status +2021-07-15 17:45:18,460 DEBUG SenderThread:723175 [sender.py:send():179] send: stats +2021-07-15 17:45:21,928 DEBUG HandlerThread:723175 [handler.py:handle_request():124] handle_request: stop_status +2021-07-15 17:45:21,928 DEBUG SenderThread:723175 [sender.py:send_request():193] send_request: stop_status +2021-07-15 17:45:37,059 DEBUG HandlerThread:723175 [handler.py:handle_request():124] handle_request: stop_status +2021-07-15 17:45:37,059 DEBUG SenderThread:723175 [sender.py:send_request():193] send_request: stop_status +2021-07-15 17:45:48,530 DEBUG SenderThread:723175 [sender.py:send():179] send: stats +2021-07-15 17:45:52,197 DEBUG HandlerThread:723175 [handler.py:handle_request():124] handle_request: stop_status +2021-07-15 17:45:52,198 DEBUG SenderThread:723175 [sender.py:send_request():193] send_request: stop_status +2021-07-15 17:46:07,329 DEBUG HandlerThread:723175 [handler.py:handle_request():124] handle_request: stop_status +2021-07-15 17:46:07,329 DEBUG SenderThread:723175 [sender.py:send_request():193] send_request: stop_status +2021-07-15 17:46:18,600 DEBUG SenderThread:723175 [sender.py:send():179] send: stats +2021-07-15 17:46:22,464 DEBUG HandlerThread:723175 [handler.py:handle_request():124] handle_request: stop_status +2021-07-15 17:46:22,464 DEBUG SenderThread:723175 [sender.py:send_request():193] send_request: stop_status +2021-07-15 17:46:37,595 DEBUG HandlerThread:723175 [handler.py:handle_request():124] handle_request: stop_status +2021-07-15 17:46:37,595 DEBUG SenderThread:723175 [sender.py:send_request():193] send_request: stop_status +2021-07-15 17:46:48,677 DEBUG SenderThread:723175 [sender.py:send():179] send: stats +2021-07-15 17:46:52,729 DEBUG HandlerThread:723175 [handler.py:handle_request():124] handle_request: stop_status +2021-07-15 17:46:52,729 DEBUG SenderThread:723175 [sender.py:send_request():193] send_request: stop_status +2021-07-15 17:47:07,861 DEBUG HandlerThread:723175 [handler.py:handle_request():124] handle_request: stop_status +2021-07-15 17:47:07,862 DEBUG SenderThread:723175 [sender.py:send_request():193] send_request: stop_status +2021-07-15 17:47:18,753 DEBUG SenderThread:723175 [sender.py:send():179] send: stats +2021-07-15 17:47:22,995 DEBUG HandlerThread:723175 [handler.py:handle_request():124] handle_request: stop_status +2021-07-15 17:47:22,995 DEBUG SenderThread:723175 [sender.py:send_request():193] send_request: stop_status +2021-07-15 17:47:31,673 DEBUG HandlerThread:723175 [handler.py:handle_request():124] handle_request: poll_exit +2021-07-15 17:47:31,673 DEBUG SenderThread:723175 [sender.py:send():179] send: telemetry +2021-07-15 17:47:31,673 DEBUG SenderThread:723175 [sender.py:send():179] send: exit +2021-07-15 17:47:31,673 INFO SenderThread:723175 [sender.py:send_exit():287] handling exit code: 1 +2021-07-15 17:47:31,674 INFO SenderThread:723175 [sender.py:send_exit():295] send defer +2021-07-15 17:47:31,674 DEBUG SenderThread:723175 [sender.py:send_request():193] send_request: poll_exit +2021-07-15 17:47:31,674 DEBUG HandlerThread:723175 [handler.py:handle_request():124] handle_request: defer +2021-07-15 17:47:31,674 INFO HandlerThread:723175 [handler.py:handle_request_defer():141] handle defer: 0 +2021-07-15 17:47:31,674 DEBUG SenderThread:723175 [sender.py:send_request():193] send_request: defer +2021-07-15 17:47:31,674 INFO SenderThread:723175 [sender.py:send_request_defer():304] handle sender defer: 0 +2021-07-15 17:47:31,675 INFO SenderThread:723175 [sender.py:transition_state():308] send defer: 1 +2021-07-15 17:47:31,675 DEBUG HandlerThread:723175 [handler.py:handle_request():124] handle_request: defer +2021-07-15 17:47:31,675 INFO HandlerThread:723175 [handler.py:handle_request_defer():141] handle defer: 1 +2021-07-15 17:47:31,686 DEBUG SenderThread:723175 [sender.py:send_request():193] send_request: defer +2021-07-15 17:47:31,686 INFO SenderThread:723175 [sender.py:send_request_defer():304] handle sender defer: 1 +2021-07-15 17:47:31,686 INFO SenderThread:723175 [sender.py:transition_state():308] send defer: 2 +2021-07-15 17:47:31,686 DEBUG SenderThread:723175 [sender.py:send():179] send: stats +2021-07-15 17:47:31,687 DEBUG HandlerThread:723175 [handler.py:handle_request():124] handle_request: defer +2021-07-15 17:47:31,687 INFO HandlerThread:723175 [handler.py:handle_request_defer():141] handle defer: 2 +2021-07-15 17:47:31,687 DEBUG SenderThread:723175 [sender.py:send_request():193] send_request: defer +2021-07-15 17:47:31,687 INFO SenderThread:723175 [sender.py:send_request_defer():304] handle sender defer: 2 +2021-07-15 17:47:31,687 INFO SenderThread:723175 [sender.py:transition_state():308] send defer: 3 +2021-07-15 17:47:31,687 DEBUG HandlerThread:723175 [handler.py:handle_request():124] handle_request: defer +2021-07-15 17:47:31,687 INFO HandlerThread:723175 [handler.py:handle_request_defer():141] handle defer: 3 +2021-07-15 17:47:31,687 DEBUG SenderThread:723175 [sender.py:send():179] send: summary +2021-07-15 17:47:31,688 INFO SenderThread:723175 [sender.py:_save_file():841] saving file wandb-summary.json with policy end +2021-07-15 17:47:31,688 DEBUG SenderThread:723175 [sender.py:send_request():193] send_request: defer +2021-07-15 17:47:31,688 INFO SenderThread:723175 [sender.py:send_request_defer():304] handle sender defer: 3 +2021-07-15 17:47:31,688 INFO SenderThread:723175 [sender.py:transition_state():308] send defer: 4 +2021-07-15 17:47:31,688 DEBUG HandlerThread:723175 [handler.py:handle_request():124] handle_request: defer +2021-07-15 17:47:31,688 INFO HandlerThread:723175 [handler.py:handle_request_defer():141] handle defer: 4 +2021-07-15 17:47:31,688 DEBUG SenderThread:723175 [sender.py:send_request():193] send_request: defer +2021-07-15 17:47:31,688 INFO SenderThread:723175 [sender.py:send_request_defer():304] handle sender defer: 4 +2021-07-15 17:47:31,776 DEBUG HandlerThread:723175 [handler.py:handle_request():124] handle_request: poll_exit +2021-07-15 17:47:31,881 INFO SenderThread:723175 [sender.py:transition_state():308] send defer: 5 +2021-07-15 17:47:31,881 DEBUG SenderThread:723175 [sender.py:send_request():193] send_request: poll_exit +2021-07-15 17:47:31,881 DEBUG HandlerThread:723175 [handler.py:handle_request():124] handle_request: defer +2021-07-15 17:47:31,881 INFO HandlerThread:723175 [handler.py:handle_request_defer():141] handle defer: 5 +2021-07-15 17:47:31,882 DEBUG SenderThread:723175 [sender.py:send_request():193] send_request: defer +2021-07-15 17:47:31,882 INFO SenderThread:723175 [sender.py:send_request_defer():304] handle sender defer: 5 +2021-07-15 17:47:31,882 INFO SenderThread:723175 [dir_watcher.py:finish():282] shutting down directory watcher +2021-07-15 17:47:31,891 INFO SenderThread:723175 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210715_174147-3nkn7hxg/files/config.yaml +2021-07-15 17:47:31,891 INFO SenderThread:723175 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210715_174147-3nkn7hxg/files/wandb-summary.json +2021-07-15 17:47:31,891 INFO SenderThread:723175 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/dat/pino-roberta-base/wandb/run-20210715_174147-3nkn7hxg/files/output.log +2021-07-15 17:47:31,891 INFO SenderThread:723175 [dir_watcher.py:finish():312] scan: /home/dat/pino-roberta-base/wandb/run-20210715_174147-3nkn7hxg/files +2021-07-15 17:47:31,891 INFO SenderThread:723175 [dir_watcher.py:finish():318] scan save: /home/dat/pino-roberta-base/wandb/run-20210715_174147-3nkn7hxg/files/requirements.txt requirements.txt +2021-07-15 17:47:31,892 INFO SenderThread:723175 [dir_watcher.py:finish():318] scan save: /home/dat/pino-roberta-base/wandb/run-20210715_174147-3nkn7hxg/files/output.log output.log +2021-07-15 17:47:31,892 INFO SenderThread:723175 [dir_watcher.py:finish():318] scan save: /home/dat/pino-roberta-base/wandb/run-20210715_174147-3nkn7hxg/files/wandb-metadata.json wandb-metadata.json +2021-07-15 17:47:31,892 INFO SenderThread:723175 [dir_watcher.py:finish():318] scan save: /home/dat/pino-roberta-base/wandb/run-20210715_174147-3nkn7hxg/files/config.yaml config.yaml +2021-07-15 17:47:31,892 INFO SenderThread:723175 [dir_watcher.py:finish():318] scan save: /home/dat/pino-roberta-base/wandb/run-20210715_174147-3nkn7hxg/files/wandb-summary.json wandb-summary.json +2021-07-15 17:47:31,895 INFO SenderThread:723175 [sender.py:transition_state():308] send defer: 6 +2021-07-15 17:47:31,896 DEBUG HandlerThread:723175 [handler.py:handle_request():124] handle_request: defer +2021-07-15 17:47:31,896 INFO HandlerThread:723175 [handler.py:handle_request_defer():141] handle defer: 6 +2021-07-15 17:47:31,896 DEBUG SenderThread:723175 [sender.py:send_request():193] send_request: defer +2021-07-15 17:47:31,896 INFO SenderThread:723175 [sender.py:send_request_defer():304] handle sender defer: 6 +2021-07-15 17:47:31,896 INFO SenderThread:723175 [file_pusher.py:finish():177] shutting down file pusher +2021-07-15 17:47:31,983 DEBUG HandlerThread:723175 [handler.py:handle_request():124] handle_request: poll_exit +2021-07-15 17:47:31,983 DEBUG SenderThread:723175 [sender.py:send_request():193] send_request: poll_exit +2021-07-15 17:47:32,085 DEBUG HandlerThread:723175 [handler.py:handle_request():124] handle_request: poll_exit +2021-07-15 17:47:32,085 DEBUG SenderThread:723175 [sender.py:send_request():193] send_request: poll_exit +2021-07-15 17:47:32,187 DEBUG HandlerThread:723175 [handler.py:handle_request():124] handle_request: poll_exit +2021-07-15 17:47:32,187 DEBUG SenderThread:723175 [sender.py:send_request():193] send_request: poll_exit +2021-07-15 17:47:32,289 DEBUG HandlerThread:723175 [handler.py:handle_request():124] handle_request: poll_exit +2021-07-15 17:47:32,289 DEBUG SenderThread:723175 [sender.py:send_request():193] send_request: poll_exit +2021-07-15 17:47:32,333 INFO Thread-12 :723175 [upload_job.py:push():137] Uploaded file /home/dat/pino-roberta-base/wandb/run-20210715_174147-3nkn7hxg/files/requirements.txt +2021-07-15 17:47:32,347 INFO Thread-13 :723175 [upload_job.py:push():137] Uploaded file /home/dat/pino-roberta-base/wandb/run-20210715_174147-3nkn7hxg/files/output.log +2021-07-15 17:47:32,391 DEBUG HandlerThread:723175 [handler.py:handle_request():124] handle_request: poll_exit +2021-07-15 17:47:32,391 DEBUG SenderThread:723175 [sender.py:send_request():193] send_request: poll_exit +2021-07-15 17:47:32,408 INFO Thread-14 :723175 [upload_job.py:push():137] Uploaded file /home/dat/pino-roberta-base/wandb/run-20210715_174147-3nkn7hxg/files/config.yaml +2021-07-15 17:47:32,444 INFO Thread-15 :723175 [upload_job.py:push():137] Uploaded file /home/dat/pino-roberta-base/wandb/run-20210715_174147-3nkn7hxg/files/wandb-summary.json +2021-07-15 17:47:32,493 DEBUG HandlerThread:723175 [handler.py:handle_request():124] handle_request: poll_exit +2021-07-15 17:47:32,493 DEBUG SenderThread:723175 [sender.py:send_request():193] send_request: poll_exit +2021-07-15 17:47:32,595 DEBUG HandlerThread:723175 [handler.py:handle_request():124] handle_request: poll_exit +2021-07-15 17:47:32,595 DEBUG SenderThread:723175 [sender.py:send_request():193] send_request: poll_exit +2021-07-15 17:47:32,645 INFO Thread-7 :723175 [sender.py:transition_state():308] send defer: 7 +2021-07-15 17:47:32,645 DEBUG HandlerThread:723175 [handler.py:handle_request():124] handle_request: defer +2021-07-15 17:47:32,645 INFO HandlerThread:723175 [handler.py:handle_request_defer():141] handle defer: 7 +2021-07-15 17:47:32,646 DEBUG SenderThread:723175 [sender.py:send_request():193] send_request: defer +2021-07-15 17:47:32,646 INFO SenderThread:723175 [sender.py:send_request_defer():304] handle sender defer: 7 +2021-07-15 17:47:32,697 DEBUG HandlerThread:723175 [handler.py:handle_request():124] handle_request: poll_exit +2021-07-15 17:47:32,951 INFO SenderThread:723175 [sender.py:transition_state():308] send defer: 8 +2021-07-15 17:47:32,951 DEBUG SenderThread:723175 [sender.py:send_request():193] send_request: poll_exit +2021-07-15 17:47:32,952 DEBUG HandlerThread:723175 [handler.py:handle_request():124] handle_request: defer +2021-07-15 17:47:32,952 INFO HandlerThread:723175 [handler.py:handle_request_defer():141] handle defer: 8 +2021-07-15 17:47:32,952 DEBUG SenderThread:723175 [sender.py:send_request():193] send_request: defer +2021-07-15 17:47:32,952 INFO SenderThread:723175 [sender.py:send_request_defer():304] handle sender defer: 8 +2021-07-15 17:47:32,952 INFO SenderThread:723175 [sender.py:transition_state():308] send defer: 9 +2021-07-15 17:47:32,953 DEBUG HandlerThread:723175 [handler.py:handle_request():124] handle_request: defer +2021-07-15 17:47:32,953 INFO HandlerThread:723175 [handler.py:handle_request_defer():141] handle defer: 9 +2021-07-15 17:47:32,953 DEBUG SenderThread:723175 [sender.py:send():179] send: final +2021-07-15 17:47:32,953 DEBUG SenderThread:723175 [sender.py:send():179] send: footer +2021-07-15 17:47:32,953 DEBUG SenderThread:723175 [sender.py:send_request():193] send_request: defer +2021-07-15 17:47:32,953 INFO SenderThread:723175 [sender.py:send_request_defer():304] handle sender defer: 9 +2021-07-15 17:47:33,053 DEBUG HandlerThread:723175 [handler.py:handle_request():124] handle_request: poll_exit +2021-07-15 17:47:33,053 DEBUG SenderThread:723175 [sender.py:send_request():193] send_request: poll_exit +2021-07-15 17:47:33,053 INFO SenderThread:723175 [file_pusher.py:join():182] waiting for file pusher +2021-07-15 17:47:33,055 DEBUG HandlerThread:723175 [handler.py:handle_request():124] handle_request: get_summary +2021-07-15 17:47:33,055 DEBUG HandlerThread:723175 [handler.py:handle_request():124] handle_request: sampled_history +2021-07-15 17:47:33,056 DEBUG HandlerThread:723175 [handler.py:handle_request():124] handle_request: shutdown +2021-07-15 17:47:33,056 INFO HandlerThread:723175 [handler.py:finish():638] shutting down handler +2021-07-15 17:47:33,953 INFO WriterThread:723175 [datastore.py:close():288] close: /home/dat/pino-roberta-base/wandb/run-20210715_174147-3nkn7hxg/run-3nkn7hxg.wandb +2021-07-15 17:47:34,054 INFO SenderThread:723175 [sender.py:finish():945] shutting down sender +2021-07-15 17:47:34,054 INFO SenderThread:723175 [file_pusher.py:finish():177] shutting down file pusher +2021-07-15 17:47:34,054 INFO SenderThread:723175 [file_pusher.py:join():182] waiting for file pusher +2021-07-15 17:47:34,056 INFO MainThread:723175 [internal.py:handle_exit():78] Internal process exited diff --git a/wandb/run-20210715_174147-3nkn7hxg/logs/debug.log b/wandb/run-20210715_174147-3nkn7hxg/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..affc03c9cab77edd23a12ed219249961bf797b5b --- /dev/null +++ b/wandb/run-20210715_174147-3nkn7hxg/logs/debug.log @@ -0,0 +1,119 @@ +2021-07-15 17:41:47,906 INFO MainThread:721922 [wandb_setup.py:_flush():69] setting env: {} +2021-07-15 17:41:47,906 INFO MainThread:721922 [wandb_setup.py:_flush():69] setting login settings: {} +2021-07-15 17:41:47,906 INFO MainThread:721922 [wandb_init.py:_log_setup():337] Logging user logs to /home/dat/pino-roberta-base/wandb/run-20210715_174147-3nkn7hxg/logs/debug.log +2021-07-15 17:41:47,906 INFO MainThread:721922 [wandb_init.py:_log_setup():338] Logging internal logs to /home/dat/pino-roberta-base/wandb/run-20210715_174147-3nkn7hxg/logs/debug-internal.log +2021-07-15 17:41:47,906 INFO MainThread:721922 [wandb_init.py:init():370] calling init triggers +2021-07-15 17:41:47,906 INFO MainThread:721922 [wandb_init.py:init():375] wandb.init called with sweep_config: {} +config: {} +2021-07-15 17:41:47,906 INFO MainThread:721922 [wandb_init.py:init():419] starting backend +2021-07-15 17:41:47,906 INFO MainThread:721922 [backend.py:_multiprocessing_setup():70] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2021-07-15 17:41:47,954 INFO MainThread:721922 [backend.py:ensure_launched():135] starting backend process... +2021-07-15 17:41:48,000 INFO MainThread:721922 [backend.py:ensure_launched():139] started backend process with pid: 723175 +2021-07-15 17:41:48,002 INFO MainThread:721922 [wandb_init.py:init():424] backend started and connected +2021-07-15 17:41:48,005 INFO MainThread:721922 [wandb_init.py:init():472] updated telemetry +2021-07-15 17:41:48,006 INFO MainThread:721922 [wandb_init.py:init():491] communicating current version +2021-07-15 17:41:48,633 INFO MainThread:721922 [wandb_init.py:init():496] got version response +2021-07-15 17:41:48,633 INFO MainThread:721922 [wandb_init.py:init():504] communicating run to backend with 30 second timeout +2021-07-15 17:41:48,805 INFO MainThread:721922 [wandb_init.py:init():529] starting run threads in backend +2021-07-15 17:41:49,968 INFO MainThread:721922 [wandb_run.py:_console_start():1623] atexit reg +2021-07-15 17:41:49,968 INFO MainThread:721922 [wandb_run.py:_redirect():1497] redirect: SettingsConsole.REDIRECT +2021-07-15 17:41:49,969 INFO MainThread:721922 [wandb_run.py:_redirect():1502] Redirecting console. +2021-07-15 17:41:49,971 INFO MainThread:721922 [wandb_run.py:_redirect():1558] Redirects installed. +2021-07-15 17:41:49,971 INFO MainThread:721922 [wandb_init.py:init():554] run started, returning control to user process +2021-07-15 17:41:49,977 INFO MainThread:721922 [wandb_run.py:_config_callback():872] config_cb None None {'output_dir': './', 'overwrite_output_dir': True, 'do_train': False, 'do_eval': False, 'do_predict': False, 'evaluation_strategy': 'IntervalStrategy.NO', 'prediction_loss_only': False, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 1, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'learning_rate': 3e-05, 'weight_decay': 0.0095, 'adam_beta1': 0.9, 'adam_beta2': 0.98, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 2.0, 'max_steps': -1, 'lr_scheduler_type': 'SchedulerType.LINEAR', 'warmup_ratio': 0.0, 'warmup_steps': 10000, 'log_level': -1, 'log_level_replica': -1, 'log_on_each_node': True, 'logging_dir': './runs/Jul15_17-41-39_t1v-n-f5c06ea1-w-0', 'logging_strategy': 'IntervalStrategy.STEPS', 'logging_first_step': False, 'logging_steps': 50, 'save_strategy': 'IntervalStrategy.STEPS', 'save_steps': 15000, 'save_total_limit': 20, 'save_on_each_node': False, 'no_cuda': False, 'seed': 42, 'fp16': False, 'fp16_opt_level': 'O1', 'fp16_backend': 'auto', 'fp16_full_eval': False, 'local_rank': -1, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 6000, 'dataloader_num_workers': 0, 'past_index': -1, 'run_name': './', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'sharded_ddp': [], 'deepspeed': None, 'label_smoothing_factor': 0.0, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['tensorboard', 'wandb'], 'ddp_find_unused_parameters': None, 'dataloader_pin_memory': True, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': True, 'resume_from_checkpoint': './', 'push_to_hub_model_id': '', 'push_to_hub_organization': None, 'push_to_hub_token': None, 'mp_parameters': ''} +2021-07-15 17:41:49,978 INFO MainThread:721922 [wandb_run.py:_config_callback():872] config_cb None None {'model_name_or_path': None, 'model_type': 'big_bird', 'config_name': './', 'tokenizer_name': './', 'cache_dir': None, 'use_fast_tokenizer': True, 'dtype': 'float32'} +2021-07-15 17:41:49,980 INFO MainThread:721922 [wandb_run.py:_config_callback():872] config_cb None None {'dataset_name': None, 'dataset_config_name': None, 'train_ref_file': None, 'validation_ref_file': None, 'overwrite_cache': False, 'validation_split_percentage': 5, 'max_seq_length': 4096, 'preprocessing_num_workers': 96, 'mlm_probability': 0.15, 'pad_to_max_length': False, 'line_by_line': False, 'max_eval_samples': 4000} +2021-07-15 17:47:28,749 INFO MainThread:721922 [wandb_run.py:_atexit_cleanup():1593] got exitcode: 1 +2021-07-15 17:47:28,750 INFO MainThread:721922 [wandb_run.py:_restore():1565] restore +2021-07-15 17:47:31,674 INFO MainThread:721922 [wandb_run.py:_wait_for_finish():1715] got exit ret: file_counts { + wandb_count: 1 +} +pusher_stats { + uploaded_bytes: 1415 + total_bytes: 1415 +} + +2021-07-15 17:47:31,882 INFO MainThread:721922 [wandb_run.py:_wait_for_finish():1715] got exit ret: file_counts { + wandb_count: 1 +} +pusher_stats { + uploaded_bytes: 1415 + total_bytes: 1415 +} + +2021-07-15 17:47:31,984 INFO MainThread:721922 [wandb_run.py:_wait_for_finish():1715] got exit ret: file_counts { + wandb_count: 5 +} +pusher_stats { + uploaded_bytes: 1415 + total_bytes: 8975 +} + +2021-07-15 17:47:32,086 INFO MainThread:721922 [wandb_run.py:_wait_for_finish():1715] got exit ret: file_counts { + wandb_count: 5 +} +pusher_stats { + uploaded_bytes: 8975 + total_bytes: 8975 +} + +2021-07-15 17:47:32,188 INFO MainThread:721922 [wandb_run.py:_wait_for_finish():1715] got exit ret: file_counts { + wandb_count: 5 +} +pusher_stats { + uploaded_bytes: 8975 + total_bytes: 8975 +} + +2021-07-15 17:47:32,290 INFO MainThread:721922 [wandb_run.py:_wait_for_finish():1715] got exit ret: file_counts { + wandb_count: 5 +} +pusher_stats { + uploaded_bytes: 8975 + total_bytes: 8975 +} + +2021-07-15 17:47:32,392 INFO MainThread:721922 [wandb_run.py:_wait_for_finish():1715] got exit ret: file_counts { + wandb_count: 5 +} +pusher_stats { + uploaded_bytes: 8975 + total_bytes: 8975 +} + +2021-07-15 17:47:32,494 INFO MainThread:721922 [wandb_run.py:_wait_for_finish():1715] got exit ret: file_counts { + wandb_count: 5 +} +pusher_stats { + uploaded_bytes: 8975 + total_bytes: 8975 +} + +2021-07-15 17:47:32,596 INFO MainThread:721922 [wandb_run.py:_wait_for_finish():1715] got exit ret: file_counts { + wandb_count: 5 +} +pusher_stats { + uploaded_bytes: 8975 + total_bytes: 8975 +} + +2021-07-15 17:47:32,952 INFO MainThread:721922 [wandb_run.py:_wait_for_finish():1715] got exit ret: file_counts { + wandb_count: 5 +} +pusher_stats { + uploaded_bytes: 8975 + total_bytes: 8975 +} + +2021-07-15 17:47:33,054 INFO MainThread:721922 [wandb_run.py:_wait_for_finish():1715] got exit ret: done: true +exit_result { +} +file_counts { + wandb_count: 5 +} +pusher_stats { + uploaded_bytes: 8975 + total_bytes: 8975 +} + +2021-07-15 17:47:34,335 INFO MainThread:721922 [wandb_run.py:_show_files():1937] logging synced files diff --git a/wandb/run-20210715_174147-3nkn7hxg/run-3nkn7hxg.wandb b/wandb/run-20210715_174147-3nkn7hxg/run-3nkn7hxg.wandb new file mode 100644 index 0000000000000000000000000000000000000000..e9ca073cb13e33acef714823c6d91f2af0da94bc Binary files /dev/null and b/wandb/run-20210715_174147-3nkn7hxg/run-3nkn7hxg.wandb differ diff --git a/wandb/run-20210715_175147-3lygnexi/files/config.yaml b/wandb/run-20210715_175147-3lygnexi/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4c9e9e835d421e400d8cf1706ade39520bae9383 --- /dev/null +++ b/wandb/run-20210715_175147-3lygnexi/files/config.yaml @@ -0,0 +1,301 @@ +wandb_version: 1 + +_wandb: + desc: null + value: + cli_version: 0.10.33 + framework: huggingface + huggingface_version: 4.9.0.dev0 + is_jupyter_run: false + is_kaggle_kernel: false + python_version: 3.8.10 + t: + 1: + - 3 + - 11 + 4: 3.8.10 + 5: 0.10.33 + 6: 4.9.0.dev0 + 8: + - 5 +adafactor: + desc: null + value: false +adam_beta1: + desc: null + value: 0.9 +adam_beta2: + desc: null + value: 0.98 +adam_epsilon: + desc: null + value: 1.0e-08 +cache_dir: + desc: null + value: null +config_name: + desc: null + value: ./ +dataloader_drop_last: + desc: null + value: false +dataloader_num_workers: + desc: null + value: 0 +dataloader_pin_memory: + desc: null + value: true +dataset_config_name: + desc: null + value: null +dataset_name: + desc: null + value: null +ddp_find_unused_parameters: + desc: null + value: null +debug: + desc: null + value: [] +deepspeed: + desc: null + value: null +disable_tqdm: + desc: null + value: false +do_eval: + desc: null + value: false +do_predict: + desc: null + value: false +do_train: + desc: null + value: false +dtype: + desc: null + value: float32 +eval_accumulation_steps: + desc: null + value: null +eval_steps: + desc: null + value: 6000 +evaluation_strategy: + desc: null + value: IntervalStrategy.NO +fp16: + desc: null + value: false +fp16_backend: + desc: null + value: auto +fp16_full_eval: + desc: null + value: false +fp16_opt_level: + desc: null + value: O1 +gradient_accumulation_steps: + desc: null + value: 1 +greater_is_better: + desc: null + value: null +group_by_length: + desc: null + value: false +ignore_data_skip: + desc: null + value: false +label_names: + desc: null + value: null +label_smoothing_factor: + desc: null + value: 0.0 +learning_rate: + desc: null + value: 3.0e-05 +length_column_name: + desc: null + value: length +line_by_line: + desc: null + value: false +load_best_model_at_end: + desc: null + value: false +local_rank: + desc: null + value: -1 +log_level: + desc: null + value: -1 +log_level_replica: + desc: null + value: -1 +log_on_each_node: + desc: null + value: true +logging_dir: + desc: null + value: ./runs/Jul15_17-51-39_t1v-n-f5c06ea1-w-0 +logging_first_step: + desc: null + value: false +logging_steps: + desc: null + value: 50 +logging_strategy: + desc: null + value: IntervalStrategy.STEPS +lr_scheduler_type: + desc: null + value: SchedulerType.LINEAR +max_eval_samples: + desc: null + value: 4000 +max_grad_norm: + desc: null + value: 1.0 +max_seq_length: + desc: null + value: 4096 +max_steps: + desc: null + value: -1 +metric_for_best_model: + desc: null + value: null +mlm_probability: + desc: null + value: 0.15 +model_name_or_path: + desc: null + value: null +model_type: + desc: null + value: big_bird +mp_parameters: + desc: null + value: '' +no_cuda: + desc: null + value: false +num_train_epochs: + desc: null + value: 2.0 +output_dir: + desc: null + value: ./ +overwrite_cache: + desc: null + value: false +overwrite_output_dir: + desc: null + value: true +pad_to_max_length: + desc: null + value: false +past_index: + desc: null + value: -1 +per_device_eval_batch_size: + desc: null + value: 1 +per_device_train_batch_size: + desc: null + value: 1 +per_gpu_eval_batch_size: + desc: null + value: null +per_gpu_train_batch_size: + desc: null + value: null +prediction_loss_only: + desc: null + value: false +preprocessing_num_workers: + desc: null + value: 96 +push_to_hub: + desc: null + value: true +push_to_hub_model_id: + desc: null + value: '' +push_to_hub_organization: + desc: null + value: null +push_to_hub_token: + desc: null + value: null +remove_unused_columns: + desc: null + value: true +report_to: + desc: null + value: + - tensorboard + - wandb +resume_from_checkpoint: + desc: null + value: ./ +run_name: + desc: null + value: ./ +save_on_each_node: + desc: null + value: false +save_steps: + desc: null + value: 15000 +save_strategy: + desc: null + value: IntervalStrategy.STEPS +save_total_limit: + desc: null + value: 20 +seed: + desc: null + value: 42 +sharded_ddp: + desc: null + value: [] +skip_memory_metrics: + desc: null + value: true +tokenizer_name: + desc: null + value: ./ +tpu_metrics_debug: + desc: null + value: false +tpu_num_cores: + desc: null + value: null +train_ref_file: + desc: null + value: null +use_fast_tokenizer: + desc: null + value: true +use_legacy_prediction_loop: + desc: null + value: false +validation_ref_file: + desc: null + value: null +validation_split_percentage: + desc: null + value: 5 +warmup_ratio: + desc: null + value: 0.0 +warmup_steps: + desc: null + value: 10000 +weight_decay: + desc: null + value: 0.0095 diff --git a/wandb/run-20210715_175147-3lygnexi/files/output.log b/wandb/run-20210715_175147-3lygnexi/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..9234f68c77d927517199a2dd72846f157688defe --- /dev/null +++ b/wandb/run-20210715_175147-3lygnexi/files/output.log @@ -0,0 +1,26 @@ + +[17:52:01] - INFO - absl - Restoring checkpoint from ./checkpoint_10000 +tcmalloc: large alloc 1530273792 bytes == 0x9b410000 @ 0x7f6acebf3680 0x7f6acec14824 0x5b9a14 0x50b2ae 0x50cb1b 0x5a6f17 0x5f3010 0x56fd36 0x568d9a 0x5f5b33 0x56aadf 0x568d9a 0x68cdc7 0x67e161 0x67e1df 0x67e281 0x67e627 0x6b6e62 0x6b71ed 0x7f6acea080b3 0x5f96de +/home/dat/pino/lib/python3.8/site-packages/jax/lib/xla_bridge.py:386: UserWarning: jax.host_count has been renamed to jax.process_count. This alias will eventually be removed; please update your code. + warnings.warn( +/home/dat/pino/lib/python3.8/site-packages/jax/lib/xla_bridge.py:373: UserWarning: jax.host_id has been renamed to jax.process_index. This alias will eventually be removed; please update your code. + warnings.warn( +Epoch ... (1/2): 0%| | 0/2 [00:00