amazingvince committed on
Commit 1d6d730 · verified · 1 Parent(s): a1481e0

Upload folder using huggingface_hub

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
+ checkpoints/wandb/run-20240902_170304-v43qltex/run-v43qltex.wandb filter=lfs diff=lfs merge=lfs -text
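The new rule keeps the raw W&B run file in Git LFS, so a plain clone only fetches a pointer. A minimal sketch of pulling a single LFS-backed file with huggingface_hub (the `repo_id` is a placeholder, not taken from this page; substitute the actual repo):

```python
from huggingface_hub import hf_hub_download

# repo_id is hypothetical; use this model repo's actual id
path = hf_hub_download(
    repo_id="amazingvince/<this-repo>",
    filename="checkpoints/wandb/run-20240902_170304-v43qltex/run-v43qltex.wandb",
)
print(path)
```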
checkpoints/.hydra/config.yaml ADDED
@@ -0,0 +1,50 @@
+ mode: pt
+ device: gpu
+ precision: bf16
+ eval_only: false
+ predict_only: false
+ seed: 2137
+ tokenizer:
+   name: BEE-spoke-data/slimpajama_tok-48128-BPE-forT5
+ model:
+   klass: custom_seq2seq
+   name: google/t5-v1_1-base
+   overwrite: null
+   add_config: null
+   checkpoint_path: ''
+   random_init: true
+   compile: true
+ data:
+   input_length: 1024
+   mlm_probability: 0.15
+   mean_noise_span_length: 3.0
+   num_workers: 8
+ optim:
+   name: adamwscale
+   base_lr: 0.02
+   batch_size: 64
+   total_steps: 65536
+   epochs: -1
+   warmup_steps: 10000
+   lr_scheduler: cosine
+   weight_decay: 0.001
+   grad_clip: 1.0
+   grad_acc: 4
+   final_cosine: 1.0e-05
+ eval:
+   every_steps: 100000
+   steps: 500
+ checkpoint:
+   every_steps: 5000
+ logging:
+   every_steps: 100
+   grad_l2: true
+   weights_l2: true
+   use_wandb: true
+   wandb_config:
+     project: nano-custom-seq2seq
+     entity: amazingvince
+     tags:
+     - nanoT5
+     - my_tag
+     mode: online
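This is the resolved task config for the run. A minimal sketch of re-composing it with Hydra's compose API, assuming a checkout of nanoT5 with its configs at nanoT5/configs (the path and `config_name: default` are recorded in hydra.yaml below):

```python
from hydra import compose, initialize

# version_base "1.1" matches hydra.runtime.version_base recorded below
with initialize(version_base="1.1", config_path="nanoT5/configs"):
    cfg = compose(config_name="default")         # config_name from hydra.yaml
print(cfg.optim.base_lr, cfg.optim.total_steps)  # 0.02 65536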
checkpoints/.hydra/hydra.yaml ADDED
@@ -0,0 +1,156 @@
+ hydra:
+   run:
+     dir: ./logs/${now:%Y-%m-%d}/${now:%H-%M-%S}
+   sweep:
+     dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
+     subdir: ${hydra.job.num}
+   launcher:
+     _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
+   sweeper:
+     _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
+     max_batch_size: null
+     params: null
+   help:
+     app_name: ${hydra.job.name}
+     header: '${hydra.help.app_name} is powered by Hydra.
+
+       '
+     footer: 'Powered by Hydra (https://hydra.cc)
+
+       Use --hydra-help to view Hydra specific help
+
+       '
+     template: '${hydra.help.header}
+
+       == Configuration groups ==
+
+       Compose your configuration from those groups (group=option)
+
+
+       $APP_CONFIG_GROUPS
+
+
+       == Config ==
+
+       Override anything in the config (foo.bar=value)
+
+
+       $CONFIG
+
+
+       ${hydra.help.footer}
+
+       '
+   hydra_help:
+     template: 'Hydra (${hydra.runtime.version})
+
+       See https://hydra.cc for more info.
+
+
+       == Flags ==
+
+       $FLAGS_HELP
+
+
+       == Configuration groups ==
+
+       Compose your configuration from those groups (For example, append hydra/job_logging=disabled
+       to command line)
+
+
+       $HYDRA_CONFIG_GROUPS
+
+       Use ''--cfg hydra'' to Show the Hydra config.
+
+       '
+     hydra_help: ???
+   hydra_logging:
+     version: 1
+     formatters:
+       simple:
+         format: '[%(asctime)s][HYDRA] %(message)s'
+     handlers:
+       console:
+         class: logging.StreamHandler
+         formatter: simple
+         stream: ext://sys.stdout
+     root:
+       level: INFO
+       handlers:
+       - console
+     loggers:
+       logging_example:
+         level: DEBUG
+     disable_existing_loggers: false
+   job_logging:
+     version: 1
+     formatters:
+       simple:
+         format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
+     handlers:
+       console:
+         class: logging.StreamHandler
+         formatter: simple
+         stream: ext://sys.stdout
+       file:
+         class: logging.FileHandler
+         formatter: simple
+         filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
+     root:
+       level: INFO
+       handlers:
+       - console
+       - file
+     disable_existing_loggers: false
+   env: {}
+   mode: RUN
+   searchpath: []
+   callbacks: {}
+   output_subdir: .hydra
+   overrides:
+     hydra:
+     - hydra.mode=RUN
+     task: []
+   job:
+     name: main
+     chdir: true
+     override_dirname: ''
+     id: ???
+     num: ???
+     config_name: default
+     env_set: {}
+     env_copy: []
+     config:
+       override_dirname:
+         kv_sep: '='
+         item_sep: ','
+         exclude_keys: []
+   runtime:
+     version: 1.3.2
+     version_base: '1.1'
+     cwd: /workspace/nanoT5
+     config_sources:
+     - path: hydra.conf
+       schema: pkg
+       provider: hydra
+     - path: /workspace/nanoT5/nanoT5/configs
+       schema: file
+       provider: main
+     - path: ''
+       schema: structured
+       provider: schema
+     output_dir: /workspace/nanoT5/logs/2024-09-02/17-03-02
+     choices:
+       local_env: default
+       task: pt
+       hydra/env: default
+       hydra/callbacks: null
+       hydra/job_logging: default
+       hydra/hydra_logging: default
+       hydra/hydra_help: default
+       hydra/help: default
+       hydra/sweeper: basic
+       hydra/launcher: basic
+       hydra/output: default
+   verbose: false
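The `${now:...}` resolver in `hydra.run.dir` uses strftime patterns, and the recorded `runtime.output_dir` shows how it expanded at launch time. A quick sketch of the same expansion in plain Python:

```python
from datetime import datetime

# launch time taken from runtime.output_dir above
ts = datetime(2024, 9, 2, 17, 3, 2)
print(f"./logs/{ts:%Y-%m-%d}/{ts:%H-%M-%S}")  # ./logs/2024-09-02/17-03-02
```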
checkpoints/.hydra/overrides.yaml ADDED
@@ -0,0 +1 @@
+ []
checkpoints/checkpoint-pt-10000/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:de209ea75162ba234b5a0ac46f2434ad29c106e4770c8c587eba8ac390f7fede
+ size 2692370584
checkpoints/checkpoint-pt-10000/random_states_0.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:50b14a8e8e4cb1f87530bb13452da585006a1a54e1fa02069afa73d0775f0736
+ size 14344
checkpoints/checkpoint-pt-5000/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a7134d1e0be9b58fd378c0b681679ae41855daa9cbb0dc80b24e826ef59861ce
+ size 2692370584
checkpoints/checkpoint-pt-5000/random_states_0.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:50b14a8e8e4cb1f87530bb13452da585006a1a54e1fa02069afa73d0775f0736
+ size 14344
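The four files above are Git LFS pointers: each ~2.7 GB model.safetensors shard lives in LFS storage and only its hash and size are versioned here. A minimal loading sketch once the real files are fetched; the re-tying step and its key names are assumptions based on the accelerate warning in main.log below:

```python
from safetensors.torch import load_file

# assumes the LFS content has been pulled locally
state = load_file("checkpoints/checkpoint-pt-10000/model.safetensors")
print(len(state), "tensors")
# lm_head.weight was removed at save time as a shared tensor (see the
# accelerate warning in main.log), so it must be re-tied to the input
# embedding after loading, e.g. (key names are an assumption):
# state["lm_head.weight"] = state["shared.weight"]
```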
checkpoints/config.json ADDED
@@ -0,0 +1,26 @@
+ {
+   "attention_probs_dropout_prob": 0.0,
+   "bos_token_id": 1,
+   "decoder_start_token_id": 3,
+   "eos_token_id": 2,
+   "head_dim": 64,
+   "hidden_act": "silu",
+   "hidden_dropout_prob": 0.0,
+   "hidden_size": 1024,
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "layer_norm_eps": 1e-12,
+   "max_position_embeddings": 1024,
+   "num_attention_heads": 16,
+   "num_decoder_layers": 32,
+   "num_encoder_layers": 16,
+   "num_key_value_heads": 4,
+   "pad_token_id": 3,
+   "rotary_emb_base": 10000.0,
+   "rotary_emb_dim": 32,
+   "rotary_emb_interleaved": false,
+   "rotary_emb_scale_base": null,
+   "transformers_version": "4.44.2",
+   "use_cache": true,
+   "vocab_size": 48256
+ }
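Since config.json describes a custom architecture rather than a stock T5, the attention geometry is worth decoding. A small check in plain Python using only values from the file above; the partial-RoPE reading at the end is an interpretation, not something the config states:

```python
# Reading the attention geometry out of config.json (values copied from above).
hidden_size, heads, kv_heads, head_dim = 1024, 16, 4, 64
assert heads * head_dim == hidden_size  # full-width query projection
assert heads % kv_heads == 0            # grouped-query attention:
print(heads // kv_heads)                # 4 query heads share each KV head
# rotary_emb_dim 32 means rotary position embeddings cover half of each
# 64-dim head; the other half is left unrotated (a common partial-RoPE setup).
```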
checkpoints/main.log ADDED
@@ -0,0 +1,135 @@
+ [2024-09-02 17:03:02,219][accelerate.utils.other][WARNING] - Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
+ [2024-09-02 17:03:02,226][Main][INFO] - Distributed environment: DistributedType.NO
+ Num processes: 1
+ Process index: 0
+ Local process index: 0
+ Device: cuda
+
+ Mixed precision type: bf16
+
+ [2024-09-02 17:03:02,227][Main][INFO] - Working directory is /workspace/nanoT5/logs/2024-09-02/17-03-02
+ [2024-09-02 17:14:53,691][Main][INFO] - [train] Step 100 out of 65536 | Loss --> 51.971 | Grad_l2 --> 82.676 | Weights_l2 --> 7042.062 | Lr --> 0.010 | Seconds_per_step --> 6.760 |
+ [2024-09-02 17:20:23,699][Main][INFO] - [train] Step 200 out of 65536 | Loss --> 14.150 | Grad_l2 --> 19.390 | Weights_l2 --> 7034.376 | Lr --> 0.010 | Seconds_per_step --> 3.300 |
+ [2024-09-02 17:25:54,840][Main][INFO] - [train] Step 300 out of 65536 | Loss --> 9.006 | Grad_l2 --> 9.061 | Weights_l2 --> 7026.824 | Lr --> 0.010 | Seconds_per_step --> 3.311 |
+ [2024-09-02 17:31:26,095][Main][INFO] - [train] Step 400 out of 65536 | Loss --> 7.529 | Grad_l2 --> 5.889 | Weights_l2 --> 7019.014 | Lr --> 0.010 | Seconds_per_step --> 3.313 |
+ [2024-09-02 17:36:56,190][Main][INFO] - [train] Step 500 out of 65536 | Loss --> 6.618 | Grad_l2 --> 4.039 | Weights_l2 --> 7010.897 | Lr --> 0.011 | Seconds_per_step --> 3.301 |
+ [2024-09-02 17:42:27,693][Main][INFO] - [train] Step 600 out of 65536 | Loss --> 5.994 | Grad_l2 --> 2.962 | Weights_l2 --> 7002.549 | Lr --> 0.011 | Seconds_per_step --> 3.315 |
+ [2024-09-02 17:47:57,967][Main][INFO] - [train] Step 700 out of 65536 | Loss --> 5.703 | Grad_l2 --> 2.434 | Weights_l2 --> 6994.267 | Lr --> 0.011 | Seconds_per_step --> 3.303 |
+ [2024-09-02 17:53:29,228][Main][INFO] - [train] Step 800 out of 65536 | Loss --> 6.603 | Grad_l2 --> 6.221 | Weights_l2 --> 6985.927 | Lr --> 0.011 | Seconds_per_step --> 3.313 |
+ [2024-09-02 17:59:00,011][Main][INFO] - [train] Step 900 out of 65536 | Loss --> 5.408 | Grad_l2 --> 1.465 | Weights_l2 --> 6980.026 | Lr --> 0.011 | Seconds_per_step --> 3.308 |
+ [2024-09-02 18:04:30,275][Main][INFO] - [train] Step 1000 out of 65536 | Loss --> 5.311 | Grad_l2 --> 0.992 | Weights_l2 --> 6975.109 | Lr --> 0.011 | Seconds_per_step --> 3.303 |
+ [2024-09-02 18:10:01,468][Main][INFO] - [train] Step 1100 out of 65536 | Loss --> 5.241 | Grad_l2 --> 0.854 | Weights_l2 --> 6970.708 | Lr --> 0.011 | Seconds_per_step --> 3.312 |
+ [2024-09-02 18:15:33,362][Main][INFO] - [train] Step 1200 out of 65536 | Loss --> 5.180 | Grad_l2 --> 0.838 | Weights_l2 --> 6966.641 | Lr --> 0.011 | Seconds_per_step --> 3.319 |
+ [2024-09-02 18:21:03,902][Main][INFO] - [train] Step 1300 out of 65536 | Loss --> 5.126 | Grad_l2 --> 0.764 | Weights_l2 --> 6962.789 | Lr --> 0.011 | Seconds_per_step --> 3.305 |
+ [2024-09-02 18:26:35,349][Main][INFO] - [train] Step 1400 out of 65536 | Loss --> 5.088 | Grad_l2 --> 0.744 | Weights_l2 --> 6959.146 | Lr --> 0.011 | Seconds_per_step --> 3.314 |
+ [2024-09-02 18:32:06,048][Main][INFO] - [train] Step 1500 out of 65536 | Loss --> 5.046 | Grad_l2 --> 0.702 | Weights_l2 --> 6955.673 | Lr --> 0.012 | Seconds_per_step --> 3.307 |
+ [2024-09-02 18:37:37,903][Main][INFO] - [train] Step 1600 out of 65536 | Loss --> 5.007 | Grad_l2 --> 0.691 | Weights_l2 --> 6952.523 | Lr --> 0.012 | Seconds_per_step --> 3.319 |
+ [2024-09-02 18:43:09,723][Main][INFO] - [train] Step 1700 out of 65536 | Loss --> 4.973 | Grad_l2 --> 0.673 | Weights_l2 --> 6949.412 | Lr --> 0.012 | Seconds_per_step --> 3.318 |
+ [2024-09-02 18:48:40,909][Main][INFO] - [train] Step 1800 out of 65536 | Loss --> 4.943 | Grad_l2 --> 0.671 | Weights_l2 --> 6946.498 | Lr --> 0.012 | Seconds_per_step --> 3.312 |
+ [2024-09-02 18:54:13,524][Main][INFO] - [train] Step 1900 out of 65536 | Loss --> 4.929 | Grad_l2 --> 0.668 | Weights_l2 --> 6943.795 | Lr --> 0.012 | Seconds_per_step --> 3.326 |
+ [2024-09-02 18:59:45,500][Main][INFO] - [train] Step 2000 out of 65536 | Loss --> 4.894 | Grad_l2 --> 0.665 | Weights_l2 --> 6941.241 | Lr --> 0.012 | Seconds_per_step --> 3.320 |
+ [2024-09-02 19:05:16,395][Main][INFO] - [train] Step 2100 out of 65536 | Loss --> 4.881 | Grad_l2 --> 0.713 | Weights_l2 --> 6938.861 | Lr --> 0.012 | Seconds_per_step --> 3.309 |
+ [2024-09-02 19:10:48,520][Main][INFO] - [train] Step 2200 out of 65536 | Loss --> 4.853 | Grad_l2 --> 0.653 | Weights_l2 --> 6936.551 | Lr --> 0.012 | Seconds_per_step --> 3.321 |
+ [2024-09-02 19:16:19,278][Main][INFO] - [train] Step 2300 out of 65536 | Loss --> 4.829 | Grad_l2 --> 0.646 | Weights_l2 --> 6934.357 | Lr --> 0.012 | Seconds_per_step --> 3.308 |
+ [2024-09-02 19:21:51,370][Main][INFO] - [train] Step 2400 out of 65536 | Loss --> 4.790 | Grad_l2 --> 0.620 | Weights_l2 --> 6932.338 | Lr --> 0.012 | Seconds_per_step --> 3.321 |
+ [2024-09-02 19:27:23,544][Main][INFO] - [train] Step 2500 out of 65536 | Loss --> 4.784 | Grad_l2 --> 0.643 | Weights_l2 --> 6930.395 | Lr --> 0.013 | Seconds_per_step --> 3.322 |
+ [2024-09-02 19:32:54,341][Main][INFO] - [train] Step 2600 out of 65536 | Loss --> 4.755 | Grad_l2 --> 0.623 | Weights_l2 --> 6928.543 | Lr --> 0.013 | Seconds_per_step --> 3.308 |
+ [2024-09-02 19:38:25,942][Main][INFO] - [train] Step 2700 out of 65536 | Loss --> 4.743 | Grad_l2 --> 0.636 | Weights_l2 --> 6926.944 | Lr --> 0.013 | Seconds_per_step --> 3.316 |
+ [2024-09-02 19:43:57,708][Main][INFO] - [train] Step 2800 out of 65536 | Loss --> 4.722 | Grad_l2 --> 0.590 | Weights_l2 --> 6925.379 | Lr --> 0.013 | Seconds_per_step --> 3.318 |
+ [2024-09-02 19:49:28,285][Main][INFO] - [train] Step 2900 out of 65536 | Loss --> 4.715 | Grad_l2 --> 0.622 | Weights_l2 --> 6924.007 | Lr --> 0.013 | Seconds_per_step --> 3.306 |
+ [2024-09-02 19:54:59,957][Main][INFO] - [train] Step 3000 out of 65536 | Loss --> 4.694 | Grad_l2 --> 0.652 | Weights_l2 --> 6922.709 | Lr --> 0.013 | Seconds_per_step --> 3.317 |
+ [2024-09-02 20:00:31,072][Main][INFO] - [train] Step 3100 out of 65536 | Loss --> 4.678 | Grad_l2 --> 0.614 | Weights_l2 --> 6921.561 | Lr --> 0.013 | Seconds_per_step --> 3.311 |
+ [2024-09-02 20:06:02,747][Main][INFO] - [train] Step 3200 out of 65536 | Loss --> 4.633 | Grad_l2 --> 0.610 | Weights_l2 --> 6920.463 | Lr --> 0.013 | Seconds_per_step --> 3.317 |
+ [2024-09-02 20:11:34,607][Main][INFO] - [train] Step 3300 out of 65536 | Loss --> 4.599 | Grad_l2 --> 0.638 | Weights_l2 --> 6919.642 | Lr --> 0.013 | Seconds_per_step --> 3.319 |
+ [2024-09-02 20:17:05,731][Main][INFO] - [train] Step 3400 out of 65536 | Loss --> 4.549 | Grad_l2 --> 0.774 | Weights_l2 --> 6919.263 | Lr --> 0.013 | Seconds_per_step --> 3.311 |
+ [2024-09-02 20:22:37,601][Main][INFO] - [train] Step 3500 out of 65536 | Loss --> 4.420 | Grad_l2 --> 0.934 | Weights_l2 --> 6918.974 | Lr --> 0.014 | Seconds_per_step --> 3.319 |
+ [2024-09-02 20:28:09,554][Main][INFO] - [train] Step 3600 out of 65536 | Loss --> 4.256 | Grad_l2 --> 0.763 | Weights_l2 --> 6919.477 | Lr --> 0.014 | Seconds_per_step --> 3.319 |
+ [2024-09-02 20:33:40,654][Main][INFO] - [train] Step 3700 out of 65536 | Loss --> 4.131 | Grad_l2 --> 0.657 | Weights_l2 --> 6920.705 | Lr --> 0.014 | Seconds_per_step --> 3.311 |
+ [2024-09-02 20:39:13,064][Main][INFO] - [train] Step 3800 out of 65536 | Loss --> 4.021 | Grad_l2 --> 0.709 | Weights_l2 --> 6922.188 | Lr --> 0.014 | Seconds_per_step --> 3.324 |
+ [2024-09-02 20:44:45,663][Main][INFO] - [train] Step 3900 out of 65536 | Loss --> 3.909 | Grad_l2 --> 0.637 | Weights_l2 --> 6923.666 | Lr --> 0.014 | Seconds_per_step --> 3.326 |
+ [2024-09-02 20:50:16,811][Main][INFO] - [train] Step 4000 out of 65536 | Loss --> 3.855 | Grad_l2 --> 1.013 | Weights_l2 --> 6923.778 | Lr --> 0.014 | Seconds_per_step --> 3.311 |
+ [2024-09-02 20:55:49,235][Main][INFO] - [train] Step 4100 out of 65536 | Loss --> 3.770 | Grad_l2 --> 0.589 | Weights_l2 --> 6925.545 | Lr --> 0.014 | Seconds_per_step --> 3.324 |
+ [2024-09-02 21:01:20,500][Main][INFO] - [train] Step 4200 out of 65536 | Loss --> 3.710 | Grad_l2 --> 0.579 | Weights_l2 --> 6927.200 | Lr --> 0.014 | Seconds_per_step --> 3.313 |
+ [2024-09-02 21:06:53,406][Main][INFO] - [train] Step 4300 out of 65536 | Loss --> 3.651 | Grad_l2 --> 0.588 | Weights_l2 --> 6928.842 | Lr --> 0.014 | Seconds_per_step --> 3.329 |
+ [2024-09-02 21:12:26,298][Main][INFO] - [train] Step 4400 out of 65536 | Loss --> 3.614 | Grad_l2 --> 0.632 | Weights_l2 --> 6930.597 | Lr --> 0.014 | Seconds_per_step --> 3.329 |
+ [2024-09-02 21:17:57,623][Main][INFO] - [train] Step 4500 out of 65536 | Loss --> 3.582 | Grad_l2 --> 0.884 | Weights_l2 --> 6931.569 | Lr --> 0.015 | Seconds_per_step --> 3.313 |
+ [2024-09-02 21:23:30,116][Main][INFO] - [train] Step 4600 out of 65536 | Loss --> 3.527 | Grad_l2 --> 0.582 | Weights_l2 --> 6933.783 | Lr --> 0.015 | Seconds_per_step --> 3.325 |
+ [2024-09-02 21:29:02,417][Main][INFO] - [train] Step 4700 out of 65536 | Loss --> 3.476 | Grad_l2 --> 0.549 | Weights_l2 --> 6935.959 | Lr --> 0.015 | Seconds_per_step --> 3.323 |
+ [2024-09-02 21:34:33,535][Main][INFO] - [train] Step 4800 out of 65536 | Loss --> 3.430 | Grad_l2 --> 0.551 | Weights_l2 --> 6938.224 | Lr --> 0.015 | Seconds_per_step --> 3.311 |
+ [2024-09-02 21:40:05,905][Main][INFO] - [train] Step 4900 out of 65536 | Loss --> 3.395 | Grad_l2 --> 0.550 | Weights_l2 --> 6940.617 | Lr --> 0.015 | Seconds_per_step --> 3.324 |
+ [2024-09-02 21:45:36,944][Main][INFO] - [train] Step 5000 out of 65536 | Loss --> 3.366 | Grad_l2 --> 0.546 | Weights_l2 --> 6943.230 | Lr --> 0.015 | Seconds_per_step --> 3.310 |
+ [2024-09-02 21:45:36,947][accelerate.accelerator][INFO] - Saving current state to checkpoint-pt-5000
+ [2024-09-02 21:45:36,954][accelerate.utils.other][WARNING] - Removed shared tensor {'lm_head.weight'} while saving. This should be OK, but check by verifying that you don't receive any warning while reloading
+ [2024-09-02 21:45:44,182][accelerate.checkpointing][INFO] - Model weights saved in checkpoint-pt-5000/model.safetensors
+ [2024-09-02 21:45:54,822][accelerate.checkpointing][INFO] - Optimizer state saved in checkpoint-pt-5000/optimizer.bin
+ [2024-09-02 21:45:54,827][accelerate.checkpointing][INFO] - Scheduler state saved in checkpoint-pt-5000/scheduler.bin
+ [2024-09-02 21:45:54,828][accelerate.checkpointing][INFO] - Sampler state for dataloader 0 saved in checkpoint-pt-5000/sampler.bin
+ [2024-09-02 21:45:54,829][accelerate.checkpointing][INFO] - Sampler state for dataloader 1 saved in checkpoint-pt-5000/sampler_1.bin
+ [2024-09-02 21:45:54,835][accelerate.checkpointing][INFO] - Random states saved in checkpoint-pt-5000/random_states_0.pkl
+ [2024-09-02 21:51:26,402][Main][INFO] - [train] Step 5100 out of 65536 | Loss --> 3.302 | Grad_l2 --> 0.541 | Weights_l2 --> 6946.278 | Lr --> 0.015 | Seconds_per_step --> 3.495 |
+ [2024-09-02 21:56:58,321][Main][INFO] - [train] Step 5200 out of 65536 | Loss --> 3.248 | Grad_l2 --> 0.556 | Weights_l2 --> 6950.060 | Lr --> 0.015 | Seconds_per_step --> 3.319 |
+ [2024-09-02 22:02:29,452][Main][INFO] - [train] Step 5300 out of 65536 | Loss --> 3.194 | Grad_l2 --> 0.566 | Weights_l2 --> 6954.461 | Lr --> 0.015 | Seconds_per_step --> 3.311 |
+ [2024-09-02 22:08:01,594][Main][INFO] - [train] Step 5400 out of 65536 | Loss --> 3.144 | Grad_l2 --> 0.548 | Weights_l2 --> 6959.061 | Lr --> 0.015 | Seconds_per_step --> 3.321 |
+ [2024-09-02 22:13:33,473][Main][INFO] - [train] Step 5500 out of 65536 | Loss --> 3.099 | Grad_l2 --> 0.546 | Weights_l2 --> 6963.676 | Lr --> 0.016 | Seconds_per_step --> 3.319 |
+ [2024-09-02 22:19:04,763][Main][INFO] - [train] Step 5600 out of 65536 | Loss --> 3.044 | Grad_l2 --> 0.531 | Weights_l2 --> 6968.055 | Lr --> 0.016 | Seconds_per_step --> 3.313 |
+ [2024-09-02 22:24:37,024][Main][INFO] - [train] Step 5700 out of 65536 | Loss --> 3.023 | Grad_l2 --> 0.528 | Weights_l2 --> 6972.595 | Lr --> 0.016 | Seconds_per_step --> 3.323 |
+ [2024-09-02 22:30:08,010][Main][INFO] - [train] Step 5800 out of 65536 | Loss --> 2.999 | Grad_l2 --> 0.529 | Weights_l2 --> 6977.095 | Lr --> 0.016 | Seconds_per_step --> 3.310 |
+ [2024-09-02 22:35:40,260][Main][INFO] - [train] Step 5900 out of 65536 | Loss --> 2.953 | Grad_l2 --> 0.516 | Weights_l2 --> 6981.522 | Lr --> 0.016 | Seconds_per_step --> 3.322 |
+ [2024-09-02 22:41:12,494][Main][INFO] - [train] Step 6000 out of 65536 | Loss --> 2.924 | Grad_l2 --> 0.514 | Weights_l2 --> 6985.860 | Lr --> 0.016 | Seconds_per_step --> 3.322 |
+ [2024-09-02 22:46:43,439][Main][INFO] - [train] Step 6100 out of 65536 | Loss --> 2.904 | Grad_l2 --> 0.500 | Weights_l2 --> 6990.209 | Lr --> 0.016 | Seconds_per_step --> 3.309 |
+ [2024-09-02 22:52:15,361][Main][INFO] - [train] Step 6200 out of 65536 | Loss --> 2.885 | Grad_l2 --> 0.499 | Weights_l2 --> 6994.575 | Lr --> 0.016 | Seconds_per_step --> 3.319 |
+ [2024-09-02 22:57:47,371][Main][INFO] - [train] Step 6300 out of 65536 | Loss --> 2.860 | Grad_l2 --> 0.496 | Weights_l2 --> 6998.855 | Lr --> 0.016 | Seconds_per_step --> 3.320 |
+ [2024-09-02 23:03:18,243][Main][INFO] - [train] Step 6400 out of 65536 | Loss --> 2.828 | Grad_l2 --> 0.486 | Weights_l2 --> 7003.354 | Lr --> 0.016 | Seconds_per_step --> 3.309 |
+ [2024-09-02 23:08:50,256][Main][INFO] - [train] Step 6500 out of 65536 | Loss --> 2.823 | Grad_l2 --> 0.491 | Weights_l2 --> 7007.772 | Lr --> 0.017 | Seconds_per_step --> 3.320 |
+ [2024-09-02 23:14:21,254][Main][INFO] - [train] Step 6600 out of 65536 | Loss --> 2.801 | Grad_l2 --> 0.572 | Weights_l2 --> 7012.034 | Lr --> 0.017 | Seconds_per_step --> 3.310 |
+ [2024-09-02 23:19:53,383][Main][INFO] - [train] Step 6700 out of 65536 | Loss --> 2.776 | Grad_l2 --> 0.473 | Weights_l2 --> 7016.624 | Lr --> 0.017 | Seconds_per_step --> 3.321 |
+ [2024-09-02 23:25:25,894][Main][INFO] - [train] Step 6800 out of 65536 | Loss --> 2.764 | Grad_l2 --> 0.489 | Weights_l2 --> 7021.128 | Lr --> 0.017 | Seconds_per_step --> 3.325 |
+ [2024-09-02 23:30:56,990][Main][INFO] - [train] Step 6900 out of 65536 | Loss --> 2.754 | Grad_l2 --> 0.467 | Weights_l2 --> 7025.909 | Lr --> 0.017 | Seconds_per_step --> 3.311 |
+ [2024-09-02 23:36:28,837][Main][INFO] - [train] Step 7000 out of 65536 | Loss --> 2.716 | Grad_l2 --> 0.469 | Weights_l2 --> 7030.583 | Lr --> 0.017 | Seconds_per_step --> 3.318 |
+ [2024-09-02 23:42:00,897][Main][INFO] - [train] Step 7100 out of 65536 | Loss --> 2.706 | Grad_l2 --> 0.470 | Weights_l2 --> 7035.338 | Lr --> 0.017 | Seconds_per_step --> 3.321 |
+ [2024-09-02 23:47:31,913][Main][INFO] - [train] Step 7200 out of 65536 | Loss --> 2.685 | Grad_l2 --> 0.460 | Weights_l2 --> 7040.107 | Lr --> 0.017 | Seconds_per_step --> 3.310 |
+ [2024-09-02 23:53:04,028][Main][INFO] - [train] Step 7300 out of 65536 | Loss --> 2.675 | Grad_l2 --> 0.462 | Weights_l2 --> 7044.921 | Lr --> 0.017 | Seconds_per_step --> 3.321 |
+ [2024-09-02 23:58:35,224][Main][INFO] - [train] Step 7400 out of 65536 | Loss --> 2.670 | Grad_l2 --> 0.473 | Weights_l2 --> 7049.994 | Lr --> 0.017 | Seconds_per_step --> 3.312 |
+ [2024-09-03 00:04:07,495][Main][INFO] - [train] Step 7500 out of 65536 | Loss --> 2.653 | Grad_l2 --> 0.452 | Weights_l2 --> 7055.123 | Lr --> 0.018 | Seconds_per_step --> 3.323 |
+ [2024-09-03 00:09:39,687][Main][INFO] - [train] Step 7600 out of 65536 | Loss --> 2.644 | Grad_l2 --> 0.499 | Weights_l2 --> 7060.263 | Lr --> 0.018 | Seconds_per_step --> 3.322 |
+ [2024-09-03 00:15:11,125][Main][INFO] - [train] Step 7700 out of 65536 | Loss --> 2.619 | Grad_l2 --> 0.451 | Weights_l2 --> 7065.593 | Lr --> 0.018 | Seconds_per_step --> 3.314 |
+ [2024-09-03 00:20:43,656][Main][INFO] - [train] Step 7800 out of 65536 | Loss --> 2.611 | Grad_l2 --> 0.444 | Weights_l2 --> 7071.016 | Lr --> 0.018 | Seconds_per_step --> 3.325 |
+ [2024-09-03 00:26:15,825][Main][INFO] - [train] Step 7900 out of 65536 | Loss --> 2.593 | Grad_l2 --> 0.444 | Weights_l2 --> 7076.338 | Lr --> 0.018 | Seconds_per_step --> 3.322 |
+ [2024-09-03 00:31:46,986][Main][INFO] - [train] Step 8000 out of 65536 | Loss --> 2.591 | Grad_l2 --> 0.707 | Weights_l2 --> 7081.619 | Lr --> 0.018 | Seconds_per_step --> 3.312 |
+ [2024-09-03 00:37:19,240][Main][INFO] - [train] Step 8100 out of 65536 | Loss --> 2.583 | Grad_l2 --> 0.504 | Weights_l2 --> 7087.303 | Lr --> 0.018 | Seconds_per_step --> 3.323 |
+ [2024-09-03 00:42:50,497][Main][INFO] - [train] Step 8200 out of 65536 | Loss --> 2.572 | Grad_l2 --> 0.435 | Weights_l2 --> 7092.976 | Lr --> 0.018 | Seconds_per_step --> 3.313 |
+ [2024-09-03 00:48:22,669][Main][INFO] - [train] Step 8300 out of 65536 | Loss --> 2.550 | Grad_l2 --> 0.444 | Weights_l2 --> 7098.242 | Lr --> 0.018 | Seconds_per_step --> 3.322 |
+ [2024-09-03 00:53:54,859][Main][INFO] - [train] Step 8400 out of 65536 | Loss --> 2.533 | Grad_l2 --> 0.424 | Weights_l2 --> 7103.870 | Lr --> 0.018 | Seconds_per_step --> 3.322 |
+ [2024-09-03 00:59:25,959][Main][INFO] - [train] Step 8500 out of 65536 | Loss --> 2.520 | Grad_l2 --> 0.415 | Weights_l2 --> 7109.426 | Lr --> 0.019 | Seconds_per_step --> 3.311 |
+ [2024-09-03 01:04:58,102][Main][INFO] - [train] Step 8600 out of 65536 | Loss --> 2.512 | Grad_l2 --> 0.445 | Weights_l2 --> 7115.243 | Lr --> 0.019 | Seconds_per_step --> 3.321 |
+ [2024-09-03 01:10:30,308][Main][INFO] - [train] Step 8700 out of 65536 | Loss --> 2.497 | Grad_l2 --> 0.416 | Weights_l2 --> 7120.917 | Lr --> 0.019 | Seconds_per_step --> 3.322 |
+ [2024-09-03 01:16:01,412][Main][INFO] - [train] Step 8800 out of 65536 | Loss --> 2.503 | Grad_l2 --> 0.453 | Weights_l2 --> 7127.067 | Lr --> 0.019 | Seconds_per_step --> 3.311 |
+ [2024-09-03 01:21:33,679][Main][INFO] - [train] Step 8900 out of 65536 | Loss --> 2.498 | Grad_l2 --> 0.519 | Weights_l2 --> 7133.268 | Lr --> 0.019 | Seconds_per_step --> 3.323 |
+ [2024-09-03 01:27:05,633][Main][INFO] - [train] Step 9000 out of 65536 | Loss --> 2.480 | Grad_l2 --> 0.413 | Weights_l2 --> 7139.449 | Lr --> 0.019 | Seconds_per_step --> 3.320 |
+ [2024-09-03 01:32:36,839][Main][INFO] - [train] Step 9100 out of 65536 | Loss --> 2.488 | Grad_l2 --> 0.429 | Weights_l2 --> 7145.663 | Lr --> 0.019 | Seconds_per_step --> 3.312 |
+ [2024-09-03 01:38:09,090][Main][INFO] - [train] Step 9200 out of 65536 | Loss --> 2.458 | Grad_l2 --> 0.651 | Weights_l2 --> 7151.751 | Lr --> 0.019 | Seconds_per_step --> 3.322 |
+ [2024-09-03 01:43:40,183][Main][INFO] - [train] Step 9300 out of 65536 | Loss --> 2.481 | Grad_l2 --> 0.667 | Weights_l2 --> 7157.979 | Lr --> 0.019 | Seconds_per_step --> 3.311 |
+ [2024-09-03 01:49:12,323][Main][INFO] - [train] Step 9400 out of 65536 | Loss --> 2.454 | Grad_l2 --> 0.500 | Weights_l2 --> 7164.722 | Lr --> 0.019 | Seconds_per_step --> 3.321 |
+ [2024-09-03 01:54:44,360][Main][INFO] - [train] Step 9500 out of 65536 | Loss --> 2.434 | Grad_l2 --> 0.434 | Weights_l2 --> 7171.100 | Lr --> 0.020 | Seconds_per_step --> 3.320 |
+ [2024-09-03 02:00:15,384][Main][INFO] - [train] Step 9600 out of 65536 | Loss --> 2.430 | Grad_l2 --> 0.459 | Weights_l2 --> 7177.669 | Lr --> 0.020 | Seconds_per_step --> 3.310 |
+ [2024-09-03 02:05:47,653][Main][INFO] - [train] Step 9700 out of 65536 | Loss --> 2.435 | Grad_l2 --> 0.458 | Weights_l2 --> 7184.407 | Lr --> 0.020 | Seconds_per_step --> 3.323 |
+ [2024-09-03 02:11:19,839][Main][INFO] - [train] Step 9800 out of 65536 | Loss --> 2.431 | Grad_l2 --> 0.796 | Weights_l2 --> 7190.992 | Lr --> 0.020 | Seconds_per_step --> 3.322 |
+ [2024-09-03 02:16:50,929][Main][INFO] - [train] Step 9900 out of 65536 | Loss --> 2.403 | Grad_l2 --> 0.782 | Weights_l2 --> 7197.863 | Lr --> 0.020 | Seconds_per_step --> 3.311 |
+ [2024-09-03 02:22:23,236][Main][INFO] - [train] Step 10000 out of 65536 | Loss --> 2.445 | Grad_l2 --> 1.140 | Weights_l2 --> 7204.637 | Lr --> 0.020 | Seconds_per_step --> 3.323 |
+ [2024-09-03 02:22:23,238][accelerate.accelerator][INFO] - Saving current state to checkpoint-pt-10000
+ [2024-09-03 02:22:23,245][accelerate.utils.other][WARNING] - Removed shared tensor {'lm_head.weight'} while saving. This should be OK, but check by verifying that you don't receive any warning while reloading
+ [2024-09-03 02:22:29,395][accelerate.checkpointing][INFO] - Model weights saved in checkpoint-pt-10000/model.safetensors
+ [2024-09-03 02:22:38,780][accelerate.checkpointing][INFO] - Optimizer state saved in checkpoint-pt-10000/optimizer.bin
+ [2024-09-03 02:22:38,784][accelerate.checkpointing][INFO] - Scheduler state saved in checkpoint-pt-10000/scheduler.bin
+ [2024-09-03 02:22:38,784][accelerate.checkpointing][INFO] - Sampler state for dataloader 0 saved in checkpoint-pt-10000/sampler.bin
+ [2024-09-03 02:22:38,785][accelerate.checkpointing][INFO] - Sampler state for dataloader 1 saved in checkpoint-pt-10000/sampler_1.bin
+ [2024-09-03 02:22:38,790][accelerate.checkpointing][INFO] - Random states saved in checkpoint-pt-10000/random_states_0.pkl
+ [2024-09-03 02:28:09,713][Main][INFO] - [train] Step 10100 out of 65536 | Loss --> 2.441 | Grad_l2 --> 1.063 | Weights_l2 --> 7212.671 | Lr --> 0.020 | Seconds_per_step --> 3.465 |
+ [2024-09-03 02:33:42,096][Main][INFO] - [train] Step 10200 out of 65536 | Loss --> 2.421 | Grad_l2 --> 1.135 | Weights_l2 --> 7219.539 | Lr --> 0.020 | Seconds_per_step --> 3.324 |
+ [2024-09-03 02:39:14,331][Main][INFO] - [train] Step 10300 out of 65536 | Loss --> 2.408 | Grad_l2 --> 1.377 | Weights_l2 --> 7226.397 | Lr --> 0.020 | Seconds_per_step --> 3.322 |
+ [2024-09-03 02:44:45,309][Main][INFO] - [train] Step 10400 out of 65536 | Loss --> 2.385 | Grad_l2 --> 1.568 | Weights_l2 --> 7232.973 | Lr --> 0.020 | Seconds_per_step --> 3.310 |
+ [2024-09-03 02:50:17,356][Main][INFO] - [train] Step 10500 out of 65536 | Loss --> 2.383 | Grad_l2 --> 5.267 | Weights_l2 --> 7238.788 | Lr --> 0.020 | Seconds_per_step --> 3.320 |
+ [2024-09-03 02:55:49,191][Main][INFO] - [train] Step 10600 out of 65536 | Loss --> 51.695 | Grad_l2 --> 2316.455 | Weights_l2 --> 7233.899 | Lr --> 0.020 | Seconds_per_step --> 3.318 |
+ [2024-09-03 03:01:20,350][Main][INFO] - [train] Step 10700 out of 65536 | Loss --> 19.189 | Grad_l2 --> 206.407 | Weights_l2 --> 7221.798 | Lr --> 0.020 | Seconds_per_step --> 3.312 |
+ [2024-09-03 03:06:52,743][Main][INFO] - [train] Step 10800 out of 65536 | Loss --> 6.908 | Grad_l2 --> 26.249 | Weights_l2 --> 7210.980 | Lr --> 0.020 | Seconds_per_step --> 3.324 |
+ [2024-09-03 03:12:23,733][Main][INFO] - [train] Step 10900 out of 65536 | Loss --> 42.736 | Grad_l2 --> 1292.659 | Weights_l2 --> 7206.464 | Lr --> 0.020 | Seconds_per_step --> 3.310 |
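The log ends in a divergence: at step 10600 the loss jumps from 2.38 to 51.7 and the gradient L2 spikes to 2316, right after the 10,000-step warmup tops out at the 0.02 base LR, which is presumably why the last clean state, checkpoint-pt-10000, is the one of interest. A minimal resume sketch with Accelerate, assuming the model, optimizer, scheduler, and dataloaders are rebuilt and prepared exactly as in training:

```python
from accelerate import Accelerator

accelerator = Accelerator(mixed_precision="bf16")
# model, optimizer, scheduler, train_dl, eval_dl = accelerator.prepare(...)
accelerator.load_state("checkpoint-pt-10000")
# restores optimizer.bin, scheduler.bin, both sampler states, and
# random_states_0.pkl listed in the log above
```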
checkpoints/wandb/debug-internal.log ADDED
The diff for this file is too large to render. See raw diff
 
checkpoints/wandb/debug.log ADDED
@@ -0,0 +1,28 @@
+ 2024-09-02 17:03:04,741 INFO MainThread:6499 [wandb_setup.py:_flush():77] Current SDK version is 0.17.8
+ 2024-09-02 17:03:04,741 INFO MainThread:6499 [wandb_setup.py:_flush():77] Configure stats pid to 6499
+ 2024-09-02 17:03:04,741 INFO MainThread:6499 [wandb_setup.py:_flush():77] Loading settings from /root/.config/wandb/settings
+ 2024-09-02 17:03:04,741 INFO MainThread:6499 [wandb_setup.py:_flush():77] Loading settings from /workspace/nanoT5/logs/2024-09-02/17-03-02/wandb/settings
+ 2024-09-02 17:03:04,742 INFO MainThread:6499 [wandb_setup.py:_flush():77] Loading settings from environment variables: {}
+ 2024-09-02 17:03:04,742 INFO MainThread:6499 [wandb_setup.py:_flush():77] Applying setup settings: {'_disable_service': False}
+ 2024-09-02 17:03:04,742 WARNING MainThread:6499 [wandb_setup.py:_flush():77] Could not find program at -m nanoT5.main
+ 2024-09-02 17:03:04,743 INFO MainThread:6499 [wandb_setup.py:_flush():77] Inferring run settings from compute environment: {'program_relpath': None, 'program': '-m nanoT5.main'}
+ 2024-09-02 17:03:04,743 INFO MainThread:6499 [wandb_setup.py:_flush():77] Applying login settings: {}
+ 2024-09-02 17:03:04,743 INFO MainThread:6499 [wandb_init.py:_log_setup():524] Logging user logs to /workspace/nanoT5/logs/2024-09-02/17-03-02/wandb/run-20240902_170304-v43qltex/logs/debug.log
+ 2024-09-02 17:03:04,744 INFO MainThread:6499 [wandb_init.py:_log_setup():525] Logging internal logs to /workspace/nanoT5/logs/2024-09-02/17-03-02/wandb/run-20240902_170304-v43qltex/logs/debug-internal.log
+ 2024-09-02 17:03:04,744 INFO MainThread:6499 [wandb_init.py:init():607] calling init triggers
+ 2024-09-02 17:03:04,744 INFO MainThread:6499 [wandb_init.py:init():614] wandb.init called with sweep_config: {}
+ config: {'mode': 'pt', 'device': 'gpu', 'precision': 'bf16', 'eval_only': False, 'predict_only': False, 'seed': 2137, 'tokenizer': {'name': 'BEE-spoke-data/slimpajama_tok-48128-BPE-forT5'}, 'model': {'klass': 'custom_seq2seq', 'name': 'google/t5-v1_1-base', 'overwrite': None, 'add_config': None, 'checkpoint_path': '', 'random_init': True, 'compile': True}, 'data': {'input_length': 1024, 'mlm_probability': 0.15, 'mean_noise_span_length': 3.0, 'num_workers': 8}, 'optim': {'name': 'adamwscale', 'base_lr': 0.02, 'batch_size': 64, 'total_steps': 65536, 'epochs': -1, 'warmup_steps': 10000, 'lr_scheduler': 'cosine', 'weight_decay': 0.001, 'grad_clip': 1.0, 'grad_acc': 4, 'final_cosine': 1e-05}, 'eval': {'every_steps': 100000, 'steps': 500}, 'checkpoint': {'every_steps': 5000}, 'logging': {'every_steps': 100, 'grad_l2': True, 'weights_l2': True, 'use_wandb': True, 'wandb_config': {'project': 'nano-custom-seq2seq', 'entity': 'amazingvince', 'tags': ['nanoT5', 'my_tag'], 'mode': 'online'}}, 'slurm_id': 'none', 'working_dir': '/workspace/nanoT5/logs/2024-09-02/17-03-02'}
+ 2024-09-02 17:03:04,745 INFO MainThread:6499 [wandb_init.py:init():657] starting backend
+ 2024-09-02 17:03:04,745 INFO MainThread:6499 [wandb_init.py:init():661] setting up manager
+ 2024-09-02 17:03:04,760 INFO MainThread:6499 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+ 2024-09-02 17:03:04,761 INFO MainThread:6499 [wandb_init.py:init():669] backend started and connected
+ 2024-09-02 17:03:04,776 INFO MainThread:6499 [wandb_init.py:init():767] updated telemetry
+ 2024-09-02 17:03:04,819 INFO MainThread:6499 [wandb_init.py:init():800] communicating run to backend with 90.0 second timeout
+ 2024-09-02 17:03:05,519 INFO MainThread:6499 [wandb_init.py:init():851] starting run threads in backend
+ 2024-09-02 17:03:05,817 INFO MainThread:6499 [wandb_run.py:_console_start():2463] atexit reg
+ 2024-09-02 17:03:05,818 INFO MainThread:6499 [wandb_run.py:_redirect():2309] redirect: wrap_raw
+ 2024-09-02 17:03:05,819 INFO MainThread:6499 [wandb_run.py:_redirect():2374] Wrapping output streams.
+ 2024-09-02 17:03:05,819 INFO MainThread:6499 [wandb_run.py:_redirect():2399] Redirects installed.
+ 2024-09-02 17:03:05,822 INFO MainThread:6499 [wandb_init.py:init():894] run started, returning control to user process
+ 2024-09-02 17:03:35,512 INFO MainThread:6499 [wandb_run.py:_config_callback():1392] config_cb None None {'mode': 'pt', 'device': 'gpu', 'precision': 'bf16', 'eval_only': False, 'predict_only': False, 'seed': 2137, 'tokenizer': {'name': 'BEE-spoke-data/slimpajama_tok-48128-BPE-forT5'}, 'model': {'klass': 'custom_seq2seq', 'name': 'google/t5-v1_1-base', 'overwrite': None, 'add_config': None, 'checkpoint_path': '', 'random_init': True, 'compile': True}, 'data': {'input_length': 1024, 'mlm_probability': 0.15, 'mean_noise_span_length': 3.0, 'num_workers': 8, 'before_mask_input_length': 1137, 'target_length': 229}, 'optim': {'name': 'adamwscale', 'base_lr': 0.02, 'batch_size': 64, 'total_steps': 65536, 'epochs': -1, 'warmup_steps': 10000, 'lr_scheduler': 'cosine', 'weight_decay': 0.001, 'grad_clip': 1.0, 'grad_acc': 4, 'final_cosine': 1e-05}, 'eval': {'every_steps': 100000, 'steps': 500, 'corrected_steps': 500}, 'checkpoint': {'every_steps': 5000}, 'logging': {'every_steps': 100, 'grad_l2': True, 'weights_l2': True, 'use_wandb': True, 'wandb_config': {'project': 'nano-custom-seq2seq', 'entity': 'amazingvince', 'tags': ['nanoT5', 'my_tag'], 'mode': 'online'}}, 'slurm_id': 'none', 'working_dir': '/workspace/nanoT5/logs/2024-09-02/17-03-02', 'n_all_param': 673076736}
+ 2024-09-03 03:17:10,763 WARNING MsgRouterThr:6499 [router.py:message_loop():77] message_loop has been closed
checkpoints/wandb/run-20240902_170304-v43qltex/files/config.yaml ADDED
@@ -0,0 +1,132 @@
+ wandb_version: 1
+
+ mode:
+   desc: null
+   value: pt
+ device:
+   desc: null
+   value: gpu
+ precision:
+   desc: null
+   value: bf16
+ eval_only:
+   desc: null
+   value: false
+ predict_only:
+   desc: null
+   value: false
+ seed:
+   desc: null
+   value: 2137
+ tokenizer:
+   desc: null
+   value:
+     name: BEE-spoke-data/slimpajama_tok-48128-BPE-forT5
+ model:
+   desc: null
+   value:
+     klass: custom_seq2seq
+     name: google/t5-v1_1-base
+     overwrite: null
+     add_config: null
+     checkpoint_path: ''
+     random_init: true
+     compile: true
+ data:
+   desc: null
+   value:
+     input_length: 1024
+     mlm_probability: 0.15
+     mean_noise_span_length: 3.0
+     num_workers: 8
+     before_mask_input_length: 1137
+     target_length: 229
+ optim:
+   desc: null
+   value:
+     name: adamwscale
+     base_lr: 0.02
+     batch_size: 64
+     total_steps: 65536
+     epochs: -1
+     warmup_steps: 10000
+     lr_scheduler: cosine
+     weight_decay: 0.001
+     grad_clip: 1.0
+     grad_acc: 4
+     final_cosine: 1.0e-05
+ eval:
+   desc: null
+   value:
+     every_steps: 100000
+     steps: 500
+     corrected_steps: 500
+ checkpoint:
+   desc: null
+   value:
+     every_steps: 5000
+ logging:
+   desc: null
+   value:
+     every_steps: 100
+     grad_l2: true
+     weights_l2: true
+     use_wandb: true
+     wandb_config:
+       project: nano-custom-seq2seq
+       entity: amazingvince
+       tags:
+       - nanoT5
+       - my_tag
+       mode: online
+ slurm_id:
+   desc: null
+   value: none
+ working_dir:
+   desc: null
+   value: /workspace/nanoT5/logs/2024-09-02/17-03-02
+ _wandb:
+   desc: null
+   value:
+     python_version: 3.11.9
+     cli_version: 0.17.8
+     framework: huggingface
+     huggingface_version: 4.44.2
+     is_jupyter_run: false
+     is_kaggle_kernel: false
+     start_time: 1725296584
+     t:
+       1:
+       - 1
+       - 11
+       - 41
+       - 49
+       - 50
+       - 51
+       - 55
+       - 71
+       - 100
+       2:
+       - 1
+       - 11
+       - 41
+       - 49
+       - 50
+       - 51
+       - 55
+       - 71
+       - 100
+       3:
+       - 15
+       - 16
+       - 23
+       - 61
+       4: 3.11.9
+       5: 0.17.8
+       6: 4.44.2
+       8:
+       - 5
+       13: linux-x86_64
+ n_all_param:
+   desc: null
+   value: 673076736
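Unlike the Hydra dump, this W&B snapshot records the derived packing lengths (`before_mask_input_length: 1137`, `target_length: 229`). They follow from `input_length`, `mlm_probability`, and `mean_noise_span_length` via the standard T5 span-corruption arithmetic; a sketch of that calculation (assuming, as in the original `random_spans_helper`, one sentinel token per noise span plus an EOS token):

```python
def corrupted_lengths(tokens_length, noise_density=0.15, mean_span_length=3.0):
    num_noise = int(round(tokens_length * noise_density))
    num_spans = int(round(num_noise / mean_span_length))
    num_nonnoise = tokens_length - num_noise
    # inputs: non-noise tokens + one sentinel per span + EOS
    # targets: noise tokens + one sentinel per span + EOS
    return num_nonnoise + num_spans + 1, num_noise + num_spans + 1

print(corrupted_lengths(1137))  # (1024, 229): 1137 raw tokens pack into
                                # exactly input_length=1024 and target_length=229
```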
checkpoints/wandb/run-20240902_170304-v43qltex/files/output.log ADDED
@@ -0,0 +1,253 @@
+ loading file tokenizer.model from cache at /root/.cache/huggingface/hub/models--BEE-spoke-data--slimpajama_tok-48128-BPE-forT5/snapshots/daab53ce6ef3ec52824b04e85fe8cf762739b407/tokenizer.model
+ loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--BEE-spoke-data--slimpajama_tok-48128-BPE-forT5/snapshots/daab53ce6ef3ec52824b04e85fe8cf762739b407/tokenizer.json
+ loading file added_tokens.json from cache at /root/.cache/huggingface/hub/models--BEE-spoke-data--slimpajama_tok-48128-BPE-forT5/snapshots/daab53ce6ef3ec52824b04e85fe8cf762739b407/added_tokens.json
+ loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--BEE-spoke-data--slimpajama_tok-48128-BPE-forT5/snapshots/daab53ce6ef3ec52824b04e85fe8cf762739b407/special_tokens_map.json
+ loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--BEE-spoke-data--slimpajama_tok-48128-BPE-forT5/snapshots/daab53ce6ef3ec52824b04e85fe8cf762739b407/tokenizer_config.json
+ Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
+ Resolving data files: 100%|████████████████| 104/104 [00:00<00:00, 237.82it/s]
+ Resolving data files: 100%|████████████████| 234/234 [00:00<00:00, 158787.76it/s]
+ Resolving data files: 100%|████████████████| 17/17 [00:00<00:00, 45.39it/s]
+ Resolving data files: 100%|████████████████| 17/17 [00:00<00:00, 50142.87it/s]
+ Resolving data files: 100%|████████████████| 104/104 [00:00<00:00, 96827.44it/s]
+ Resolving data files: 100%|████████████████| 104/104 [00:00<00:00, 98644.87it/s]
+
+
+ Resolving data files: 100%|████████████████| 1024/1024 [00:04<00:00, 223.35it/s]
+ Resolving data files: 100%|████████████████| 1024/1024 [00:01<00:00, 908.08it/s]
+ Configuration saved in ./config.json
+ =========================================================================
+ Layer (type:depth-idx)          Output Shape          Param #     Trainable
+ =========================================================================
+ CustomSeq2SeqLLM                                      673,076,736 True
+ Embedding                                             49,414,144  True
+ CustomEncoder                                         193,012,736 True
+ ModuleList                                            193,011,712 True
+ EncoderLayer                                          12,063,232  True
+ EncoderLayer                                          12,063,232  True
+ EncoderLayer                                          12,063,232  True
+ EncoderLayer                                          12,063,232  True
+ EncoderLayer                                          12,063,232  True
+ EncoderLayer                                          12,063,232  True
+ EncoderLayer                                          12,063,232  True
+ EncoderLayer                                          12,063,232  True
+ EncoderLayer                                          12,063,232  True
+ EncoderLayer                                          12,063,232  True
+ EncoderLayer                                          12,063,232  True
+ EncoderLayer                                          12,063,232  True
+ EncoderLayer                                          12,063,232  True
+ EncoderLayer                                          12,063,232  True
+ EncoderLayer                                          12,063,232  True
+ EncoderLayer                                          12,063,232  True
+ RMSNorm                                               1,024       True
+ CustomDecoder                                         430,649,856 True
+ ModuleList                                            430,648,832 True
+ DecoderLayer                                          14,688,256  True
+ DecoderLayer                                          12,063,232  True
+ DecoderLayer                                          14,688,256  True
+ DecoderLayer                                          12,063,232  True
+ DecoderLayer                                          14,688,256  True
+ DecoderLayer                                          12,063,232  True
+ DecoderLayer                                          14,688,256  True
+ DecoderLayer                                          12,063,232  True
+ DecoderLayer                                          14,688,256  True
+ DecoderLayer                                          12,063,232  True
+ DecoderLayer                                          14,688,256  True
+ DecoderLayer                                          12,063,232  True
+ DecoderLayer                                          14,688,256  True
+ DecoderLayer                                          12,063,232  True
+ DecoderLayer                                          14,688,256  True
+ DecoderLayer                                          12,063,232  True
+ DecoderLayer                                          14,688,256  True
+ DecoderLayer                                          12,063,232  True
+ DecoderLayer                                          14,688,256  True
+ DecoderLayer                                          12,063,232  True
+ DecoderLayer                                          14,688,256  True
+ DecoderLayer                                          12,063,232  True
+ DecoderLayer                                          14,688,256  True
+ DecoderLayer                                          12,063,232  True
+ DecoderLayer                                          14,688,256  True
+ DecoderLayer                                          12,063,232  True
+ DecoderLayer                                          14,688,256  True
+ DecoderLayer                                          12,063,232  True
+ DecoderLayer                                          14,688,256  True
+ DecoderLayer                                          12,063,232  True
+ DecoderLayer                                          14,688,256  True
+ DecoderLayer                                          14,688,256  True
+ RMSNorm                                               1,024       True
+ Linear                                                49,414,144  True
+ LigerCrossEntropyLoss                                 --          False
+ =========================================================================
+ Total params: 673,076,736
+ Trainable params: 673,076,736
+ Non-trainable params: --
+ =========================================================================
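The layer counts and totals in this summary are internally consistent; a quick check in plain Python, using only numbers printed above and in config.json. The reading of the two DecoderLayer sizes is an assumption: the 17 larger layers presumably carry an extra (e.g. cross-attention) block that the 15 smaller ones lack.

```python
# Sanity-checking the layer summary against config.json.
emb = 48256 * 1024                 # vocab_size * hidden_size
assert emb == 49_414_144           # Embedding row (and the Linear lm_head row)

encoder = 16 * 12_063_232 + 1_024  # 16 EncoderLayers + final RMSNorm
assert encoder == 193_012_736      # CustomEncoder row

# 32 DecoderLayers in two sizes: 17 larger, 15 smaller (+ final RMSNorm).
decoder = 17 * 14_688_256 + 15 * 12_063_232 + 1_024
assert decoder == 430_649_856      # CustomDecoder row

# lm_head.weight is tied to the embedding (see the "Removed shared tensor"
# warning in main.log), so it is not counted twice in the grand total.
assert emb + encoder + decoder == 673_076_736
```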
+ W0902 17:08:31.372000 140679957673088 torch/_dynamo/variables/tensor.py:715] [30/0] Graph break from `Tensor.item()`, consider setting:
+ W0902 17:08:31.372000 140679957673088 torch/_dynamo/variables/tensor.py:715] [30/0] torch._dynamo.config.capture_scalar_outputs = True
+ W0902 17:08:31.372000 140679957673088 torch/_dynamo/variables/tensor.py:715] [30/0] or:
+ W0902 17:08:31.372000 140679957673088 torch/_dynamo/variables/tensor.py:715] [30/0] env TORCHDYNAMO_CAPTURE_SCALAR_OUTPUTS=1
+ W0902 17:08:31.372000 140679957673088 torch/_dynamo/variables/tensor.py:715] [30/0] to include these operations in the captured graph.
+ W0902 17:08:31.372000 140679957673088 torch/_dynamo/variables/tensor.py:715] [30/0]
173
+ [2024-09-03 00:09:39,687][Main][INFO] - [train] Step 7600 out of 65536 | Loss --> 2.644 | Grad_l2 --> 0.499 | Weights_l2 --> 7060.263 | Lr --> 0.018 | Seconds_per_step --> 3.322 |
174
+ [2024-09-03 00:15:11,125][Main][INFO] - [train] Step 7700 out of 65536 | Loss --> 2.619 | Grad_l2 --> 0.451 | Weights_l2 --> 7065.593 | Lr --> 0.018 | Seconds_per_step --> 3.314 |
175
+ [2024-09-03 00:20:43,656][Main][INFO] - [train] Step 7800 out of 65536 | Loss --> 2.611 | Grad_l2 --> 0.444 | Weights_l2 --> 7071.016 | Lr --> 0.018 | Seconds_per_step --> 3.325 |
176
+ [2024-09-03 00:26:15,825][Main][INFO] - [train] Step 7900 out of 65536 | Loss --> 2.593 | Grad_l2 --> 0.444 | Weights_l2 --> 7076.338 | Lr --> 0.018 | Seconds_per_step --> 3.322 |
177
+ [2024-09-03 00:31:46,986][Main][INFO] - [train] Step 8000 out of 65536 | Loss --> 2.591 | Grad_l2 --> 0.707 | Weights_l2 --> 7081.619 | Lr --> 0.018 | Seconds_per_step --> 3.312 |
178
+ [2024-09-03 00:37:19,240][Main][INFO] - [train] Step 8100 out of 65536 | Loss --> 2.583 | Grad_l2 --> 0.504 | Weights_l2 --> 7087.303 | Lr --> 0.018 | Seconds_per_step --> 3.323 |
179
+ [2024-09-03 00:42:50,497][Main][INFO] - [train] Step 8200 out of 65536 | Loss --> 2.572 | Grad_l2 --> 0.435 | Weights_l2 --> 7092.976 | Lr --> 0.018 | Seconds_per_step --> 3.313 |
180
+ [2024-09-03 00:48:22,669][Main][INFO] - [train] Step 8300 out of 65536 | Loss --> 2.550 | Grad_l2 --> 0.444 | Weights_l2 --> 7098.242 | Lr --> 0.018 | Seconds_per_step --> 3.322 |
181
+ [2024-09-03 00:53:54,859][Main][INFO] - [train] Step 8400 out of 65536 | Loss --> 2.533 | Grad_l2 --> 0.424 | Weights_l2 --> 7103.870 | Lr --> 0.018 | Seconds_per_step --> 3.322 |
182
+ [2024-09-03 00:59:25,959][Main][INFO] - [train] Step 8500 out of 65536 | Loss --> 2.520 | Grad_l2 --> 0.415 | Weights_l2 --> 7109.426 | Lr --> 0.019 | Seconds_per_step --> 3.311 |
183
+ [2024-09-03 01:04:58,102][Main][INFO] - [train] Step 8600 out of 65536 | Loss --> 2.512 | Grad_l2 --> 0.445 | Weights_l2 --> 7115.243 | Lr --> 0.019 | Seconds_per_step --> 3.321 |
184
+ [2024-09-03 01:10:30,308][Main][INFO] - [train] Step 8700 out of 65536 | Loss --> 2.497 | Grad_l2 --> 0.416 | Weights_l2 --> 7120.917 | Lr --> 0.019 | Seconds_per_step --> 3.322 |
185
+ [2024-09-03 01:16:01,412][Main][INFO] - [train] Step 8800 out of 65536 | Loss --> 2.503 | Grad_l2 --> 0.453 | Weights_l2 --> 7127.067 | Lr --> 0.019 | Seconds_per_step --> 3.311 |
186
+ [2024-09-03 01:21:33,679][Main][INFO] - [train] Step 8900 out of 65536 | Loss --> 2.498 | Grad_l2 --> 0.519 | Weights_l2 --> 7133.268 | Lr --> 0.019 | Seconds_per_step --> 3.323 |
187
+ [2024-09-03 01:27:05,633][Main][INFO] - [train] Step 9000 out of 65536 | Loss --> 2.480 | Grad_l2 --> 0.413 | Weights_l2 --> 7139.449 | Lr --> 0.019 | Seconds_per_step --> 3.320 |
188
+ [2024-09-03 01:32:36,839][Main][INFO] - [train] Step 9100 out of 65536 | Loss --> 2.488 | Grad_l2 --> 0.429 | Weights_l2 --> 7145.663 | Lr --> 0.019 | Seconds_per_step --> 3.312 |
189
+ [2024-09-03 01:38:09,090][Main][INFO] - [train] Step 9200 out of 65536 | Loss --> 2.458 | Grad_l2 --> 0.651 | Weights_l2 --> 7151.751 | Lr --> 0.019 | Seconds_per_step --> 3.322 |
190
+ [2024-09-03 01:43:40,183][Main][INFO] - [train] Step 9300 out of 65536 | Loss --> 2.481 | Grad_l2 --> 0.667 | Weights_l2 --> 7157.979 | Lr --> 0.019 | Seconds_per_step --> 3.311 |
191
+ [2024-09-03 01:49:12,323][Main][INFO] - [train] Step 9400 out of 65536 | Loss --> 2.454 | Grad_l2 --> 0.500 | Weights_l2 --> 7164.722 | Lr --> 0.019 | Seconds_per_step --> 3.321 |
192
+ [2024-09-03 01:54:44,360][Main][INFO] - [train] Step 9500 out of 65536 | Loss --> 2.434 | Grad_l2 --> 0.434 | Weights_l2 --> 7171.100 | Lr --> 0.020 | Seconds_per_step --> 3.320 |
193
+ [2024-09-03 02:00:15,384][Main][INFO] - [train] Step 9600 out of 65536 | Loss --> 2.430 | Grad_l2 --> 0.459 | Weights_l2 --> 7177.669 | Lr --> 0.020 | Seconds_per_step --> 3.310 |
194
+ [2024-09-03 02:05:47,653][Main][INFO] - [train] Step 9700 out of 65536 | Loss --> 2.435 | Grad_l2 --> 0.458 | Weights_l2 --> 7184.407 | Lr --> 0.020 | Seconds_per_step --> 3.323 |
195
+ [2024-09-03 02:11:19,839][Main][INFO] - [train] Step 9800 out of 65536 | Loss --> 2.431 | Grad_l2 --> 0.796 | Weights_l2 --> 7190.992 | Lr --> 0.020 | Seconds_per_step --> 3.322 |
196
+ [2024-09-03 02:16:50,929][Main][INFO] - [train] Step 9900 out of 65536 | Loss --> 2.403 | Grad_l2 --> 0.782 | Weights_l2 --> 7197.863 | Lr --> 0.020 | Seconds_per_step --> 3.311 |
197
+ [2024-09-03 02:22:23,236][Main][INFO] - [train] Step 10000 out of 65536 | Loss --> 2.445 | Grad_l2 --> 1.140 | Weights_l2 --> 7204.637 | Lr --> 0.020 | Seconds_per_step --> 3.323 |
198
+ [2024-09-03 02:22:23,238][accelerate.accelerator][INFO] - Saving current state to checkpoint-pt-10000
199
+ [2024-09-03 02:22:23,245][accelerate.utils.other][WARNING] - Removed shared tensor {'lm_head.weight'} while saving. This should be OK, but check by verifying that you don't receive any warning while reloading
200
+ [2024-09-03 02:22:29,395][accelerate.checkpointing][INFO] - Model weights saved in checkpoint-pt-10000/model.safetensors
201
+ [2024-09-03 02:22:38,780][accelerate.checkpointing][INFO] - Optimizer state saved in checkpoint-pt-10000/optimizer.bin
202
+ [2024-09-03 02:22:38,784][accelerate.checkpointing][INFO] - Scheduler state saved in checkpoint-pt-10000/scheduler.bin
203
+ [2024-09-03 02:22:38,784][accelerate.checkpointing][INFO] - Sampler state for dataloader 0 saved in checkpoint-pt-10000/sampler.bin
204
+ [2024-09-03 02:22:38,785][accelerate.checkpointing][INFO] - Sampler state for dataloader 1 saved in checkpoint-pt-10000/sampler_1.bin
205
+ [2024-09-03 02:22:38,790][accelerate.checkpointing][INFO] - Random states saved in checkpoint-pt-10000/random_states_0.pkl
206
+ [2024-09-03 02:28:09,713][Main][INFO] - [train] Step 10100 out of 65536 | Loss --> 2.441 | Grad_l2 --> 1.063 | Weights_l2 --> 7212.671 | Lr --> 0.020 | Seconds_per_step --> 3.465 |
207
+ [2024-09-03 02:33:42,096][Main][INFO] - [train] Step 10200 out of 65536 | Loss --> 2.421 | Grad_l2 --> 1.135 | Weights_l2 --> 7219.539 | Lr --> 0.020 | Seconds_per_step --> 3.324 |
208
+ [2024-09-03 02:39:14,331][Main][INFO] - [train] Step 10300 out of 65536 | Loss --> 2.408 | Grad_l2 --> 1.377 | Weights_l2 --> 7226.397 | Lr --> 0.020 | Seconds_per_step --> 3.322 |
209
+ [2024-09-03 02:44:45,309][Main][INFO] - [train] Step 10400 out of 65536 | Loss --> 2.385 | Grad_l2 --> 1.568 | Weights_l2 --> 7232.973 | Lr --> 0.020 | Seconds_per_step --> 3.310 |
210
+ [2024-09-03 02:50:17,356][Main][INFO] - [train] Step 10500 out of 65536 | Loss --> 2.383 | Grad_l2 --> 5.267 | Weights_l2 --> 7238.788 | Lr --> 0.020 | Seconds_per_step --> 3.320 |
211
+ [2024-09-03 02:55:49,191][Main][INFO] - [train] Step 10600 out of 65536 | Loss --> 51.695 | Grad_l2 --> 2316.455 | Weights_l2 --> 7233.899 | Lr --> 0.020 | Seconds_per_step --> 3.318 |
212
+ [2024-09-03 03:01:20,350][Main][INFO] - [train] Step 10700 out of 65536 | Loss --> 19.189 | Grad_l2 --> 206.407 | Weights_l2 --> 7221.798 | Lr --> 0.020 | Seconds_per_step --> 3.312 |
213
+ [2024-09-03 03:06:52,743][Main][INFO] - [train] Step 10800 out of 65536 | Loss --> 6.908 | Grad_l2 --> 26.249 | Weights_l2 --> 7210.980 | Lr --> 0.020 | Seconds_per_step --> 3.324 |
214
+ [2024-09-03 03:12:23,733][Main][INFO] - [train] Step 10900 out of 65536 | Loss --> 42.736 | Grad_l2 --> 1292.659 | Weights_l2 --> 7206.464 | Lr --> 0.020 | Seconds_per_step --> 3.310 |
215
+ Traceback (most recent call last):
216
+ File "<frozen runpy>", line 198, in _run_module_as_main
217
+ File "<frozen runpy>", line 88, in _run_code
218
+ File "/workspace/nanoT5/nanoT5/main.py", line 92, in <module>
219
+ main()
220
+ File "/usr/local/lib/python3.11/dist-packages/hydra/main.py", line 94, in decorated_main
221
+ _run_hydra(
222
+ File "/usr/local/lib/python3.11/dist-packages/hydra/_internal/utils.py", line 394, in _run_hydra
223
+ _run_app(
224
+ File "/usr/local/lib/python3.11/dist-packages/hydra/_internal/utils.py", line 457, in _run_app
225
+ run_and_report(
226
+ File "/usr/local/lib/python3.11/dist-packages/hydra/_internal/utils.py", line 220, in run_and_report
227
+ return func()
228
+ ^^^^^^
229
+ File "/usr/local/lib/python3.11/dist-packages/hydra/_internal/utils.py", line 458, in <lambda>
230
+ lambda: hydra.run(
231
+ ^^^^^^^^^^
232
+ File "/usr/local/lib/python3.11/dist-packages/hydra/_internal/hydra.py", line 119, in run
233
+ ret = run_job(
234
+ ^^^^^^^^
235
+ File "/usr/local/lib/python3.11/dist-packages/hydra/core/utils.py", line 186, in run_job
236
+ ret.return_value = task_function(task_cfg)
237
+ ^^^^^^^^^^^^^^^^^^^^^^^
238
+ File "/workspace/nanoT5/nanoT5/main.py", line 75, in main
239
+ train(
240
+ File "/workspace/nanoT5/nanoT5/utils/train_utils.py", line 197, in train
241
+ for batch_id, batch in enumerate(train_dataloader, start=1):
242
+ File "/usr/local/lib/python3.11/dist-packages/accelerate/data_loader.py", line 685, in __iter__
243
+ batch = send_to_device(batch, self.state.device, non_blocking=self._non_blocking)
244
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
245
+ File "/usr/local/lib/python3.11/dist-packages/accelerate/utils/operations.py", line 183, in send_to_device
246
+ {
247
+ File "/usr/local/lib/python3.11/dist-packages/accelerate/utils/operations.py", line 184, in <dictcomp>
248
+ k: t if k in skip_keys else send_to_device(t, device, non_blocking=non_blocking, skip_keys=skip_keys)
249
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
250
+ File "/usr/local/lib/python3.11/dist-packages/accelerate/utils/operations.py", line 155, in send_to_device
251
+ return tensor.to(device, non_blocking=non_blocking)
252
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
253
+ KeyboardInterrupt
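A note on the log above: the loss descends smoothly from 3.710 at step 4200 to 2.383 at step 10500, then blows up at step 10600 (Loss 51.695, Grad_l2 2316.455) just as the 10000-step warmup hands over to the peak learning rate of 0.02, and the process is finally stopped by hand (the KeyboardInterrupt). Since checkpoint-pt-10000 was written minutes before the divergence, the natural restart point is Accelerate's `load_state`. The sketch below is a self-contained stand-in, not the nanoT5 code: a toy Linear model demonstrates the same `save_state`/`load_state` round-trip that produced the model.safetensors/optimizer.bin files in the log.

```python
# Self-contained stand-in, not the nanoT5 code: a toy model demonstrating the
# accelerate save_state/load_state round-trip seen in the log above.
import torch
from accelerate import Accelerator

accelerator = Accelerator()
model = torch.nn.Linear(8, 8)                      # stand-in for the T5 model
optimizer = torch.optim.AdamW(model.parameters())  # stand-in optimizer
model, optimizer = accelerator.prepare(model, optimizer)

accelerator.save_state("checkpoint-pt-demo")  # writes model.safetensors, optimizer.bin, ...
accelerator.load_state("checkpoint-pt-demo")  # restores weights, optimizer and RNG states
```

A real resume would instead point `load_state` at checkpoint-pt-10000 after preparing the original model, optimizer, scheduler, and dataloaders.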
checkpoints/wandb/run-20240902_170304-v43qltex/files/requirements.txt ADDED
@@ -0,0 +1,195 @@
+ GitPython==3.1.43
+ Jinja2==3.1.4
+ MarkupSafe==2.1.5
+ PyGObject==3.42.1
+ PyJWT==2.3.0
+ PyYAML==6.0.2
+ Pygments==2.18.0
+ SecretStorage==3.3.1
+ Send2Trash==1.8.3
+ absl-py==2.1.0
+ accelerate==0.33.0
+ aiohappyeyeballs==2.4.0
+ aiohttp==3.10.5
+ aiosignal==1.3.1
+ antlr4-python3-runtime==4.9.3
+ anyio==4.4.0
+ argon2-cffi-bindings==21.2.0
+ argon2-cffi==23.1.0
+ arrow==1.3.0
+ asttokens==2.4.1
+ async-lru==2.0.4
+ attrs==24.2.0
+ babel==2.16.0
+ beautifulsoup4==4.12.3
+ bleach==6.1.0
+ blinker==1.4
+ certifi==2024.7.4
+ cffi==1.17.0
+ charset-normalizer==3.3.2
+ click==8.1.7
+ comm==0.2.2
+ cryptography==3.4.8
+ datasets==2.21.0
+ dbus-python==1.2.18
+ debugpy==1.8.5
+ decorator==5.1.1
+ defusedxml==0.7.1
+ dill==0.3.8
+ distro==1.7.0
+ docker-pycreds==0.4.0
+ einops==0.8.0
+ entrypoints==0.4
+ evaluate==0.4.2
+ executing==2.0.1
+ fancycompleter==0.9.1
+ fastjsonschema==2.20.0
+ filelock==3.15.4
+ flash-attn==2.6.3
+ fqdn==1.5.1
+ frozenlist==1.4.1
+ fsspec==2024.6.1
+ gitdb==4.0.11
+ h11==0.14.0
+ httpcore==1.0.5
+ httplib2==0.20.2
+ httpx==0.27.0
+ huggingface-hub==0.24.6
+ hydra-core==1.3.2
+ idna==3.7
+ importlib-metadata==4.6.4
+ ipykernel==6.29.5
+ ipython-genutils==0.2.0
+ ipython==8.26.0
+ ipywidgets==8.1.3
+ isoduration==20.11.0
+ jedi==0.19.1
+ jeepney==0.7.1
+ joblib==1.4.2
+ json5==0.9.25
+ jsonpointer==3.0.0
+ jsonschema-specifications==2023.12.1
+ jsonschema==4.23.0
+ jupyter-archive==3.4.0
+ jupyter-events==0.10.0
+ jupyter-highlight-selected-word==0.2.0
+ jupyter-lsp==2.2.5
+ jupyter_client==7.4.9
+ jupyter_contrib_core==0.4.2
+ jupyter_contrib_nbextensions==0.7.0
+ jupyter_core==5.7.2
+ jupyter_nbextensions_configurator==0.6.4
+ jupyter_server==2.14.2
+ jupyter_server_terminals==0.5.3
+ jupyterlab==4.2.4
+ jupyterlab_pygments==0.3.0
+ jupyterlab_server==2.27.3
+ jupyterlab_widgets==3.0.11
+ keyring==23.5.0
+ launchpadlib==1.10.16
+ lazr.restfulclient==0.14.4
+ lazr.uri==1.0.6
+ liger-kernel==0.2.1
+ lxml==5.3.0
+ matplotlib-inline==0.1.7
+ mistune==3.0.2
+ more-itertools==8.10.0
+ mpmath==1.3.0
+ multidict==6.0.5
+ multiprocess==0.70.16
+ nbclassic==1.1.0
+ nbclient==0.10.0
+ nbconvert==7.16.4
+ nbformat==5.10.4
+ nest-asyncio==1.6.0
+ networkx==3.3
+ nltk==3.9.1
+ notebook==6.5.5
+ notebook_shim==0.2.4
+ numpy==1.26.4
+ nvidia-cublas-cu12==12.1.3.1
+ nvidia-cuda-cupti-cu12==12.1.105
+ nvidia-cuda-nvrtc-cu12==12.1.105
+ nvidia-cuda-runtime-cu12==12.1.105
+ nvidia-cudnn-cu12==9.1.0.70
+ nvidia-cufft-cu12==11.0.2.54
+ nvidia-curand-cu12==10.3.2.106
+ nvidia-cusolver-cu12==11.4.5.107
+ nvidia-cusparse-cu12==12.1.0.106
+ nvidia-nccl-cu12==2.20.5
+ nvidia-nvjitlink-cu12==12.6.20
+ nvidia-nvtx-cu12==12.1.105
+ oauthlib==3.2.0
+ omegaconf==2.3.0
+ overrides==7.7.0
+ packaging==24.1
+ pandas==2.2.2
+ pandocfilters==1.5.1
+ parso==0.8.4
+ pdbpp==0.10.3
+ pexpect==4.9.0
+ pillow==10.4.0
+ pip==24.2
+ platformdirs==4.2.2
+ prometheus_client==0.20.0
+ prompt_toolkit==3.0.47
+ protobuf==3.20.3
+ psutil==6.0.0
+ ptyprocess==0.7.0
+ pure_eval==0.2.3
+ pyarrow==17.0.0
+ pycparser==2.22
+ pynvml==11.5.3
+ pyparsing==2.4.7
+ pyrepl==0.9.0
+ python-apt==2.4.0+ubuntu3
+ python-dateutil==2.9.0.post0
+ python-json-logger==2.0.7
+ pytz==2024.1
+ pyzmq==24.0.1
+ referencing==0.35.1
+ regex==2024.7.24
+ requests==2.32.3
+ rfc3339-validator==0.1.4
+ rfc3986-validator==0.1.1
+ rouge_score==0.1.2
+ rpds-py==0.20.0
+ safetensors==0.4.4
+ sentencepiece==0.2.0
+ sentry-sdk==2.13.0
+ setproctitle==1.3.3
+ setuptools==73.0.1
+ six==1.16.0
+ smmap==5.0.1
+ sniffio==1.3.1
+ soupsieve==2.6
+ stack-data==0.6.3
+ sympy==1.13.2
+ terminado==0.18.1
+ tinycss2==1.3.0
+ tokenizers==0.19.1
+ torch==2.4.0
+ torchaudio==2.4.0
+ torchvision==0.19.0
+ tornado==6.4.1
+ tqdm==4.66.5
+ traitlets==5.14.3
+ transformers==4.44.2
+ triton==3.0.0
+ types-python-dateutil==2.9.0.20240821
+ typing_extensions==4.12.2
+ tzdata==2024.1
+ uri-template==1.3.0
+ urllib3==2.2.2
+ wadllib==1.3.6
+ wandb==0.17.8
+ wcwidth==0.2.13
+ webcolors==24.8.0
+ webencodings==0.5.1
+ websocket-client==1.8.0
+ wheel==0.44.0
+ widgetsnbextension==4.0.11
+ wmctrl==0.5
+ xxhash==3.5.0
+ yarl==1.9.7
+ zipp==1.0.0
checkpoints/wandb/run-20240902_170304-v43qltex/files/wandb-metadata.json ADDED
@@ -0,0 +1,527 @@
+ {
+ "os": "Linux-5.4.0-171-generic-x86_64-with-glibc2.35",
+ "python": "3.11.9",
+ "heartbeatAt": "2024-09-02T17:03:05.625568",
+ "startedAt": "2024-09-02T17:03:04.729800",
+ "docker": null,
+ "cuda": null,
+ "args": [],
+ "state": "running",
+ "program": "-m nanoT5.main",
+ "codePathLocal": null,
+ "git": {
+ "remote": "https://github.com/pszemraj/nanoT5.git",
+ "commit": "7e55b4db2270303afebba4e0d389b68979943c0c"
+ },
+ "email": null,
+ "root": "/workspace/nanoT5",
+ "host": "f8d7d6f6310f",
+ "username": "root",
+ "executable": "/usr/bin/python",
+ "cpu_count": 48,
+ "cpu_count_logical": 96,
+ "cpu_freq": {
+ "current": 1001.9331562499998,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ "cpu_freq_per_core": [
+ {
+ "current": 900.0,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 799.402,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 800.0,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 799.398,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 800.0,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 1037.385,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 800.0,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 872.196,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 800.0,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 1072.759,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 882.854,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 823.861,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 823.212,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 800.0,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 799.39,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 823.065,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 1398.392,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 799.401,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 800.0,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 824.357,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 800.0,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 2022.695,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 3300.0,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 800.0,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 823.663,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 800.0,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 804.641,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 873.253,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 872.742,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 800.0,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 800.0,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 799.845,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 799.422,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 800.0,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 900.0,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 799.957,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 1100.0,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 3351.928,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 800.0,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 799.426,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 870.565,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 800.192,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 800.057,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 2799.997,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 800.0,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 959.262,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 2801.291,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 799.425,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 900.0,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 800.683,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 800.0,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 800.697,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 800.876,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 800.0,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 800.0,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 800.0,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 800.0,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 800.0,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 800.741,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 800.0,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 800.0,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 800.0,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 800.0,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 800.0,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 942.364,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 800.344,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 799.272,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 800.0,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 800.0,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 3300.0,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 3304.817,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 800.103,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 800.363,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 800.727,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 900.0,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 900.0,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 900.0,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 811.831,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 799.938,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 800.0,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 801.226,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 799.947,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 900.0,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 800.0,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 980.682,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 3308.926,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 801.074,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 800.709,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 804.122,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 801.051,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 805.622,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 2800.0,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 799.951,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 800.0,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 2802.488,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 801.049,
+ "min": 800.0,
+ "max": 2801.0
+ }
+ ],
+ "disk": {
+ "/": {
+ "total": 200.0,
+ "used": 1.4021186828613281
+ }
+ },
+ "gpu": "NVIDIA A40",
+ "gpu_count": 1,
+ "gpu_devices": [
+ {
+ "name": "NVIDIA A40",
+ "memory_total": 48305799168
+ }
+ ],
+ "memory": {
+ "total": 503.5313262939453
+ }
+ }
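The hardware snapshot above (cpu_count, per-core cpu_freq, memory, disk) is the kind of data psutil exposes. A rough sketch that mirrors those fields, not wandb's actual collector:

```python
# Rough sketch mirroring the metadata fields above with psutil; wandb's own
# collection code differs, this only reproduces the same values.
import psutil

print(psutil.cpu_count(logical=False))        # "cpu_count": 48
print(psutil.cpu_count(logical=True))         # "cpu_count_logical": 96
print(psutil.cpu_freq(percpu=True)[0])        # one "cpu_freq_per_core" entry (MHz)
print(psutil.virtual_memory().total / 2**30)  # "memory": {"total": ...} in GiB
```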
checkpoints/wandb/run-20240902_170304-v43qltex/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
+ {"train/loss": 42.73570467710495, "train/grad_l2": 1292.6591796875, "train/weights_l2": 7206.464049045997, "train/lr": 0.019987049260593165, "train/seconds_per_step": 3.309874153137207, "_timestamp": 1725333143.7322862, "_runtime": 36558.970363140106, "_step": 10900, "_wandb": {"runtime": 36838}}
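The summary freezes the run's final logged step, i.e. the diverged state at step 10900 rather than the best one. A minimal sketch for reading it back (the relative path assumes the repo layout shown in this commit):

```python
# Minimal sketch: read the final run summary shown above.
import json

path = "checkpoints/wandb/run-20240902_170304-v43qltex/files/wandb-summary.json"
with open(path) as f:
    summary = json.load(f)

# Prints: 10900 42.73570467710495 1292.6591796875
print(summary["_step"], summary["train/loss"], summary["train/grad_l2"])
```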
checkpoints/wandb/run-20240902_170304-v43qltex/logs/debug-internal.log ADDED
The diff for this file is too large to render. See raw diff
 
checkpoints/wandb/run-20240902_170304-v43qltex/logs/debug.log ADDED
@@ -0,0 +1,28 @@
+ 2024-09-02 17:03:04,741 INFO MainThread:6499 [wandb_setup.py:_flush():77] Current SDK version is 0.17.8
+ 2024-09-02 17:03:04,741 INFO MainThread:6499 [wandb_setup.py:_flush():77] Configure stats pid to 6499
+ 2024-09-02 17:03:04,741 INFO MainThread:6499 [wandb_setup.py:_flush():77] Loading settings from /root/.config/wandb/settings
+ 2024-09-02 17:03:04,741 INFO MainThread:6499 [wandb_setup.py:_flush():77] Loading settings from /workspace/nanoT5/logs/2024-09-02/17-03-02/wandb/settings
+ 2024-09-02 17:03:04,742 INFO MainThread:6499 [wandb_setup.py:_flush():77] Loading settings from environment variables: {}
+ 2024-09-02 17:03:04,742 INFO MainThread:6499 [wandb_setup.py:_flush():77] Applying setup settings: {'_disable_service': False}
+ 2024-09-02 17:03:04,742 WARNING MainThread:6499 [wandb_setup.py:_flush():77] Could not find program at -m nanoT5.main
+ 2024-09-02 17:03:04,743 INFO MainThread:6499 [wandb_setup.py:_flush():77] Inferring run settings from compute environment: {'program_relpath': None, 'program': '-m nanoT5.main'}
+ 2024-09-02 17:03:04,743 INFO MainThread:6499 [wandb_setup.py:_flush():77] Applying login settings: {}
+ 2024-09-02 17:03:04,743 INFO MainThread:6499 [wandb_init.py:_log_setup():524] Logging user logs to /workspace/nanoT5/logs/2024-09-02/17-03-02/wandb/run-20240902_170304-v43qltex/logs/debug.log
+ 2024-09-02 17:03:04,744 INFO MainThread:6499 [wandb_init.py:_log_setup():525] Logging internal logs to /workspace/nanoT5/logs/2024-09-02/17-03-02/wandb/run-20240902_170304-v43qltex/logs/debug-internal.log
+ 2024-09-02 17:03:04,744 INFO MainThread:6499 [wandb_init.py:init():607] calling init triggers
+ 2024-09-02 17:03:04,744 INFO MainThread:6499 [wandb_init.py:init():614] wandb.init called with sweep_config: {}
+ config: {'mode': 'pt', 'device': 'gpu', 'precision': 'bf16', 'eval_only': False, 'predict_only': False, 'seed': 2137, 'tokenizer': {'name': 'BEE-spoke-data/slimpajama_tok-48128-BPE-forT5'}, 'model': {'klass': 'custom_seq2seq', 'name': 'google/t5-v1_1-base', 'overwrite': None, 'add_config': None, 'checkpoint_path': '', 'random_init': True, 'compile': True}, 'data': {'input_length': 1024, 'mlm_probability': 0.15, 'mean_noise_span_length': 3.0, 'num_workers': 8}, 'optim': {'name': 'adamwscale', 'base_lr': 0.02, 'batch_size': 64, 'total_steps': 65536, 'epochs': -1, 'warmup_steps': 10000, 'lr_scheduler': 'cosine', 'weight_decay': 0.001, 'grad_clip': 1.0, 'grad_acc': 4, 'final_cosine': 1e-05}, 'eval': {'every_steps': 100000, 'steps': 500}, 'checkpoint': {'every_steps': 5000}, 'logging': {'every_steps': 100, 'grad_l2': True, 'weights_l2': True, 'use_wandb': True, 'wandb_config': {'project': 'nano-custom-seq2seq', 'entity': 'amazingvince', 'tags': ['nanoT5', 'my_tag'], 'mode': 'online'}}, 'slurm_id': 'none', 'working_dir': '/workspace/nanoT5/logs/2024-09-02/17-03-02'}
+ 2024-09-02 17:03:04,745 INFO MainThread:6499 [wandb_init.py:init():657] starting backend
+ 2024-09-02 17:03:04,745 INFO MainThread:6499 [wandb_init.py:init():661] setting up manager
+ 2024-09-02 17:03:04,760 INFO MainThread:6499 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+ 2024-09-02 17:03:04,761 INFO MainThread:6499 [wandb_init.py:init():669] backend started and connected
+ 2024-09-02 17:03:04,776 INFO MainThread:6499 [wandb_init.py:init():767] updated telemetry
+ 2024-09-02 17:03:04,819 INFO MainThread:6499 [wandb_init.py:init():800] communicating run to backend with 90.0 second timeout
+ 2024-09-02 17:03:05,519 INFO MainThread:6499 [wandb_init.py:init():851] starting run threads in backend
+ 2024-09-02 17:03:05,817 INFO MainThread:6499 [wandb_run.py:_console_start():2463] atexit reg
+ 2024-09-02 17:03:05,818 INFO MainThread:6499 [wandb_run.py:_redirect():2309] redirect: wrap_raw
+ 2024-09-02 17:03:05,819 INFO MainThread:6499 [wandb_run.py:_redirect():2374] Wrapping output streams.
+ 2024-09-02 17:03:05,819 INFO MainThread:6499 [wandb_run.py:_redirect():2399] Redirects installed.
+ 2024-09-02 17:03:05,822 INFO MainThread:6499 [wandb_init.py:init():894] run started, returning control to user process
+ 2024-09-02 17:03:35,512 INFO MainThread:6499 [wandb_run.py:_config_callback():1392] config_cb None None {'mode': 'pt', 'device': 'gpu', 'precision': 'bf16', 'eval_only': False, 'predict_only': False, 'seed': 2137, 'tokenizer': {'name': 'BEE-spoke-data/slimpajama_tok-48128-BPE-forT5'}, 'model': {'klass': 'custom_seq2seq', 'name': 'google/t5-v1_1-base', 'overwrite': None, 'add_config': None, 'checkpoint_path': '', 'random_init': True, 'compile': True}, 'data': {'input_length': 1024, 'mlm_probability': 0.15, 'mean_noise_span_length': 3.0, 'num_workers': 8, 'before_mask_input_length': 1137, 'target_length': 229}, 'optim': {'name': 'adamwscale', 'base_lr': 0.02, 'batch_size': 64, 'total_steps': 65536, 'epochs': -1, 'warmup_steps': 10000, 'lr_scheduler': 'cosine', 'weight_decay': 0.001, 'grad_clip': 1.0, 'grad_acc': 4, 'final_cosine': 1e-05}, 'eval': {'every_steps': 100000, 'steps': 500, 'corrected_steps': 500}, 'checkpoint': {'every_steps': 5000}, 'logging': {'every_steps': 100, 'grad_l2': True, 'weights_l2': True, 'use_wandb': True, 'wandb_config': {'project': 'nano-custom-seq2seq', 'entity': 'amazingvince', 'tags': ['nanoT5', 'my_tag'], 'mode': 'online'}}, 'slurm_id': 'none', 'working_dir': '/workspace/nanoT5/logs/2024-09-02/17-03-02', 'n_all_param': 673076736}
+ 2024-09-03 03:17:10,763 WARNING MsgRouterThr:6499 [router.py:message_loop():77] message_loop has been closed
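The debug log records the effective wandb settings and run config. A hedged reconstruction of the equivalent wandb.init call, not the nanoT5 source: project/entity/tags/mode come from the logged wandb_config, and the config dict is deliberately truncated to a few of the logged keys.

```python
# Hedged reconstruction of the init call the debug log implies; values are
# taken from the logged config above, everything else is an assumption.
import wandb

run = wandb.init(
    project="nano-custom-seq2seq",
    entity="amazingvince",
    tags=["nanoT5", "my_tag"],
    mode="online",
    config={"seed": 2137, "optim": {"base_lr": 0.02, "total_steps": 65536}},
)
run.finish()
```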
checkpoints/wandb/run-20240902_170304-v43qltex/run-v43qltex.wandb ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c992f1fa0f2152d9b2336fede3ebcc8b26d60d54a1945da3d2cecab44ce3ab70
+ size 4157001
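The .wandb file itself is stored as the Git LFS pointer above (oid and size, not the bytes). A short sketch for verifying a downloaded copy against the pointer; the local filename is an assumption, while oid and size come from the pointer file:

```python
# Sketch: verify a downloaded copy of run-v43qltex.wandb against the LFS
# pointer above; oid and size come from the pointer, the path is assumed.
import hashlib
from pathlib import Path

blob = Path("run-v43qltex.wandb").read_bytes()
assert len(blob) == 4157001, "size mismatch"
assert hashlib.sha256(blob).hexdigest() == (
    "c992f1fa0f2152d9b2336fede3ebcc8b26d60d54a1945da3d2cecab44ce3ab70"
), "sha256 mismatch"
print("LFS object verified")
```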