Upload folder using huggingface_hub
- .gitattributes +1 -0
- checkpoints/.hydra/config.yaml +50 -0
- checkpoints/.hydra/hydra.yaml +156 -0
- checkpoints/.hydra/overrides.yaml +1 -0
- checkpoints/checkpoint-pt-10000/model.safetensors +3 -0
- checkpoints/checkpoint-pt-10000/random_states_0.pkl +3 -0
- checkpoints/checkpoint-pt-5000/model.safetensors +3 -0
- checkpoints/checkpoint-pt-5000/random_states_0.pkl +3 -0
- checkpoints/config.json +26 -0
- checkpoints/main.log +135 -0
- checkpoints/wandb/debug-internal.log +0 -0
- checkpoints/wandb/debug.log +28 -0
- checkpoints/wandb/run-20240902_170304-v43qltex/files/config.yaml +132 -0
- checkpoints/wandb/run-20240902_170304-v43qltex/files/output.log +253 -0
- checkpoints/wandb/run-20240902_170304-v43qltex/files/requirements.txt +195 -0
- checkpoints/wandb/run-20240902_170304-v43qltex/files/wandb-metadata.json +527 -0
- checkpoints/wandb/run-20240902_170304-v43qltex/files/wandb-summary.json +1 -0
- checkpoints/wandb/run-20240902_170304-v43qltex/logs/debug-internal.log +0 -0
- checkpoints/wandb/run-20240902_170304-v43qltex/logs/debug.log +28 -0
- checkpoints/wandb/run-20240902_170304-v43qltex/run-v43qltex.wandb +3 -0
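For reference, a commit like this one is typically created with `huggingface_hub`'s `upload_folder` API. A minimal sketch (the `repo_id` below is hypothetical; the actual target repo is not shown in this view):

```python
from huggingface_hub import HfApi

api = HfApi()  # assumes a valid HF token is configured, e.g. via `huggingface-cli login`

# Hypothetical repo_id for illustration only; the target repo is not shown here.
api.upload_folder(
    folder_path="checkpoints",
    path_in_repo="checkpoints",
    repo_id="user/repo",
    repo_type="model",
    commit_message="Upload folder using huggingface_hub",
)
```

The `.wandb` run file is matched by the new `.gitattributes` rule below, so it is stored via Git LFS.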
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+checkpoints/wandb/run-20240902_170304-v43qltex/run-v43qltex.wandb filter=lfs diff=lfs merge=lfs -text
checkpoints/.hydra/config.yaml ADDED
@@ -0,0 +1,50 @@
+mode: pt
+device: gpu
+precision: bf16
+eval_only: false
+predict_only: false
+seed: 2137
+tokenizer:
+  name: BEE-spoke-data/slimpajama_tok-48128-BPE-forT5
+model:
+  klass: custom_seq2seq
+  name: google/t5-v1_1-base
+  overwrite: null
+  add_config: null
+  checkpoint_path: ''
+  random_init: true
+  compile: true
+data:
+  input_length: 1024
+  mlm_probability: 0.15
+  mean_noise_span_length: 3.0
+  num_workers: 8
+optim:
+  name: adamwscale
+  base_lr: 0.02
+  batch_size: 64
+  total_steps: 65536
+  epochs: -1
+  warmup_steps: 10000
+  lr_scheduler: cosine
+  weight_decay: 0.001
+  grad_clip: 1.0
+  grad_acc: 4
+  final_cosine: 1.0e-05
+eval:
+  every_steps: 100000
+  steps: 500
+checkpoint:
+  every_steps: 5000
+logging:
+  every_steps: 100
+  grad_l2: true
+  weights_l2: true
+  use_wandb: true
+  wandb_config:
+    project: nano-custom-seq2seq
+    entity: amazingvince
+    tags:
+    - nanoT5
+    - my_tag
+    mode: online
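This resolved run config can be reloaded directly, e.g. to reproduce the training settings. A minimal sketch, assuming `omegaconf` (the library behind Hydra's config files) is installed:

```python
from omegaconf import OmegaConf

# Load the resolved training config that Hydra saved alongside the run.
cfg = OmegaConf.load("checkpoints/.hydra/config.yaml")

# Effective batch size is per-step batch size times gradient accumulation.
print(cfg.optim.batch_size * cfg.optim.grad_acc)  # 64 * 4 = 256
print(cfg.optim.base_lr, cfg.optim.total_steps)   # 0.02, 65536
```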
checkpoints/.hydra/hydra.yaml ADDED
@@ -0,0 +1,156 @@
+hydra:
+  run:
+    dir: ./logs/${now:%Y-%m-%d}/${now:%H-%M-%S}
+  sweep:
+    dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
+    subdir: ${hydra.job.num}
+  launcher:
+    _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
+  sweeper:
+    _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
+    max_batch_size: null
+    params: null
+  help:
+    app_name: ${hydra.job.name}
+    header: '${hydra.help.app_name} is powered by Hydra.
+
+      '
+    footer: 'Powered by Hydra (https://hydra.cc)
+
+      Use --hydra-help to view Hydra specific help
+
+      '
+    template: '${hydra.help.header}
+
+      == Configuration groups ==
+
+      Compose your configuration from those groups (group=option)
+
+
+      $APP_CONFIG_GROUPS
+
+
+      == Config ==
+
+      Override anything in the config (foo.bar=value)
+
+
+      $CONFIG
+
+
+      ${hydra.help.footer}
+
+      '
+  hydra_help:
+    template: 'Hydra (${hydra.runtime.version})
+
+      See https://hydra.cc for more info.
+
+
+      == Flags ==
+
+      $FLAGS_HELP
+
+
+      == Configuration groups ==
+
+      Compose your configuration from those groups (For example, append hydra/job_logging=disabled
+      to command line)
+
+
+      $HYDRA_CONFIG_GROUPS
+
+
+      Use ''--cfg hydra'' to Show the Hydra config.
+
+      '
+    hydra_help: ???
+  hydra_logging:
+    version: 1
+    formatters:
+      simple:
+        format: '[%(asctime)s][HYDRA] %(message)s'
+    handlers:
+      console:
+        class: logging.StreamHandler
+        formatter: simple
+        stream: ext://sys.stdout
+    root:
+      level: INFO
+      handlers:
+      - console
+    loggers:
+      logging_example:
+        level: DEBUG
+    disable_existing_loggers: false
+  job_logging:
+    version: 1
+    formatters:
+      simple:
+        format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
+    handlers:
+      console:
+        class: logging.StreamHandler
+        formatter: simple
+        stream: ext://sys.stdout
+      file:
+        class: logging.FileHandler
+        formatter: simple
+        filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
+    root:
+      level: INFO
+      handlers:
+      - console
+      - file
+    disable_existing_loggers: false
+  env: {}
+  mode: RUN
+  searchpath: []
+  callbacks: {}
+  output_subdir: .hydra
+  overrides:
+    hydra:
+    - hydra.mode=RUN
+    task: []
+  job:
+    name: main
+    chdir: true
+    override_dirname: ''
+    id: ???
+    num: ???
+    config_name: default
+    env_set: {}
+    env_copy: []
+    config:
+      override_dirname:
+        kv_sep: '='
+        item_sep: ','
+        exclude_keys: []
+  runtime:
+    version: 1.3.2
+    version_base: '1.1'
+    cwd: /workspace/nanoT5
+    config_sources:
+    - path: hydra.conf
+      schema: pkg
+      provider: hydra
+    - path: /workspace/nanoT5/nanoT5/configs
+      schema: file
+      provider: main
+    - path: ''
+      schema: structured
+      provider: schema
+    output_dir: /workspace/nanoT5/logs/2024-09-02/17-03-02
+    choices:
+      local_env: default
+      task: pt
+      hydra/env: default
+      hydra/callbacks: null
+      hydra/job_logging: default
+      hydra/hydra_logging: default
+      hydra/hydra_help: default
+      hydra/help: default
+      hydra/sweeper: basic
+      hydra/launcher: basic
+      hydra/output: default
+  verbose: false
checkpoints/.hydra/overrides.yaml ADDED
@@ -0,0 +1 @@
+[]
checkpoints/checkpoint-pt-10000/model.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:de209ea75162ba234b5a0ac46f2434ad29c106e4770c8c587eba8ac390f7fede
+size 2692370584
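The pointer above resolves to a ~2.7 GB safetensors file. A minimal sketch for inspecting the saved weights, assuming `safetensors` and `torch` are installed and the LFS file has been pulled:

```python
from safetensors.torch import load_file

# Loads all tensors to CPU by default.
state_dict = load_file("checkpoints/checkpoint-pt-10000/model.safetensors")
print(len(state_dict), "tensors")

# Note: the training log below shows the shared tensor 'lm_head.weight' was
# removed at save time, so if the model ties lm_head to the input embedding,
# re-tie (or copy) that weight after loading.
```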
checkpoints/checkpoint-pt-10000/random_states_0.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:50b14a8e8e4cb1f87530bb13452da585006a1a54e1fa02069afa73d0775f0736
+size 14344
checkpoints/checkpoint-pt-5000/model.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a7134d1e0be9b58fd378c0b681679ae41855daa9cbb0dc80b24e826ef59861ce
+size 2692370584
checkpoints/checkpoint-pt-5000/random_states_0.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:50b14a8e8e4cb1f87530bb13452da585006a1a54e1fa02069afa73d0775f0736
+size 14344
checkpoints/config.json ADDED
@@ -0,0 +1,26 @@
+{
+  "attention_probs_dropout_prob": 0.0,
+  "bos_token_id": 1,
+  "decoder_start_token_id": 3,
+  "eos_token_id": 2,
+  "head_dim": 64,
+  "hidden_act": "silu",
+  "hidden_dropout_prob": 0.0,
+  "hidden_size": 1024,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 1024,
+  "num_attention_heads": 16,
+  "num_decoder_layers": 32,
+  "num_encoder_layers": 16,
+  "num_key_value_heads": 4,
+  "pad_token_id": 3,
+  "rotary_emb_base": 10000.0,
+  "rotary_emb_dim": 32,
+  "rotary_emb_interleaved": false,
+  "rotary_emb_scale_base": null,
+  "transformers_version": "4.44.2",
+  "use_cache": true,
+  "vocab_size": 48256
+}
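Since `model.klass` is `custom_seq2seq`, this `config.json` may not load through a stock `transformers` class, but the hyperparameters can be inspected with the standard library. A minimal sketch:

```python
import json

# Read the architecture hyperparameters saved with the run.
with open("checkpoints/config.json") as f:
    cfg = json.load(f)

# 16 encoder layers + 32 decoder layers, d_model 1024, GQA with 4 KV heads.
print(cfg["num_encoder_layers"], cfg["num_decoder_layers"])
print(cfg["hidden_size"], cfg["num_attention_heads"], cfg["num_key_value_heads"])
```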
checkpoints/main.log ADDED
@@ -0,0 +1,135 @@
+[2024-09-02 17:03:02,219][accelerate.utils.other][WARNING] - Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
+[2024-09-02 17:03:02,226][Main][INFO] - Distributed environment: DistributedType.NO
+Num processes: 1
+Process index: 0
+Local process index: 0
+Device: cuda
+
+Mixed precision type: bf16
+
+[2024-09-02 17:03:02,227][Main][INFO] - Working directory is /workspace/nanoT5/logs/2024-09-02/17-03-02
+[2024-09-02 17:14:53,691][Main][INFO] - [train] Step 100 out of 65536 | Loss --> 51.971 | Grad_l2 --> 82.676 | Weights_l2 --> 7042.062 | Lr --> 0.010 | Seconds_per_step --> 6.760 |
+[2024-09-02 17:20:23,699][Main][INFO] - [train] Step 200 out of 65536 | Loss --> 14.150 | Grad_l2 --> 19.390 | Weights_l2 --> 7034.376 | Lr --> 0.010 | Seconds_per_step --> 3.300 |
+[2024-09-02 17:25:54,840][Main][INFO] - [train] Step 300 out of 65536 | Loss --> 9.006 | Grad_l2 --> 9.061 | Weights_l2 --> 7026.824 | Lr --> 0.010 | Seconds_per_step --> 3.311 |
+[2024-09-02 17:31:26,095][Main][INFO] - [train] Step 400 out of 65536 | Loss --> 7.529 | Grad_l2 --> 5.889 | Weights_l2 --> 7019.014 | Lr --> 0.010 | Seconds_per_step --> 3.313 |
+[2024-09-02 17:36:56,190][Main][INFO] - [train] Step 500 out of 65536 | Loss --> 6.618 | Grad_l2 --> 4.039 | Weights_l2 --> 7010.897 | Lr --> 0.011 | Seconds_per_step --> 3.301 |
+[2024-09-02 17:42:27,693][Main][INFO] - [train] Step 600 out of 65536 | Loss --> 5.994 | Grad_l2 --> 2.962 | Weights_l2 --> 7002.549 | Lr --> 0.011 | Seconds_per_step --> 3.315 |
+[2024-09-02 17:47:57,967][Main][INFO] - [train] Step 700 out of 65536 | Loss --> 5.703 | Grad_l2 --> 2.434 | Weights_l2 --> 6994.267 | Lr --> 0.011 | Seconds_per_step --> 3.303 |
+[2024-09-02 17:53:29,228][Main][INFO] - [train] Step 800 out of 65536 | Loss --> 6.603 | Grad_l2 --> 6.221 | Weights_l2 --> 6985.927 | Lr --> 0.011 | Seconds_per_step --> 3.313 |
+[2024-09-02 17:59:00,011][Main][INFO] - [train] Step 900 out of 65536 | Loss --> 5.408 | Grad_l2 --> 1.465 | Weights_l2 --> 6980.026 | Lr --> 0.011 | Seconds_per_step --> 3.308 |
+[2024-09-02 18:04:30,275][Main][INFO] - [train] Step 1000 out of 65536 | Loss --> 5.311 | Grad_l2 --> 0.992 | Weights_l2 --> 6975.109 | Lr --> 0.011 | Seconds_per_step --> 3.303 |
+[2024-09-02 18:10:01,468][Main][INFO] - [train] Step 1100 out of 65536 | Loss --> 5.241 | Grad_l2 --> 0.854 | Weights_l2 --> 6970.708 | Lr --> 0.011 | Seconds_per_step --> 3.312 |
+[2024-09-02 18:15:33,362][Main][INFO] - [train] Step 1200 out of 65536 | Loss --> 5.180 | Grad_l2 --> 0.838 | Weights_l2 --> 6966.641 | Lr --> 0.011 | Seconds_per_step --> 3.319 |
+[2024-09-02 18:21:03,902][Main][INFO] - [train] Step 1300 out of 65536 | Loss --> 5.126 | Grad_l2 --> 0.764 | Weights_l2 --> 6962.789 | Lr --> 0.011 | Seconds_per_step --> 3.305 |
+[2024-09-02 18:26:35,349][Main][INFO] - [train] Step 1400 out of 65536 | Loss --> 5.088 | Grad_l2 --> 0.744 | Weights_l2 --> 6959.146 | Lr --> 0.011 | Seconds_per_step --> 3.314 |
+[2024-09-02 18:32:06,048][Main][INFO] - [train] Step 1500 out of 65536 | Loss --> 5.046 | Grad_l2 --> 0.702 | Weights_l2 --> 6955.673 | Lr --> 0.012 | Seconds_per_step --> 3.307 |
+[2024-09-02 18:37:37,903][Main][INFO] - [train] Step 1600 out of 65536 | Loss --> 5.007 | Grad_l2 --> 0.691 | Weights_l2 --> 6952.523 | Lr --> 0.012 | Seconds_per_step --> 3.319 |
+[2024-09-02 18:43:09,723][Main][INFO] - [train] Step 1700 out of 65536 | Loss --> 4.973 | Grad_l2 --> 0.673 | Weights_l2 --> 6949.412 | Lr --> 0.012 | Seconds_per_step --> 3.318 |
+[2024-09-02 18:48:40,909][Main][INFO] - [train] Step 1800 out of 65536 | Loss --> 4.943 | Grad_l2 --> 0.671 | Weights_l2 --> 6946.498 | Lr --> 0.012 | Seconds_per_step --> 3.312 |
+[2024-09-02 18:54:13,524][Main][INFO] - [train] Step 1900 out of 65536 | Loss --> 4.929 | Grad_l2 --> 0.668 | Weights_l2 --> 6943.795 | Lr --> 0.012 | Seconds_per_step --> 3.326 |
+[2024-09-02 18:59:45,500][Main][INFO] - [train] Step 2000 out of 65536 | Loss --> 4.894 | Grad_l2 --> 0.665 | Weights_l2 --> 6941.241 | Lr --> 0.012 | Seconds_per_step --> 3.320 |
+[2024-09-02 19:05:16,395][Main][INFO] - [train] Step 2100 out of 65536 | Loss --> 4.881 | Grad_l2 --> 0.713 | Weights_l2 --> 6938.861 | Lr --> 0.012 | Seconds_per_step --> 3.309 |
+[2024-09-02 19:10:48,520][Main][INFO] - [train] Step 2200 out of 65536 | Loss --> 4.853 | Grad_l2 --> 0.653 | Weights_l2 --> 6936.551 | Lr --> 0.012 | Seconds_per_step --> 3.321 |
+[2024-09-02 19:16:19,278][Main][INFO] - [train] Step 2300 out of 65536 | Loss --> 4.829 | Grad_l2 --> 0.646 | Weights_l2 --> 6934.357 | Lr --> 0.012 | Seconds_per_step --> 3.308 |
+[2024-09-02 19:21:51,370][Main][INFO] - [train] Step 2400 out of 65536 | Loss --> 4.790 | Grad_l2 --> 0.620 | Weights_l2 --> 6932.338 | Lr --> 0.012 | Seconds_per_step --> 3.321 |
+[2024-09-02 19:27:23,544][Main][INFO] - [train] Step 2500 out of 65536 | Loss --> 4.784 | Grad_l2 --> 0.643 | Weights_l2 --> 6930.395 | Lr --> 0.013 | Seconds_per_step --> 3.322 |
+[2024-09-02 19:32:54,341][Main][INFO] - [train] Step 2600 out of 65536 | Loss --> 4.755 | Grad_l2 --> 0.623 | Weights_l2 --> 6928.543 | Lr --> 0.013 | Seconds_per_step --> 3.308 |
+[2024-09-02 19:38:25,942][Main][INFO] - [train] Step 2700 out of 65536 | Loss --> 4.743 | Grad_l2 --> 0.636 | Weights_l2 --> 6926.944 | Lr --> 0.013 | Seconds_per_step --> 3.316 |
+[2024-09-02 19:43:57,708][Main][INFO] - [train] Step 2800 out of 65536 | Loss --> 4.722 | Grad_l2 --> 0.590 | Weights_l2 --> 6925.379 | Lr --> 0.013 | Seconds_per_step --> 3.318 |
+[2024-09-02 19:49:28,285][Main][INFO] - [train] Step 2900 out of 65536 | Loss --> 4.715 | Grad_l2 --> 0.622 | Weights_l2 --> 6924.007 | Lr --> 0.013 | Seconds_per_step --> 3.306 |
+[2024-09-02 19:54:59,957][Main][INFO] - [train] Step 3000 out of 65536 | Loss --> 4.694 | Grad_l2 --> 0.652 | Weights_l2 --> 6922.709 | Lr --> 0.013 | Seconds_per_step --> 3.317 |
+[2024-09-02 20:00:31,072][Main][INFO] - [train] Step 3100 out of 65536 | Loss --> 4.678 | Grad_l2 --> 0.614 | Weights_l2 --> 6921.561 | Lr --> 0.013 | Seconds_per_step --> 3.311 |
+[2024-09-02 20:06:02,747][Main][INFO] - [train] Step 3200 out of 65536 | Loss --> 4.633 | Grad_l2 --> 0.610 | Weights_l2 --> 6920.463 | Lr --> 0.013 | Seconds_per_step --> 3.317 |
+[2024-09-02 20:11:34,607][Main][INFO] - [train] Step 3300 out of 65536 | Loss --> 4.599 | Grad_l2 --> 0.638 | Weights_l2 --> 6919.642 | Lr --> 0.013 | Seconds_per_step --> 3.319 |
+[2024-09-02 20:17:05,731][Main][INFO] - [train] Step 3400 out of 65536 | Loss --> 4.549 | Grad_l2 --> 0.774 | Weights_l2 --> 6919.263 | Lr --> 0.013 | Seconds_per_step --> 3.311 |
+[2024-09-02 20:22:37,601][Main][INFO] - [train] Step 3500 out of 65536 | Loss --> 4.420 | Grad_l2 --> 0.934 | Weights_l2 --> 6918.974 | Lr --> 0.014 | Seconds_per_step --> 3.319 |
+[2024-09-02 20:28:09,554][Main][INFO] - [train] Step 3600 out of 65536 | Loss --> 4.256 | Grad_l2 --> 0.763 | Weights_l2 --> 6919.477 | Lr --> 0.014 | Seconds_per_step --> 3.319 |
+[2024-09-02 20:33:40,654][Main][INFO] - [train] Step 3700 out of 65536 | Loss --> 4.131 | Grad_l2 --> 0.657 | Weights_l2 --> 6920.705 | Lr --> 0.014 | Seconds_per_step --> 3.311 |
+[2024-09-02 20:39:13,064][Main][INFO] - [train] Step 3800 out of 65536 | Loss --> 4.021 | Grad_l2 --> 0.709 | Weights_l2 --> 6922.188 | Lr --> 0.014 | Seconds_per_step --> 3.324 |
+[2024-09-02 20:44:45,663][Main][INFO] - [train] Step 3900 out of 65536 | Loss --> 3.909 | Grad_l2 --> 0.637 | Weights_l2 --> 6923.666 | Lr --> 0.014 | Seconds_per_step --> 3.326 |
+[2024-09-02 20:50:16,811][Main][INFO] - [train] Step 4000 out of 65536 | Loss --> 3.855 | Grad_l2 --> 1.013 | Weights_l2 --> 6923.778 | Lr --> 0.014 | Seconds_per_step --> 3.311 |
+[2024-09-02 20:55:49,235][Main][INFO] - [train] Step 4100 out of 65536 | Loss --> 3.770 | Grad_l2 --> 0.589 | Weights_l2 --> 6925.545 | Lr --> 0.014 | Seconds_per_step --> 3.324 |
+[2024-09-02 21:01:20,500][Main][INFO] - [train] Step 4200 out of 65536 | Loss --> 3.710 | Grad_l2 --> 0.579 | Weights_l2 --> 6927.200 | Lr --> 0.014 | Seconds_per_step --> 3.313 |
+[2024-09-02 21:06:53,406][Main][INFO] - [train] Step 4300 out of 65536 | Loss --> 3.651 | Grad_l2 --> 0.588 | Weights_l2 --> 6928.842 | Lr --> 0.014 | Seconds_per_step --> 3.329 |
+[2024-09-02 21:12:26,298][Main][INFO] - [train] Step 4400 out of 65536 | Loss --> 3.614 | Grad_l2 --> 0.632 | Weights_l2 --> 6930.597 | Lr --> 0.014 | Seconds_per_step --> 3.329 |
+[2024-09-02 21:17:57,623][Main][INFO] - [train] Step 4500 out of 65536 | Loss --> 3.582 | Grad_l2 --> 0.884 | Weights_l2 --> 6931.569 | Lr --> 0.015 | Seconds_per_step --> 3.313 |
+[2024-09-02 21:23:30,116][Main][INFO] - [train] Step 4600 out of 65536 | Loss --> 3.527 | Grad_l2 --> 0.582 | Weights_l2 --> 6933.783 | Lr --> 0.015 | Seconds_per_step --> 3.325 |
+[2024-09-02 21:29:02,417][Main][INFO] - [train] Step 4700 out of 65536 | Loss --> 3.476 | Grad_l2 --> 0.549 | Weights_l2 --> 6935.959 | Lr --> 0.015 | Seconds_per_step --> 3.323 |
+[2024-09-02 21:34:33,535][Main][INFO] - [train] Step 4800 out of 65536 | Loss --> 3.430 | Grad_l2 --> 0.551 | Weights_l2 --> 6938.224 | Lr --> 0.015 | Seconds_per_step --> 3.311 |
+[2024-09-02 21:40:05,905][Main][INFO] - [train] Step 4900 out of 65536 | Loss --> 3.395 | Grad_l2 --> 0.550 | Weights_l2 --> 6940.617 | Lr --> 0.015 | Seconds_per_step --> 3.324 |
+[2024-09-02 21:45:36,944][Main][INFO] - [train] Step 5000 out of 65536 | Loss --> 3.366 | Grad_l2 --> 0.546 | Weights_l2 --> 6943.230 | Lr --> 0.015 | Seconds_per_step --> 3.310 |
+[2024-09-02 21:45:36,947][accelerate.accelerator][INFO] - Saving current state to checkpoint-pt-5000
+[2024-09-02 21:45:36,954][accelerate.utils.other][WARNING] - Removed shared tensor {'lm_head.weight'} while saving. This should be OK, but check by verifying that you don't receive any warning while reloading
+[2024-09-02 21:45:44,182][accelerate.checkpointing][INFO] - Model weights saved in checkpoint-pt-5000/model.safetensors
+[2024-09-02 21:45:54,822][accelerate.checkpointing][INFO] - Optimizer state saved in checkpoint-pt-5000/optimizer.bin
+[2024-09-02 21:45:54,827][accelerate.checkpointing][INFO] - Scheduler state saved in checkpoint-pt-5000/scheduler.bin
+[2024-09-02 21:45:54,828][accelerate.checkpointing][INFO] - Sampler state for dataloader 0 saved in checkpoint-pt-5000/sampler.bin
+[2024-09-02 21:45:54,829][accelerate.checkpointing][INFO] - Sampler state for dataloader 1 saved in checkpoint-pt-5000/sampler_1.bin
+[2024-09-02 21:45:54,835][accelerate.checkpointing][INFO] - Random states saved in checkpoint-pt-5000/random_states_0.pkl
+[2024-09-02 21:51:26,402][Main][INFO] - [train] Step 5100 out of 65536 | Loss --> 3.302 | Grad_l2 --> 0.541 | Weights_l2 --> 6946.278 | Lr --> 0.015 | Seconds_per_step --> 3.495 |
+[2024-09-02 21:56:58,321][Main][INFO] - [train] Step 5200 out of 65536 | Loss --> 3.248 | Grad_l2 --> 0.556 | Weights_l2 --> 6950.060 | Lr --> 0.015 | Seconds_per_step --> 3.319 |
+[2024-09-02 22:02:29,452][Main][INFO] - [train] Step 5300 out of 65536 | Loss --> 3.194 | Grad_l2 --> 0.566 | Weights_l2 --> 6954.461 | Lr --> 0.015 | Seconds_per_step --> 3.311 |
+[2024-09-02 22:08:01,594][Main][INFO] - [train] Step 5400 out of 65536 | Loss --> 3.144 | Grad_l2 --> 0.548 | Weights_l2 --> 6959.061 | Lr --> 0.015 | Seconds_per_step --> 3.321 |
+[2024-09-02 22:13:33,473][Main][INFO] - [train] Step 5500 out of 65536 | Loss --> 3.099 | Grad_l2 --> 0.546 | Weights_l2 --> 6963.676 | Lr --> 0.016 | Seconds_per_step --> 3.319 |
+[2024-09-02 22:19:04,763][Main][INFO] - [train] Step 5600 out of 65536 | Loss --> 3.044 | Grad_l2 --> 0.531 | Weights_l2 --> 6968.055 | Lr --> 0.016 | Seconds_per_step --> 3.313 |
+[2024-09-02 22:24:37,024][Main][INFO] - [train] Step 5700 out of 65536 | Loss --> 3.023 | Grad_l2 --> 0.528 | Weights_l2 --> 6972.595 | Lr --> 0.016 | Seconds_per_step --> 3.323 |
+[2024-09-02 22:30:08,010][Main][INFO] - [train] Step 5800 out of 65536 | Loss --> 2.999 | Grad_l2 --> 0.529 | Weights_l2 --> 6977.095 | Lr --> 0.016 | Seconds_per_step --> 3.310 |
+[2024-09-02 22:35:40,260][Main][INFO] - [train] Step 5900 out of 65536 | Loss --> 2.953 | Grad_l2 --> 0.516 | Weights_l2 --> 6981.522 | Lr --> 0.016 | Seconds_per_step --> 3.322 |
+[2024-09-02 22:41:12,494][Main][INFO] - [train] Step 6000 out of 65536 | Loss --> 2.924 | Grad_l2 --> 0.514 | Weights_l2 --> 6985.860 | Lr --> 0.016 | Seconds_per_step --> 3.322 |
+[2024-09-02 22:46:43,439][Main][INFO] - [train] Step 6100 out of 65536 | Loss --> 2.904 | Grad_l2 --> 0.500 | Weights_l2 --> 6990.209 | Lr --> 0.016 | Seconds_per_step --> 3.309 |
+[2024-09-02 22:52:15,361][Main][INFO] - [train] Step 6200 out of 65536 | Loss --> 2.885 | Grad_l2 --> 0.499 | Weights_l2 --> 6994.575 | Lr --> 0.016 | Seconds_per_step --> 3.319 |
+[2024-09-02 22:57:47,371][Main][INFO] - [train] Step 6300 out of 65536 | Loss --> 2.860 | Grad_l2 --> 0.496 | Weights_l2 --> 6998.855 | Lr --> 0.016 | Seconds_per_step --> 3.320 |
+[2024-09-02 23:03:18,243][Main][INFO] - [train] Step 6400 out of 65536 | Loss --> 2.828 | Grad_l2 --> 0.486 | Weights_l2 --> 7003.354 | Lr --> 0.016 | Seconds_per_step --> 3.309 |
+[2024-09-02 23:08:50,256][Main][INFO] - [train] Step 6500 out of 65536 | Loss --> 2.823 | Grad_l2 --> 0.491 | Weights_l2 --> 7007.772 | Lr --> 0.017 | Seconds_per_step --> 3.320 |
+[2024-09-02 23:14:21,254][Main][INFO] - [train] Step 6600 out of 65536 | Loss --> 2.801 | Grad_l2 --> 0.572 | Weights_l2 --> 7012.034 | Lr --> 0.017 | Seconds_per_step --> 3.310 |
+[2024-09-02 23:19:53,383][Main][INFO] - [train] Step 6700 out of 65536 | Loss --> 2.776 | Grad_l2 --> 0.473 | Weights_l2 --> 7016.624 | Lr --> 0.017 | Seconds_per_step --> 3.321 |
+[2024-09-02 23:25:25,894][Main][INFO] - [train] Step 6800 out of 65536 | Loss --> 2.764 | Grad_l2 --> 0.489 | Weights_l2 --> 7021.128 | Lr --> 0.017 | Seconds_per_step --> 3.325 |
+[2024-09-02 23:30:56,990][Main][INFO] - [train] Step 6900 out of 65536 | Loss --> 2.754 | Grad_l2 --> 0.467 | Weights_l2 --> 7025.909 | Lr --> 0.017 | Seconds_per_step --> 3.311 |
+[2024-09-02 23:36:28,837][Main][INFO] - [train] Step 7000 out of 65536 | Loss --> 2.716 | Grad_l2 --> 0.469 | Weights_l2 --> 7030.583 | Lr --> 0.017 | Seconds_per_step --> 3.318 |
+[2024-09-02 23:42:00,897][Main][INFO] - [train] Step 7100 out of 65536 | Loss --> 2.706 | Grad_l2 --> 0.470 | Weights_l2 --> 7035.338 | Lr --> 0.017 | Seconds_per_step --> 3.321 |
+[2024-09-02 23:47:31,913][Main][INFO] - [train] Step 7200 out of 65536 | Loss --> 2.685 | Grad_l2 --> 0.460 | Weights_l2 --> 7040.107 | Lr --> 0.017 | Seconds_per_step --> 3.310 |
+[2024-09-02 23:53:04,028][Main][INFO] - [train] Step 7300 out of 65536 | Loss --> 2.675 | Grad_l2 --> 0.462 | Weights_l2 --> 7044.921 | Lr --> 0.017 | Seconds_per_step --> 3.321 |
+[2024-09-02 23:58:35,224][Main][INFO] - [train] Step 7400 out of 65536 | Loss --> 2.670 | Grad_l2 --> 0.473 | Weights_l2 --> 7049.994 | Lr --> 0.017 | Seconds_per_step --> 3.312 |
+[2024-09-03 00:04:07,495][Main][INFO] - [train] Step 7500 out of 65536 | Loss --> 2.653 | Grad_l2 --> 0.452 | Weights_l2 --> 7055.123 | Lr --> 0.018 | Seconds_per_step --> 3.323 |
+[2024-09-03 00:09:39,687][Main][INFO] - [train] Step 7600 out of 65536 | Loss --> 2.644 | Grad_l2 --> 0.499 | Weights_l2 --> 7060.263 | Lr --> 0.018 | Seconds_per_step --> 3.322 |
+[2024-09-03 00:15:11,125][Main][INFO] - [train] Step 7700 out of 65536 | Loss --> 2.619 | Grad_l2 --> 0.451 | Weights_l2 --> 7065.593 | Lr --> 0.018 | Seconds_per_step --> 3.314 |
+[2024-09-03 00:20:43,656][Main][INFO] - [train] Step 7800 out of 65536 | Loss --> 2.611 | Grad_l2 --> 0.444 | Weights_l2 --> 7071.016 | Lr --> 0.018 | Seconds_per_step --> 3.325 |
+[2024-09-03 00:26:15,825][Main][INFO] - [train] Step 7900 out of 65536 | Loss --> 2.593 | Grad_l2 --> 0.444 | Weights_l2 --> 7076.338 | Lr --> 0.018 | Seconds_per_step --> 3.322 |
+[2024-09-03 00:31:46,986][Main][INFO] - [train] Step 8000 out of 65536 | Loss --> 2.591 | Grad_l2 --> 0.707 | Weights_l2 --> 7081.619 | Lr --> 0.018 | Seconds_per_step --> 3.312 |
+[2024-09-03 00:37:19,240][Main][INFO] - [train] Step 8100 out of 65536 | Loss --> 2.583 | Grad_l2 --> 0.504 | Weights_l2 --> 7087.303 | Lr --> 0.018 | Seconds_per_step --> 3.323 |
+[2024-09-03 00:42:50,497][Main][INFO] - [train] Step 8200 out of 65536 | Loss --> 2.572 | Grad_l2 --> 0.435 | Weights_l2 --> 7092.976 | Lr --> 0.018 | Seconds_per_step --> 3.313 |
+[2024-09-03 00:48:22,669][Main][INFO] - [train] Step 8300 out of 65536 | Loss --> 2.550 | Grad_l2 --> 0.444 | Weights_l2 --> 7098.242 | Lr --> 0.018 | Seconds_per_step --> 3.322 |
+[2024-09-03 00:53:54,859][Main][INFO] - [train] Step 8400 out of 65536 | Loss --> 2.533 | Grad_l2 --> 0.424 | Weights_l2 --> 7103.870 | Lr --> 0.018 | Seconds_per_step --> 3.322 |
+[2024-09-03 00:59:25,959][Main][INFO] - [train] Step 8500 out of 65536 | Loss --> 2.520 | Grad_l2 --> 0.415 | Weights_l2 --> 7109.426 | Lr --> 0.019 | Seconds_per_step --> 3.311 |
+[2024-09-03 01:04:58,102][Main][INFO] - [train] Step 8600 out of 65536 | Loss --> 2.512 | Grad_l2 --> 0.445 | Weights_l2 --> 7115.243 | Lr --> 0.019 | Seconds_per_step --> 3.321 |
+[2024-09-03 01:10:30,308][Main][INFO] - [train] Step 8700 out of 65536 | Loss --> 2.497 | Grad_l2 --> 0.416 | Weights_l2 --> 7120.917 | Lr --> 0.019 | Seconds_per_step --> 3.322 |
+[2024-09-03 01:16:01,412][Main][INFO] - [train] Step 8800 out of 65536 | Loss --> 2.503 | Grad_l2 --> 0.453 | Weights_l2 --> 7127.067 | Lr --> 0.019 | Seconds_per_step --> 3.311 |
+[2024-09-03 01:21:33,679][Main][INFO] - [train] Step 8900 out of 65536 | Loss --> 2.498 | Grad_l2 --> 0.519 | Weights_l2 --> 7133.268 | Lr --> 0.019 | Seconds_per_step --> 3.323 |
+[2024-09-03 01:27:05,633][Main][INFO] - [train] Step 9000 out of 65536 | Loss --> 2.480 | Grad_l2 --> 0.413 | Weights_l2 --> 7139.449 | Lr --> 0.019 | Seconds_per_step --> 3.320 |
+[2024-09-03 01:32:36,839][Main][INFO] - [train] Step 9100 out of 65536 | Loss --> 2.488 | Grad_l2 --> 0.429 | Weights_l2 --> 7145.663 | Lr --> 0.019 | Seconds_per_step --> 3.312 |
+[2024-09-03 01:38:09,090][Main][INFO] - [train] Step 9200 out of 65536 | Loss --> 2.458 | Grad_l2 --> 0.651 | Weights_l2 --> 7151.751 | Lr --> 0.019 | Seconds_per_step --> 3.322 |
+[2024-09-03 01:43:40,183][Main][INFO] - [train] Step 9300 out of 65536 | Loss --> 2.481 | Grad_l2 --> 0.667 | Weights_l2 --> 7157.979 | Lr --> 0.019 | Seconds_per_step --> 3.311 |
+[2024-09-03 01:49:12,323][Main][INFO] - [train] Step 9400 out of 65536 | Loss --> 2.454 | Grad_l2 --> 0.500 | Weights_l2 --> 7164.722 | Lr --> 0.019 | Seconds_per_step --> 3.321 |
+[2024-09-03 01:54:44,360][Main][INFO] - [train] Step 9500 out of 65536 | Loss --> 2.434 | Grad_l2 --> 0.434 | Weights_l2 --> 7171.100 | Lr --> 0.020 | Seconds_per_step --> 3.320 |
+[2024-09-03 02:00:15,384][Main][INFO] - [train] Step 9600 out of 65536 | Loss --> 2.430 | Grad_l2 --> 0.459 | Weights_l2 --> 7177.669 | Lr --> 0.020 | Seconds_per_step --> 3.310 |
+[2024-09-03 02:05:47,653][Main][INFO] - [train] Step 9700 out of 65536 | Loss --> 2.435 | Grad_l2 --> 0.458 | Weights_l2 --> 7184.407 | Lr --> 0.020 | Seconds_per_step --> 3.323 |
+[2024-09-03 02:11:19,839][Main][INFO] - [train] Step 9800 out of 65536 | Loss --> 2.431 | Grad_l2 --> 0.796 | Weights_l2 --> 7190.992 | Lr --> 0.020 | Seconds_per_step --> 3.322 |
+[2024-09-03 02:16:50,929][Main][INFO] - [train] Step 9900 out of 65536 | Loss --> 2.403 | Grad_l2 --> 0.782 | Weights_l2 --> 7197.863 | Lr --> 0.020 | Seconds_per_step --> 3.311 |
+[2024-09-03 02:22:23,236][Main][INFO] - [train] Step 10000 out of 65536 | Loss --> 2.445 | Grad_l2 --> 1.140 | Weights_l2 --> 7204.637 | Lr --> 0.020 | Seconds_per_step --> 3.323 |
+[2024-09-03 02:22:23,238][accelerate.accelerator][INFO] - Saving current state to checkpoint-pt-10000
+[2024-09-03 02:22:23,245][accelerate.utils.other][WARNING] - Removed shared tensor {'lm_head.weight'} while saving. This should be OK, but check by verifying that you don't receive any warning while reloading
+[2024-09-03 02:22:29,395][accelerate.checkpointing][INFO] - Model weights saved in checkpoint-pt-10000/model.safetensors
+[2024-09-03 02:22:38,780][accelerate.checkpointing][INFO] - Optimizer state saved in checkpoint-pt-10000/optimizer.bin
+[2024-09-03 02:22:38,784][accelerate.checkpointing][INFO] - Scheduler state saved in checkpoint-pt-10000/scheduler.bin
+[2024-09-03 02:22:38,784][accelerate.checkpointing][INFO] - Sampler state for dataloader 0 saved in checkpoint-pt-10000/sampler.bin
+[2024-09-03 02:22:38,785][accelerate.checkpointing][INFO] - Sampler state for dataloader 1 saved in checkpoint-pt-10000/sampler_1.bin
+[2024-09-03 02:22:38,790][accelerate.checkpointing][INFO] - Random states saved in checkpoint-pt-10000/random_states_0.pkl
+[2024-09-03 02:28:09,713][Main][INFO] - [train] Step 10100 out of 65536 | Loss --> 2.441 | Grad_l2 --> 1.063 | Weights_l2 --> 7212.671 | Lr --> 0.020 | Seconds_per_step --> 3.465 |
+[2024-09-03 02:33:42,096][Main][INFO] - [train] Step 10200 out of 65536 | Loss --> 2.421 | Grad_l2 --> 1.135 | Weights_l2 --> 7219.539 | Lr --> 0.020 | Seconds_per_step --> 3.324 |
+[2024-09-03 02:39:14,331][Main][INFO] - [train] Step 10300 out of 65536 | Loss --> 2.408 | Grad_l2 --> 1.377 | Weights_l2 --> 7226.397 | Lr --> 0.020 | Seconds_per_step --> 3.322 |
+[2024-09-03 02:44:45,309][Main][INFO] - [train] Step 10400 out of 65536 | Loss --> 2.385 | Grad_l2 --> 1.568 | Weights_l2 --> 7232.973 | Lr --> 0.020 | Seconds_per_step --> 3.310 |
+[2024-09-03 02:50:17,356][Main][INFO] - [train] Step 10500 out of 65536 | Loss --> 2.383 | Grad_l2 --> 5.267 | Weights_l2 --> 7238.788 | Lr --> 0.020 | Seconds_per_step --> 3.320 |
+[2024-09-03 02:55:49,191][Main][INFO] - [train] Step 10600 out of 65536 | Loss --> 51.695 | Grad_l2 --> 2316.455 | Weights_l2 --> 7233.899 | Lr --> 0.020 | Seconds_per_step --> 3.318 |
+[2024-09-03 03:01:20,350][Main][INFO] - [train] Step 10700 out of 65536 | Loss --> 19.189 | Grad_l2 --> 206.407 | Weights_l2 --> 7221.798 | Lr --> 0.020 | Seconds_per_step --> 3.312 |
+[2024-09-03 03:06:52,743][Main][INFO] - [train] Step 10800 out of 65536 | Loss --> 6.908 | Grad_l2 --> 26.249 | Weights_l2 --> 7210.980 | Lr --> 0.020 | Seconds_per_step --> 3.324 |
+[2024-09-03 03:12:23,733][Main][INFO] - [train] Step 10900 out of 65536 | Loss --> 42.736 | Grad_l2 --> 1292.659 | Weights_l2 --> 7206.464 | Lr --> 0.020 | Seconds_per_step --> 3.310 |
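The log shows full state (model, optimizer, scheduler, samplers, RNG) saved with 🤗 Accelerate every 5000 steps. A minimal resume sketch, assuming the model, optimizer, scheduler, and dataloaders are first rebuilt and registered via `accelerator.prepare` exactly as in training:

```python
from accelerate import Accelerator

accelerator = Accelerator(mixed_precision="bf16")

# The objects must be registered before load_state, mirroring the training setup:
# model, optimizer, lr_scheduler, train_dl = accelerator.prepare(...)

# Restores weights, optimizer/scheduler state, sampler positions, and RNG.
accelerator.load_state("checkpoints/checkpoint-pt-10000")
```

Note that the tail of the log shows a loss and gradient spike around step 10600; checkpoint-pt-10000 predates it.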
checkpoints/wandb/debug-internal.log ADDED
(The diff for this file is too large to render; see the raw diff.)
checkpoints/wandb/debug.log ADDED
@@ -0,0 +1,28 @@
+2024-09-02 17:03:04,741 INFO MainThread:6499 [wandb_setup.py:_flush():77] Current SDK version is 0.17.8
+2024-09-02 17:03:04,741 INFO MainThread:6499 [wandb_setup.py:_flush():77] Configure stats pid to 6499
+2024-09-02 17:03:04,741 INFO MainThread:6499 [wandb_setup.py:_flush():77] Loading settings from /root/.config/wandb/settings
+2024-09-02 17:03:04,741 INFO MainThread:6499 [wandb_setup.py:_flush():77] Loading settings from /workspace/nanoT5/logs/2024-09-02/17-03-02/wandb/settings
+2024-09-02 17:03:04,742 INFO MainThread:6499 [wandb_setup.py:_flush():77] Loading settings from environment variables: {}
+2024-09-02 17:03:04,742 INFO MainThread:6499 [wandb_setup.py:_flush():77] Applying setup settings: {'_disable_service': False}
+2024-09-02 17:03:04,742 WARNING MainThread:6499 [wandb_setup.py:_flush():77] Could not find program at -m nanoT5.main
+2024-09-02 17:03:04,743 INFO MainThread:6499 [wandb_setup.py:_flush():77] Inferring run settings from compute environment: {'program_relpath': None, 'program': '-m nanoT5.main'}
+2024-09-02 17:03:04,743 INFO MainThread:6499 [wandb_setup.py:_flush():77] Applying login settings: {}
+2024-09-02 17:03:04,743 INFO MainThread:6499 [wandb_init.py:_log_setup():524] Logging user logs to /workspace/nanoT5/logs/2024-09-02/17-03-02/wandb/run-20240902_170304-v43qltex/logs/debug.log
+2024-09-02 17:03:04,744 INFO MainThread:6499 [wandb_init.py:_log_setup():525] Logging internal logs to /workspace/nanoT5/logs/2024-09-02/17-03-02/wandb/run-20240902_170304-v43qltex/logs/debug-internal.log
+2024-09-02 17:03:04,744 INFO MainThread:6499 [wandb_init.py:init():607] calling init triggers
+2024-09-02 17:03:04,744 INFO MainThread:6499 [wandb_init.py:init():614] wandb.init called with sweep_config: {}
+config: {'mode': 'pt', 'device': 'gpu', 'precision': 'bf16', 'eval_only': False, 'predict_only': False, 'seed': 2137, 'tokenizer': {'name': 'BEE-spoke-data/slimpajama_tok-48128-BPE-forT5'}, 'model': {'klass': 'custom_seq2seq', 'name': 'google/t5-v1_1-base', 'overwrite': None, 'add_config': None, 'checkpoint_path': '', 'random_init': True, 'compile': True}, 'data': {'input_length': 1024, 'mlm_probability': 0.15, 'mean_noise_span_length': 3.0, 'num_workers': 8}, 'optim': {'name': 'adamwscale', 'base_lr': 0.02, 'batch_size': 64, 'total_steps': 65536, 'epochs': -1, 'warmup_steps': 10000, 'lr_scheduler': 'cosine', 'weight_decay': 0.001, 'grad_clip': 1.0, 'grad_acc': 4, 'final_cosine': 1e-05}, 'eval': {'every_steps': 100000, 'steps': 500}, 'checkpoint': {'every_steps': 5000}, 'logging': {'every_steps': 100, 'grad_l2': True, 'weights_l2': True, 'use_wandb': True, 'wandb_config': {'project': 'nano-custom-seq2seq', 'entity': 'amazingvince', 'tags': ['nanoT5', 'my_tag'], 'mode': 'online'}}, 'slurm_id': 'none', 'working_dir': '/workspace/nanoT5/logs/2024-09-02/17-03-02'}
+2024-09-02 17:03:04,745 INFO MainThread:6499 [wandb_init.py:init():657] starting backend
+2024-09-02 17:03:04,745 INFO MainThread:6499 [wandb_init.py:init():661] setting up manager
+2024-09-02 17:03:04,760 INFO MainThread:6499 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2024-09-02 17:03:04,761 INFO MainThread:6499 [wandb_init.py:init():669] backend started and connected
+2024-09-02 17:03:04,776 INFO MainThread:6499 [wandb_init.py:init():767] updated telemetry
+2024-09-02 17:03:04,819 INFO MainThread:6499 [wandb_init.py:init():800] communicating run to backend with 90.0 second timeout
+2024-09-02 17:03:05,519 INFO MainThread:6499 [wandb_init.py:init():851] starting run threads in backend
+2024-09-02 17:03:05,817 INFO MainThread:6499 [wandb_run.py:_console_start():2463] atexit reg
+2024-09-02 17:03:05,818 INFO MainThread:6499 [wandb_run.py:_redirect():2309] redirect: wrap_raw
+2024-09-02 17:03:05,819 INFO MainThread:6499 [wandb_run.py:_redirect():2374] Wrapping output streams.
+2024-09-02 17:03:05,819 INFO MainThread:6499 [wandb_run.py:_redirect():2399] Redirects installed.
+2024-09-02 17:03:05,822 INFO MainThread:6499 [wandb_init.py:init():894] run started, returning control to user process
+2024-09-02 17:03:35,512 INFO MainThread:6499 [wandb_run.py:_config_callback():1392] config_cb None None {'mode': 'pt', 'device': 'gpu', 'precision': 'bf16', 'eval_only': False, 'predict_only': False, 'seed': 2137, 'tokenizer': {'name': 'BEE-spoke-data/slimpajama_tok-48128-BPE-forT5'}, 'model': {'klass': 'custom_seq2seq', 'name': 'google/t5-v1_1-base', 'overwrite': None, 'add_config': None, 'checkpoint_path': '', 'random_init': True, 'compile': True}, 'data': {'input_length': 1024, 'mlm_probability': 0.15, 'mean_noise_span_length': 3.0, 'num_workers': 8, 'before_mask_input_length': 1137, 'target_length': 229}, 'optim': {'name': 'adamwscale', 'base_lr': 0.02, 'batch_size': 64, 'total_steps': 65536, 'epochs': -1, 'warmup_steps': 10000, 'lr_scheduler': 'cosine', 'weight_decay': 0.001, 'grad_clip': 1.0, 'grad_acc': 4, 'final_cosine': 1e-05}, 'eval': {'every_steps': 100000, 'steps': 500, 'corrected_steps': 500}, 'checkpoint': {'every_steps': 5000}, 'logging': {'every_steps': 100, 'grad_l2': True, 'weights_l2': True, 'use_wandb': True, 'wandb_config': {'project': 'nano-custom-seq2seq', 'entity': 'amazingvince', 'tags': ['nanoT5', 'my_tag'], 'mode': 'online'}}, 'slurm_id': 'none', 'working_dir': '/workspace/nanoT5/logs/2024-09-02/17-03-02', 'n_all_param': 673076736}
+2024-09-03 03:17:10,763 WARNING MsgRouterThr:6499 [router.py:message_loop():77] message_loop has been closed
checkpoints/wandb/run-20240902_170304-v43qltex/files/config.yaml ADDED
@@ -0,0 +1,132 @@
+wandb_version: 1
+
+mode:
+  desc: null
+  value: pt
+device:
+  desc: null
+  value: gpu
+precision:
+  desc: null
+  value: bf16
+eval_only:
+  desc: null
+  value: false
+predict_only:
+  desc: null
+  value: false
+seed:
+  desc: null
+  value: 2137
+tokenizer:
+  desc: null
+  value:
+    name: BEE-spoke-data/slimpajama_tok-48128-BPE-forT5
+model:
+  desc: null
+  value:
+    klass: custom_seq2seq
+    name: google/t5-v1_1-base
+    overwrite: null
+    add_config: null
+    checkpoint_path: ''
+    random_init: true
+    compile: true
+data:
+  desc: null
+  value:
+    input_length: 1024
+    mlm_probability: 0.15
+    mean_noise_span_length: 3.0
+    num_workers: 8
+    before_mask_input_length: 1137
+    target_length: 229
+optim:
+  desc: null
+  value:
+    name: adamwscale
+    base_lr: 0.02
+    batch_size: 64
+    total_steps: 65536
+    epochs: -1
+    warmup_steps: 10000
+    lr_scheduler: cosine
+    weight_decay: 0.001
+    grad_clip: 1.0
+    grad_acc: 4
+    final_cosine: 1.0e-05
+eval:
+  desc: null
+  value:
+    every_steps: 100000
+    steps: 500
+    corrected_steps: 500
+checkpoint:
+  desc: null
+  value:
+    every_steps: 5000
+logging:
+  desc: null
+  value:
+    every_steps: 100
+    grad_l2: true
+    weights_l2: true
+    use_wandb: true
+    wandb_config:
+      project: nano-custom-seq2seq
+      entity: amazingvince
+      tags:
+      - nanoT5
+      - my_tag
+      mode: online
+slurm_id:
+  desc: null
+  value: none
+working_dir:
+  desc: null
+  value: /workspace/nanoT5/logs/2024-09-02/17-03-02
+_wandb:
+  desc: null
+  value:
+    python_version: 3.11.9
+    cli_version: 0.17.8
+    framework: huggingface
+    huggingface_version: 4.44.2
+    is_jupyter_run: false
+    is_kaggle_kernel: false
+    start_time: 1725296584
+    t:
+      1:
+      - 1
+      - 11
+      - 41
+      - 49
+      - 50
+      - 51
+      - 55
+      - 71
+      - 100
+      2:
+      - 1
+      - 11
+      - 41
+      - 49
+      - 50
+      - 51
+      - 55
+      - 71
+      - 100
+      3:
+      - 15
+      - 16
+      - 23
+      - 61
+      4: 3.11.9
+      5: 0.17.8
+      6: 4.44.2
+      8:
+      - 5
+      13: linux-x86_64
+n_all_param:
+  desc: null
+  value: 673076736
checkpoints/wandb/run-20240902_170304-v43qltex/files/output.log
ADDED
@@ -0,0 +1,253 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
loading file tokenizer.model from cache at /root/.cache/huggingface/hub/models--BEE-spoke-data--slimpajama_tok-48128-BPE-forT5/snapshots/daab53ce6ef3ec52824b04e85fe8cf762739b407/tokenizer.model
|
2 |
+
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--BEE-spoke-data--slimpajama_tok-48128-BPE-forT5/snapshots/daab53ce6ef3ec52824b04e85fe8cf762739b407/tokenizer.json
|
3 |
+
loading file added_tokens.json from cache at /root/.cache/huggingface/hub/models--BEE-spoke-data--slimpajama_tok-48128-BPE-forT5/snapshots/daab53ce6ef3ec52824b04e85fe8cf762739b407/added_tokens.json
|
4 |
+
loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--BEE-spoke-data--slimpajama_tok-48128-BPE-forT5/snapshots/daab53ce6ef3ec52824b04e85fe8cf762739b407/special_tokens_map.json
|
5 |
+
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--BEE-spoke-data--slimpajama_tok-48128-BPE-forT5/snapshots/daab53ce6ef3ec52824b04e85fe8cf762739b407/tokenizer_config.json
|
6 |
+
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
|
7 |
+
Resolving data files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 104/104 [00:00<00:00, 237.82it/s]
|
8 |
+
Resolving data files: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 234/234 [00:00<00:00, 158787.76it/s]
|
9 |
+
Resolving data files: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 17/17 [00:00<00:00, 45.39it/s]
|
10 |
+
Resolving data files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 17/17 [00:00<00:00, 50142.87it/s]
|
11 |
+
Resolving data files: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 104/104 [00:00<00:00, 96827.44it/s]
|
12 |
+
Resolving data files: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 104/104 [00:00<00:00, 98644.87it/s]
|
13 |
+
|
14 |
+
|
15 |
+
Resolving data files: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1024/1024 [00:04<00:00, 223.35it/s]
|
16 |
+
Resolving data files: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1024/1024 [00:01<00:00, 908.08it/s]
|
17 |
+
Configuration saved in ./config.json
|
18 |
+
=========================================================================
|
19 |
+
Layer (type:depth-idx) Output Shape Param # Trainable
|
20 |
+
=========================================================================
|
21 |
+
CustomSeq2SeqLLM 673,076,736 True
|
22 |
+
Embedding 49,414,144 True
|
23 |
+
CustomEncoder 193,012,736 True
|
24 |
+
ModuleList 193,011,712 True
|
25 |
+
EncoderLayer 12,063,232 True
|
26 |
+
EncoderLayer 12,063,232 True
|
27 |
+
EncoderLayer 12,063,232 True
|
28 |
+
EncoderLayer 12,063,232 True
|
29 |
+
EncoderLayer 12,063,232 True
|
30 |
+
EncoderLayer 12,063,232 True
|
31 |
+
EncoderLayer 12,063,232 True
|
32 |
+
EncoderLayer 12,063,232 True
|
33 |
+
EncoderLayer 12,063,232 True
|
34 |
+
EncoderLayer 12,063,232 True
|
35 |
+
EncoderLayer 12,063,232 True
|
36 |
+
EncoderLayer 12,063,232 True
|
37 |
+
EncoderLayer 12,063,232 True
|
38 |
+
EncoderLayer 12,063,232 True
|
39 |
+
EncoderLayer 12,063,232 True
|
40 |
+
EncoderLayer 12,063,232 True
|
41 |
+
RMSNorm 1,024 True
|
42 |
+
CustomDecoder 430,649,856 True
|
43 |
+
ModuleList 430,648,832 True
|
44 |
+
DecoderLayer 14,688,256 True
|
45 |
+
DecoderLayer 12,063,232 True
|
46 |
+
DecoderLayer 14,688,256 True
|
47 |
+
DecoderLayer 12,063,232 True
|
48 |
+
DecoderLayer 14,688,256 True
|
49 |
+
DecoderLayer 12,063,232 True
|
50 |
+
DecoderLayer 14,688,256 True
|
51 |
+
DecoderLayer 12,063,232 True
|
52 |
+
DecoderLayer 14,688,256 True
|
53 |
+
DecoderLayer 12,063,232 True
|
54 |
+
DecoderLayer 14,688,256 True
|
55 |
+
DecoderLayer 12,063,232 True
|
56 |
+
DecoderLayer 14,688,256 True
|
57 |
+
DecoderLayer 12,063,232 True
|
58 |
+
DecoderLayer 14,688,256 True
|
59 |
+
DecoderLayer 12,063,232 True
|
60 |
+
DecoderLayer 14,688,256 True
|
61 |
+
DecoderLayer 12,063,232 True
|
62 |
+
DecoderLayer 14,688,256 True
|
63 |
+
DecoderLayer 12,063,232 True
|
64 |
+
DecoderLayer 14,688,256 True
|
65 |
+
DecoderLayer 12,063,232 True
|
66 |
+
DecoderLayer 14,688,256 True
|
67 |
+
DecoderLayer 12,063,232 True
|
68 |
+
DecoderLayer 14,688,256 True
|
69 |
+
DecoderLayer 12,063,232 True
|
70 |
+
DecoderLayer 14,688,256 True
|
71 |
+
DecoderLayer 12,063,232 True
|
72 |
+
DecoderLayer 14,688,256 True
|
73 |
+
DecoderLayer 12,063,232 True
|
74 |
+
DecoderLayer 14,688,256 True
|
75 |
+
DecoderLayer 14,688,256 True
|
76 |
+
RMSNorm 1,024 True
|
77 |
+
Linear 49,414,144 True
|
78 |
+
LigerCrossEntropyLoss -- False
|
79 |
+
=========================================================================
|
80 |
+
Total params: 673,076,736
|
81 |
+
Trainable params: 673,076,736
|
82 |
+
Non-trainable params: --
|
83 |
+
=========================================================================
|
84 |
+
W0902 17:08:31.372000 140679957673088 torch/_dynamo/variables/tensor.py:715] [30/0] Graph break from `Tensor.item()`, consider setting:
|
85 |
+
W0902 17:08:31.372000 140679957673088 torch/_dynamo/variables/tensor.py:715] [30/0] torch._dynamo.config.capture_scalar_outputs = True
|
86 |
+
W0902 17:08:31.372000 140679957673088 torch/_dynamo/variables/tensor.py:715] [30/0] or:
|
87 |
+
W0902 17:08:31.372000 140679957673088 torch/_dynamo/variables/tensor.py:715] [30/0] env TORCHDYNAMO_CAPTURE_SCALAR_OUTPUTS=1
|
88 |
+
W0902 17:08:31.372000 140679957673088 torch/_dynamo/variables/tensor.py:715] [30/0] to include these operations in the captured graph.
|
89 |
+
W0902 17:08:31.372000 140679957673088 torch/_dynamo/variables/tensor.py:715] [30/0]
|
90 |
+
[2024-09-02 17:14:53,691][Main][INFO] - [train] Step 100 out of 65536 | Loss --> 51.971 | Grad_l2 --> 82.676 | Weights_l2 --> 7042.062 | Lr --> 0.010 | Seconds_per_step --> 6.760 |
|
91 |
+
[2024-09-02 17:20:23,699][Main][INFO] - [train] Step 200 out of 65536 | Loss --> 14.150 | Grad_l2 --> 19.390 | Weights_l2 --> 7034.376 | Lr --> 0.010 | Seconds_per_step --> 3.300 |
|
92 |
+
[2024-09-02 17:25:54,840][Main][INFO] - [train] Step 300 out of 65536 | Loss --> 9.006 | Grad_l2 --> 9.061 | Weights_l2 --> 7026.824 | Lr --> 0.010 | Seconds_per_step --> 3.311 |
|
93 |
+
[2024-09-02 17:31:26,095][Main][INFO] - [train] Step 400 out of 65536 | Loss --> 7.529 | Grad_l2 --> 5.889 | Weights_l2 --> 7019.014 | Lr --> 0.010 | Seconds_per_step --> 3.313 |
|
94 |
+
[2024-09-02 17:36:56,190][Main][INFO] - [train] Step 500 out of 65536 | Loss --> 6.618 | Grad_l2 --> 4.039 | Weights_l2 --> 7010.897 | Lr --> 0.011 | Seconds_per_step --> 3.301 |
|
95 |
+
[2024-09-02 17:42:27,693][Main][INFO] - [train] Step 600 out of 65536 | Loss --> 5.994 | Grad_l2 --> 2.962 | Weights_l2 --> 7002.549 | Lr --> 0.011 | Seconds_per_step --> 3.315 |
|
96 |
+
[2024-09-02 17:47:57,967][Main][INFO] - [train] Step 700 out of 65536 | Loss --> 5.703 | Grad_l2 --> 2.434 | Weights_l2 --> 6994.267 | Lr --> 0.011 | Seconds_per_step --> 3.303 |
|
97 |
+
[2024-09-02 17:53:29,228][Main][INFO] - [train] Step 800 out of 65536 | Loss --> 6.603 | Grad_l2 --> 6.221 | Weights_l2 --> 6985.927 | Lr --> 0.011 | Seconds_per_step --> 3.313 |
|
98 |
+
[2024-09-02 17:59:00,011][Main][INFO] - [train] Step 900 out of 65536 | Loss --> 5.408 | Grad_l2 --> 1.465 | Weights_l2 --> 6980.026 | Lr --> 0.011 | Seconds_per_step --> 3.308 |
|
99 |
+
[2024-09-02 18:04:30,275][Main][INFO] - [train] Step 1000 out of 65536 | Loss --> 5.311 | Grad_l2 --> 0.992 | Weights_l2 --> 6975.109 | Lr --> 0.011 | Seconds_per_step --> 3.303 |
|
100 |
+
[2024-09-02 18:10:01,468][Main][INFO] - [train] Step 1100 out of 65536 | Loss --> 5.241 | Grad_l2 --> 0.854 | Weights_l2 --> 6970.708 | Lr --> 0.011 | Seconds_per_step --> 3.312 |
|
101 |
+
[2024-09-02 18:15:33,362][Main][INFO] - [train] Step 1200 out of 65536 | Loss --> 5.180 | Grad_l2 --> 0.838 | Weights_l2 --> 6966.641 | Lr --> 0.011 | Seconds_per_step --> 3.319 |
|
102 |
+
[2024-09-02 18:21:03,902][Main][INFO] - [train] Step 1300 out of 65536 | Loss --> 5.126 | Grad_l2 --> 0.764 | Weights_l2 --> 6962.789 | Lr --> 0.011 | Seconds_per_step --> 3.305 |
|
103 |
+
[2024-09-02 18:26:35,349][Main][INFO] - [train] Step 1400 out of 65536 | Loss --> 5.088 | Grad_l2 --> 0.744 | Weights_l2 --> 6959.146 | Lr --> 0.011 | Seconds_per_step --> 3.314 |
[2024-09-02 18:32:06,048][Main][INFO] - [train] Step 1500 out of 65536 | Loss --> 5.046 | Grad_l2 --> 0.702 | Weights_l2 --> 6955.673 | Lr --> 0.012 | Seconds_per_step --> 3.307 |
[2024-09-02 18:37:37,903][Main][INFO] - [train] Step 1600 out of 65536 | Loss --> 5.007 | Grad_l2 --> 0.691 | Weights_l2 --> 6952.523 | Lr --> 0.012 | Seconds_per_step --> 3.319 |
[2024-09-02 18:43:09,723][Main][INFO] - [train] Step 1700 out of 65536 | Loss --> 4.973 | Grad_l2 --> 0.673 | Weights_l2 --> 6949.412 | Lr --> 0.012 | Seconds_per_step --> 3.318 |
[2024-09-02 18:48:40,909][Main][INFO] - [train] Step 1800 out of 65536 | Loss --> 4.943 | Grad_l2 --> 0.671 | Weights_l2 --> 6946.498 | Lr --> 0.012 | Seconds_per_step --> 3.312 |
[2024-09-02 18:54:13,524][Main][INFO] - [train] Step 1900 out of 65536 | Loss --> 4.929 | Grad_l2 --> 0.668 | Weights_l2 --> 6943.795 | Lr --> 0.012 | Seconds_per_step --> 3.326 |
[2024-09-02 18:59:45,500][Main][INFO] - [train] Step 2000 out of 65536 | Loss --> 4.894 | Grad_l2 --> 0.665 | Weights_l2 --> 6941.241 | Lr --> 0.012 | Seconds_per_step --> 3.320 |
[2024-09-02 19:05:16,395][Main][INFO] - [train] Step 2100 out of 65536 | Loss --> 4.881 | Grad_l2 --> 0.713 | Weights_l2 --> 6938.861 | Lr --> 0.012 | Seconds_per_step --> 3.309 |
[2024-09-02 19:10:48,520][Main][INFO] - [train] Step 2200 out of 65536 | Loss --> 4.853 | Grad_l2 --> 0.653 | Weights_l2 --> 6936.551 | Lr --> 0.012 | Seconds_per_step --> 3.321 |
[2024-09-02 19:16:19,278][Main][INFO] - [train] Step 2300 out of 65536 | Loss --> 4.829 | Grad_l2 --> 0.646 | Weights_l2 --> 6934.357 | Lr --> 0.012 | Seconds_per_step --> 3.308 |
[2024-09-02 19:21:51,370][Main][INFO] - [train] Step 2400 out of 65536 | Loss --> 4.790 | Grad_l2 --> 0.620 | Weights_l2 --> 6932.338 | Lr --> 0.012 | Seconds_per_step --> 3.321 |
[2024-09-02 19:27:23,544][Main][INFO] - [train] Step 2500 out of 65536 | Loss --> 4.784 | Grad_l2 --> 0.643 | Weights_l2 --> 6930.395 | Lr --> 0.013 | Seconds_per_step --> 3.322 |
[2024-09-02 19:32:54,341][Main][INFO] - [train] Step 2600 out of 65536 | Loss --> 4.755 | Grad_l2 --> 0.623 | Weights_l2 --> 6928.543 | Lr --> 0.013 | Seconds_per_step --> 3.308 |
[2024-09-02 19:38:25,942][Main][INFO] - [train] Step 2700 out of 65536 | Loss --> 4.743 | Grad_l2 --> 0.636 | Weights_l2 --> 6926.944 | Lr --> 0.013 | Seconds_per_step --> 3.316 |
[2024-09-02 19:43:57,708][Main][INFO] - [train] Step 2800 out of 65536 | Loss --> 4.722 | Grad_l2 --> 0.590 | Weights_l2 --> 6925.379 | Lr --> 0.013 | Seconds_per_step --> 3.318 |
[2024-09-02 19:49:28,285][Main][INFO] - [train] Step 2900 out of 65536 | Loss --> 4.715 | Grad_l2 --> 0.622 | Weights_l2 --> 6924.007 | Lr --> 0.013 | Seconds_per_step --> 3.306 |
[2024-09-02 19:54:59,957][Main][INFO] - [train] Step 3000 out of 65536 | Loss --> 4.694 | Grad_l2 --> 0.652 | Weights_l2 --> 6922.709 | Lr --> 0.013 | Seconds_per_step --> 3.317 |
[2024-09-02 20:00:31,072][Main][INFO] - [train] Step 3100 out of 65536 | Loss --> 4.678 | Grad_l2 --> 0.614 | Weights_l2 --> 6921.561 | Lr --> 0.013 | Seconds_per_step --> 3.311 |
[2024-09-02 20:06:02,747][Main][INFO] - [train] Step 3200 out of 65536 | Loss --> 4.633 | Grad_l2 --> 0.610 | Weights_l2 --> 6920.463 | Lr --> 0.013 | Seconds_per_step --> 3.317 |
[2024-09-02 20:11:34,607][Main][INFO] - [train] Step 3300 out of 65536 | Loss --> 4.599 | Grad_l2 --> 0.638 | Weights_l2 --> 6919.642 | Lr --> 0.013 | Seconds_per_step --> 3.319 |
[2024-09-02 20:17:05,731][Main][INFO] - [train] Step 3400 out of 65536 | Loss --> 4.549 | Grad_l2 --> 0.774 | Weights_l2 --> 6919.263 | Lr --> 0.013 | Seconds_per_step --> 3.311 |
[2024-09-02 20:22:37,601][Main][INFO] - [train] Step 3500 out of 65536 | Loss --> 4.420 | Grad_l2 --> 0.934 | Weights_l2 --> 6918.974 | Lr --> 0.014 | Seconds_per_step --> 3.319 |
[2024-09-02 20:28:09,554][Main][INFO] - [train] Step 3600 out of 65536 | Loss --> 4.256 | Grad_l2 --> 0.763 | Weights_l2 --> 6919.477 | Lr --> 0.014 | Seconds_per_step --> 3.319 |
[2024-09-02 20:33:40,654][Main][INFO] - [train] Step 3700 out of 65536 | Loss --> 4.131 | Grad_l2 --> 0.657 | Weights_l2 --> 6920.705 | Lr --> 0.014 | Seconds_per_step --> 3.311 |
[2024-09-02 20:39:13,064][Main][INFO] - [train] Step 3800 out of 65536 | Loss --> 4.021 | Grad_l2 --> 0.709 | Weights_l2 --> 6922.188 | Lr --> 0.014 | Seconds_per_step --> 3.324 |
[2024-09-02 20:44:45,663][Main][INFO] - [train] Step 3900 out of 65536 | Loss --> 3.909 | Grad_l2 --> 0.637 | Weights_l2 --> 6923.666 | Lr --> 0.014 | Seconds_per_step --> 3.326 |
[2024-09-02 20:50:16,811][Main][INFO] - [train] Step 4000 out of 65536 | Loss --> 3.855 | Grad_l2 --> 1.013 | Weights_l2 --> 6923.778 | Lr --> 0.014 | Seconds_per_step --> 3.311 |
[2024-09-02 20:55:49,235][Main][INFO] - [train] Step 4100 out of 65536 | Loss --> 3.770 | Grad_l2 --> 0.589 | Weights_l2 --> 6925.545 | Lr --> 0.014 | Seconds_per_step --> 3.324 |
[2024-09-02 21:01:20,500][Main][INFO] - [train] Step 4200 out of 65536 | Loss --> 3.710 | Grad_l2 --> 0.579 | Weights_l2 --> 6927.200 | Lr --> 0.014 | Seconds_per_step --> 3.313 |
[2024-09-02 21:06:53,406][Main][INFO] - [train] Step 4300 out of 65536 | Loss --> 3.651 | Grad_l2 --> 0.588 | Weights_l2 --> 6928.842 | Lr --> 0.014 | Seconds_per_step --> 3.329 |
[2024-09-02 21:12:26,298][Main][INFO] - [train] Step 4400 out of 65536 | Loss --> 3.614 | Grad_l2 --> 0.632 | Weights_l2 --> 6930.597 | Lr --> 0.014 | Seconds_per_step --> 3.329 |
[2024-09-02 21:17:57,623][Main][INFO] - [train] Step 4500 out of 65536 | Loss --> 3.582 | Grad_l2 --> 0.884 | Weights_l2 --> 6931.569 | Lr --> 0.015 | Seconds_per_step --> 3.313 |
[2024-09-02 21:23:30,116][Main][INFO] - [train] Step 4600 out of 65536 | Loss --> 3.527 | Grad_l2 --> 0.582 | Weights_l2 --> 6933.783 | Lr --> 0.015 | Seconds_per_step --> 3.325 |
[2024-09-02 21:29:02,417][Main][INFO] - [train] Step 4700 out of 65536 | Loss --> 3.476 | Grad_l2 --> 0.549 | Weights_l2 --> 6935.959 | Lr --> 0.015 | Seconds_per_step --> 3.323 |
[2024-09-02 21:34:33,535][Main][INFO] - [train] Step 4800 out of 65536 | Loss --> 3.430 | Grad_l2 --> 0.551 | Weights_l2 --> 6938.224 | Lr --> 0.015 | Seconds_per_step --> 3.311 |
[2024-09-02 21:40:05,905][Main][INFO] - [train] Step 4900 out of 65536 | Loss --> 3.395 | Grad_l2 --> 0.550 | Weights_l2 --> 6940.617 | Lr --> 0.015 | Seconds_per_step --> 3.324 |
[2024-09-02 21:45:36,944][Main][INFO] - [train] Step 5000 out of 65536 | Loss --> 3.366 | Grad_l2 --> 0.546 | Weights_l2 --> 6943.230 | Lr --> 0.015 | Seconds_per_step --> 3.310 |
[2024-09-02 21:45:36,947][accelerate.accelerator][INFO] - Saving current state to checkpoint-pt-5000
[2024-09-02 21:45:36,954][accelerate.utils.other][WARNING] - Removed shared tensor {'lm_head.weight'} while saving. This should be OK, but check by verifying that you don't receive any warning while reloading
[2024-09-02 21:45:44,182][accelerate.checkpointing][INFO] - Model weights saved in checkpoint-pt-5000/model.safetensors
[2024-09-02 21:45:54,822][accelerate.checkpointing][INFO] - Optimizer state saved in checkpoint-pt-5000/optimizer.bin
[2024-09-02 21:45:54,827][accelerate.checkpointing][INFO] - Scheduler state saved in checkpoint-pt-5000/scheduler.bin
[2024-09-02 21:45:54,828][accelerate.checkpointing][INFO] - Sampler state for dataloader 0 saved in checkpoint-pt-5000/sampler.bin
[2024-09-02 21:45:54,829][accelerate.checkpointing][INFO] - Sampler state for dataloader 1 saved in checkpoint-pt-5000/sampler_1.bin
[2024-09-02 21:45:54,835][accelerate.checkpointing][INFO] - Random states saved in checkpoint-pt-5000/random_states_0.pkl
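
The `Removed shared tensor {'lm_head.weight'}` warning above is safetensors behavior: when two tensors share storage (here, apparently the LM head and the input embedding of this custom seq2seq model), only one copy is serialized and the other must be re-tied on load. A minimal verification sketch, not part of the original run; the tie target (model.shared) is an assumption about this model and may differ:

    from safetensors.torch import load_file

    # Load the raw tensor dict written by accelerate above.
    state_dict = load_file("checkpoint-pt-5000/model.safetensors")

    # The shared tensor was dropped at save time, so it should be absent here.
    assert "lm_head.weight" not in state_dict

    # After model.load_state_dict(state_dict, strict=False), re-tie the head,
    # e.g. model.lm_head.weight = model.shared.weight for a T5-style model.
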
[2024-09-02 21:51:26,402][Main][INFO] - [train] Step 5100 out of 65536 | Loss --> 3.302 | Grad_l2 --> 0.541 | Weights_l2 --> 6946.278 | Lr --> 0.015 | Seconds_per_step --> 3.495 |
[2024-09-02 21:56:58,321][Main][INFO] - [train] Step 5200 out of 65536 | Loss --> 3.248 | Grad_l2 --> 0.556 | Weights_l2 --> 6950.060 | Lr --> 0.015 | Seconds_per_step --> 3.319 |
[2024-09-02 22:02:29,452][Main][INFO] - [train] Step 5300 out of 65536 | Loss --> 3.194 | Grad_l2 --> 0.566 | Weights_l2 --> 6954.461 | Lr --> 0.015 | Seconds_per_step --> 3.311 |
[2024-09-02 22:08:01,594][Main][INFO] - [train] Step 5400 out of 65536 | Loss --> 3.144 | Grad_l2 --> 0.548 | Weights_l2 --> 6959.061 | Lr --> 0.015 | Seconds_per_step --> 3.321 |
[2024-09-02 22:13:33,473][Main][INFO] - [train] Step 5500 out of 65536 | Loss --> 3.099 | Grad_l2 --> 0.546 | Weights_l2 --> 6963.676 | Lr --> 0.016 | Seconds_per_step --> 3.319 |
[2024-09-02 22:19:04,763][Main][INFO] - [train] Step 5600 out of 65536 | Loss --> 3.044 | Grad_l2 --> 0.531 | Weights_l2 --> 6968.055 | Lr --> 0.016 | Seconds_per_step --> 3.313 |
[2024-09-02 22:24:37,024][Main][INFO] - [train] Step 5700 out of 65536 | Loss --> 3.023 | Grad_l2 --> 0.528 | Weights_l2 --> 6972.595 | Lr --> 0.016 | Seconds_per_step --> 3.323 |
[2024-09-02 22:30:08,010][Main][INFO] - [train] Step 5800 out of 65536 | Loss --> 2.999 | Grad_l2 --> 0.529 | Weights_l2 --> 6977.095 | Lr --> 0.016 | Seconds_per_step --> 3.310 |
[2024-09-02 22:35:40,260][Main][INFO] - [train] Step 5900 out of 65536 | Loss --> 2.953 | Grad_l2 --> 0.516 | Weights_l2 --> 6981.522 | Lr --> 0.016 | Seconds_per_step --> 3.322 |
[2024-09-02 22:41:12,494][Main][INFO] - [train] Step 6000 out of 65536 | Loss --> 2.924 | Grad_l2 --> 0.514 | Weights_l2 --> 6985.860 | Lr --> 0.016 | Seconds_per_step --> 3.322 |
[2024-09-02 22:46:43,439][Main][INFO] - [train] Step 6100 out of 65536 | Loss --> 2.904 | Grad_l2 --> 0.500 | Weights_l2 --> 6990.209 | Lr --> 0.016 | Seconds_per_step --> 3.309 |
[2024-09-02 22:52:15,361][Main][INFO] - [train] Step 6200 out of 65536 | Loss --> 2.885 | Grad_l2 --> 0.499 | Weights_l2 --> 6994.575 | Lr --> 0.016 | Seconds_per_step --> 3.319 |
[2024-09-02 22:57:47,371][Main][INFO] - [train] Step 6300 out of 65536 | Loss --> 2.860 | Grad_l2 --> 0.496 | Weights_l2 --> 6998.855 | Lr --> 0.016 | Seconds_per_step --> 3.320 |
[2024-09-02 23:03:18,243][Main][INFO] - [train] Step 6400 out of 65536 | Loss --> 2.828 | Grad_l2 --> 0.486 | Weights_l2 --> 7003.354 | Lr --> 0.016 | Seconds_per_step --> 3.309 |
[2024-09-02 23:08:50,256][Main][INFO] - [train] Step 6500 out of 65536 | Loss --> 2.823 | Grad_l2 --> 0.491 | Weights_l2 --> 7007.772 | Lr --> 0.017 | Seconds_per_step --> 3.320 |
[2024-09-02 23:14:21,254][Main][INFO] - [train] Step 6600 out of 65536 | Loss --> 2.801 | Grad_l2 --> 0.572 | Weights_l2 --> 7012.034 | Lr --> 0.017 | Seconds_per_step --> 3.310 |
[2024-09-02 23:19:53,383][Main][INFO] - [train] Step 6700 out of 65536 | Loss --> 2.776 | Grad_l2 --> 0.473 | Weights_l2 --> 7016.624 | Lr --> 0.017 | Seconds_per_step --> 3.321 |
[2024-09-02 23:25:25,894][Main][INFO] - [train] Step 6800 out of 65536 | Loss --> 2.764 | Grad_l2 --> 0.489 | Weights_l2 --> 7021.128 | Lr --> 0.017 | Seconds_per_step --> 3.325 |
[2024-09-02 23:30:56,990][Main][INFO] - [train] Step 6900 out of 65536 | Loss --> 2.754 | Grad_l2 --> 0.467 | Weights_l2 --> 7025.909 | Lr --> 0.017 | Seconds_per_step --> 3.311 |
[2024-09-02 23:36:28,837][Main][INFO] - [train] Step 7000 out of 65536 | Loss --> 2.716 | Grad_l2 --> 0.469 | Weights_l2 --> 7030.583 | Lr --> 0.017 | Seconds_per_step --> 3.318 |
[2024-09-02 23:42:00,897][Main][INFO] - [train] Step 7100 out of 65536 | Loss --> 2.706 | Grad_l2 --> 0.470 | Weights_l2 --> 7035.338 | Lr --> 0.017 | Seconds_per_step --> 3.321 |
[2024-09-02 23:47:31,913][Main][INFO] - [train] Step 7200 out of 65536 | Loss --> 2.685 | Grad_l2 --> 0.460 | Weights_l2 --> 7040.107 | Lr --> 0.017 | Seconds_per_step --> 3.310 |
[2024-09-02 23:53:04,028][Main][INFO] - [train] Step 7300 out of 65536 | Loss --> 2.675 | Grad_l2 --> 0.462 | Weights_l2 --> 7044.921 | Lr --> 0.017 | Seconds_per_step --> 3.321 |
[2024-09-02 23:58:35,224][Main][INFO] - [train] Step 7400 out of 65536 | Loss --> 2.670 | Grad_l2 --> 0.473 | Weights_l2 --> 7049.994 | Lr --> 0.017 | Seconds_per_step --> 3.312 |
[2024-09-03 00:04:07,495][Main][INFO] - [train] Step 7500 out of 65536 | Loss --> 2.653 | Grad_l2 --> 0.452 | Weights_l2 --> 7055.123 | Lr --> 0.018 | Seconds_per_step --> 3.323 |
[2024-09-03 00:09:39,687][Main][INFO] - [train] Step 7600 out of 65536 | Loss --> 2.644 | Grad_l2 --> 0.499 | Weights_l2 --> 7060.263 | Lr --> 0.018 | Seconds_per_step --> 3.322 |
[2024-09-03 00:15:11,125][Main][INFO] - [train] Step 7700 out of 65536 | Loss --> 2.619 | Grad_l2 --> 0.451 | Weights_l2 --> 7065.593 | Lr --> 0.018 | Seconds_per_step --> 3.314 |
[2024-09-03 00:20:43,656][Main][INFO] - [train] Step 7800 out of 65536 | Loss --> 2.611 | Grad_l2 --> 0.444 | Weights_l2 --> 7071.016 | Lr --> 0.018 | Seconds_per_step --> 3.325 |
[2024-09-03 00:26:15,825][Main][INFO] - [train] Step 7900 out of 65536 | Loss --> 2.593 | Grad_l2 --> 0.444 | Weights_l2 --> 7076.338 | Lr --> 0.018 | Seconds_per_step --> 3.322 |
[2024-09-03 00:31:46,986][Main][INFO] - [train] Step 8000 out of 65536 | Loss --> 2.591 | Grad_l2 --> 0.707 | Weights_l2 --> 7081.619 | Lr --> 0.018 | Seconds_per_step --> 3.312 |
[2024-09-03 00:37:19,240][Main][INFO] - [train] Step 8100 out of 65536 | Loss --> 2.583 | Grad_l2 --> 0.504 | Weights_l2 --> 7087.303 | Lr --> 0.018 | Seconds_per_step --> 3.323 |
[2024-09-03 00:42:50,497][Main][INFO] - [train] Step 8200 out of 65536 | Loss --> 2.572 | Grad_l2 --> 0.435 | Weights_l2 --> 7092.976 | Lr --> 0.018 | Seconds_per_step --> 3.313 |
[2024-09-03 00:48:22,669][Main][INFO] - [train] Step 8300 out of 65536 | Loss --> 2.550 | Grad_l2 --> 0.444 | Weights_l2 --> 7098.242 | Lr --> 0.018 | Seconds_per_step --> 3.322 |
[2024-09-03 00:53:54,859][Main][INFO] - [train] Step 8400 out of 65536 | Loss --> 2.533 | Grad_l2 --> 0.424 | Weights_l2 --> 7103.870 | Lr --> 0.018 | Seconds_per_step --> 3.322 |
[2024-09-03 00:59:25,959][Main][INFO] - [train] Step 8500 out of 65536 | Loss --> 2.520 | Grad_l2 --> 0.415 | Weights_l2 --> 7109.426 | Lr --> 0.019 | Seconds_per_step --> 3.311 |
[2024-09-03 01:04:58,102][Main][INFO] - [train] Step 8600 out of 65536 | Loss --> 2.512 | Grad_l2 --> 0.445 | Weights_l2 --> 7115.243 | Lr --> 0.019 | Seconds_per_step --> 3.321 |
[2024-09-03 01:10:30,308][Main][INFO] - [train] Step 8700 out of 65536 | Loss --> 2.497 | Grad_l2 --> 0.416 | Weights_l2 --> 7120.917 | Lr --> 0.019 | Seconds_per_step --> 3.322 |
[2024-09-03 01:16:01,412][Main][INFO] - [train] Step 8800 out of 65536 | Loss --> 2.503 | Grad_l2 --> 0.453 | Weights_l2 --> 7127.067 | Lr --> 0.019 | Seconds_per_step --> 3.311 |
[2024-09-03 01:21:33,679][Main][INFO] - [train] Step 8900 out of 65536 | Loss --> 2.498 | Grad_l2 --> 0.519 | Weights_l2 --> 7133.268 | Lr --> 0.019 | Seconds_per_step --> 3.323 |
[2024-09-03 01:27:05,633][Main][INFO] - [train] Step 9000 out of 65536 | Loss --> 2.480 | Grad_l2 --> 0.413 | Weights_l2 --> 7139.449 | Lr --> 0.019 | Seconds_per_step --> 3.320 |
[2024-09-03 01:32:36,839][Main][INFO] - [train] Step 9100 out of 65536 | Loss --> 2.488 | Grad_l2 --> 0.429 | Weights_l2 --> 7145.663 | Lr --> 0.019 | Seconds_per_step --> 3.312 |
[2024-09-03 01:38:09,090][Main][INFO] - [train] Step 9200 out of 65536 | Loss --> 2.458 | Grad_l2 --> 0.651 | Weights_l2 --> 7151.751 | Lr --> 0.019 | Seconds_per_step --> 3.322 |
[2024-09-03 01:43:40,183][Main][INFO] - [train] Step 9300 out of 65536 | Loss --> 2.481 | Grad_l2 --> 0.667 | Weights_l2 --> 7157.979 | Lr --> 0.019 | Seconds_per_step --> 3.311 |
[2024-09-03 01:49:12,323][Main][INFO] - [train] Step 9400 out of 65536 | Loss --> 2.454 | Grad_l2 --> 0.500 | Weights_l2 --> 7164.722 | Lr --> 0.019 | Seconds_per_step --> 3.321 |
[2024-09-03 01:54:44,360][Main][INFO] - [train] Step 9500 out of 65536 | Loss --> 2.434 | Grad_l2 --> 0.434 | Weights_l2 --> 7171.100 | Lr --> 0.020 | Seconds_per_step --> 3.320 |
[2024-09-03 02:00:15,384][Main][INFO] - [train] Step 9600 out of 65536 | Loss --> 2.430 | Grad_l2 --> 0.459 | Weights_l2 --> 7177.669 | Lr --> 0.020 | Seconds_per_step --> 3.310 |
[2024-09-03 02:05:47,653][Main][INFO] - [train] Step 9700 out of 65536 | Loss --> 2.435 | Grad_l2 --> 0.458 | Weights_l2 --> 7184.407 | Lr --> 0.020 | Seconds_per_step --> 3.323 |
[2024-09-03 02:11:19,839][Main][INFO] - [train] Step 9800 out of 65536 | Loss --> 2.431 | Grad_l2 --> 0.796 | Weights_l2 --> 7190.992 | Lr --> 0.020 | Seconds_per_step --> 3.322 |
[2024-09-03 02:16:50,929][Main][INFO] - [train] Step 9900 out of 65536 | Loss --> 2.403 | Grad_l2 --> 0.782 | Weights_l2 --> 7197.863 | Lr --> 0.020 | Seconds_per_step --> 3.311 |
[2024-09-03 02:22:23,236][Main][INFO] - [train] Step 10000 out of 65536 | Loss --> 2.445 | Grad_l2 --> 1.140 | Weights_l2 --> 7204.637 | Lr --> 0.020 | Seconds_per_step --> 3.323 |
[2024-09-03 02:22:23,238][accelerate.accelerator][INFO] - Saving current state to checkpoint-pt-10000
[2024-09-03 02:22:23,245][accelerate.utils.other][WARNING] - Removed shared tensor {'lm_head.weight'} while saving. This should be OK, but check by verifying that you don't receive any warning while reloading
[2024-09-03 02:22:29,395][accelerate.checkpointing][INFO] - Model weights saved in checkpoint-pt-10000/model.safetensors
[2024-09-03 02:22:38,780][accelerate.checkpointing][INFO] - Optimizer state saved in checkpoint-pt-10000/optimizer.bin
[2024-09-03 02:22:38,784][accelerate.checkpointing][INFO] - Scheduler state saved in checkpoint-pt-10000/scheduler.bin
[2024-09-03 02:22:38,784][accelerate.checkpointing][INFO] - Sampler state for dataloader 0 saved in checkpoint-pt-10000/sampler.bin
[2024-09-03 02:22:38,785][accelerate.checkpointing][INFO] - Sampler state for dataloader 1 saved in checkpoint-pt-10000/sampler_1.bin
[2024-09-03 02:22:38,790][accelerate.checkpointing][INFO] - Random states saved in checkpoint-pt-10000/random_states_0.pkl
[2024-09-03 02:28:09,713][Main][INFO] - [train] Step 10100 out of 65536 | Loss --> 2.441 | Grad_l2 --> 1.063 | Weights_l2 --> 7212.671 | Lr --> 0.020 | Seconds_per_step --> 3.465 |
[2024-09-03 02:33:42,096][Main][INFO] - [train] Step 10200 out of 65536 | Loss --> 2.421 | Grad_l2 --> 1.135 | Weights_l2 --> 7219.539 | Lr --> 0.020 | Seconds_per_step --> 3.324 |
[2024-09-03 02:39:14,331][Main][INFO] - [train] Step 10300 out of 65536 | Loss --> 2.408 | Grad_l2 --> 1.377 | Weights_l2 --> 7226.397 | Lr --> 0.020 | Seconds_per_step --> 3.322 |
[2024-09-03 02:44:45,309][Main][INFO] - [train] Step 10400 out of 65536 | Loss --> 2.385 | Grad_l2 --> 1.568 | Weights_l2 --> 7232.973 | Lr --> 0.020 | Seconds_per_step --> 3.310 |
[2024-09-03 02:50:17,356][Main][INFO] - [train] Step 10500 out of 65536 | Loss --> 2.383 | Grad_l2 --> 5.267 | Weights_l2 --> 7238.788 | Lr --> 0.020 | Seconds_per_step --> 3.320 |
[2024-09-03 02:55:49,191][Main][INFO] - [train] Step 10600 out of 65536 | Loss --> 51.695 | Grad_l2 --> 2316.455 | Weights_l2 --> 7233.899 | Lr --> 0.020 | Seconds_per_step --> 3.318 |
[2024-09-03 03:01:20,350][Main][INFO] - [train] Step 10700 out of 65536 | Loss --> 19.189 | Grad_l2 --> 206.407 | Weights_l2 --> 7221.798 | Lr --> 0.020 | Seconds_per_step --> 3.312 |
[2024-09-03 03:06:52,743][Main][INFO] - [train] Step 10800 out of 65536 | Loss --> 6.908 | Grad_l2 --> 26.249 | Weights_l2 --> 7210.980 | Lr --> 0.020 | Seconds_per_step --> 3.324 |
[2024-09-03 03:12:23,733][Main][INFO] - [train] Step 10900 out of 65536 | Loss --> 42.736 | Grad_l2 --> 1292.659 | Weights_l2 --> 7206.464 | Lr --> 0.020 | Seconds_per_step --> 3.310 |
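
Between step 10500 and step 10600 the run diverges: Loss jumps from 2.383 to 51.695 and Grad_l2 from 5.267 to 2316.455, just as the 10000-step warmup hands over to the cosine schedule near the 0.02 peak learning rate, and it never recovers before the interrupt below. A hedged sketch of one common guard, not the repository's actual training loop; the 50.0 skip threshold is an illustrative assumption, not a tuned value:

    import torch

    def guarded_step(accelerator, model, optimizer, lr_scheduler, loss):
        accelerator.backward(loss)
        # clip_grad_norm_ applies the config's grad_clip=1.0 and returns the
        # total gradient norm observed before clipping.
        grad_norm = accelerator.clip_grad_norm_(model.parameters(), max_norm=1.0)
        if torch.isfinite(grad_norm) and grad_norm < 50.0:
            optimizer.step()
            lr_scheduler.step()
        # Drop the gradients either way; a pathological step is skipped entirely.
        optimizer.zero_grad(set_to_none=True)
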
Traceback (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/workspace/nanoT5/nanoT5/main.py", line 92, in <module>
    main()
  File "/usr/local/lib/python3.11/dist-packages/hydra/main.py", line 94, in decorated_main
    _run_hydra(
  File "/usr/local/lib/python3.11/dist-packages/hydra/_internal/utils.py", line 394, in _run_hydra
    _run_app(
  File "/usr/local/lib/python3.11/dist-packages/hydra/_internal/utils.py", line 457, in _run_app
    run_and_report(
  File "/usr/local/lib/python3.11/dist-packages/hydra/_internal/utils.py", line 220, in run_and_report
    return func()
           ^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/hydra/_internal/utils.py", line 458, in <lambda>
    lambda: hydra.run(
            ^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/hydra/_internal/hydra.py", line 119, in run
    ret = run_job(
          ^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/hydra/core/utils.py", line 186, in run_job
    ret.return_value = task_function(task_cfg)
                       ^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/nanoT5/nanoT5/main.py", line 75, in main
    train(
  File "/workspace/nanoT5/nanoT5/utils/train_utils.py", line 197, in train
    for batch_id, batch in enumerate(train_dataloader, start=1):
  File "/usr/local/lib/python3.11/dist-packages/accelerate/data_loader.py", line 685, in __iter__
    batch = send_to_device(batch, self.state.device, non_blocking=self._non_blocking)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/accelerate/utils/operations.py", line 183, in send_to_device
    {
  File "/usr/local/lib/python3.11/dist-packages/accelerate/utils/operations.py", line 184, in <dictcomp>
    k: t if k in skip_keys else send_to_device(t, device, non_blocking=non_blocking, skip_keys=skip_keys)
       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/accelerate/utils/operations.py", line 155, in send_to_device
    return tensor.to(device, non_blocking=non_blocking)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt
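
The traceback records a manual interrupt (Ctrl-C) caught while the dataloader was copying a batch to the GPU; nothing failed inside the model itself. Since checkpoint-pt-10000 was written before the blow-up at step 10600, it is the natural restart point. A minimal resume sketch, assuming the model, optimizer, dataloaders and scheduler have already been passed through accelerator.prepare(...) as accelerate requires:

    from accelerate import Accelerator

    accelerator = Accelerator()
    # model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(...)

    # Restores model weights, optimizer/scheduler state, sampler positions and
    # RNG states saved by the save_state() calls in the log above.
    accelerator.load_state("checkpoint-pt-10000")
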
checkpoints/wandb/run-20240902_170304-v43qltex/files/requirements.txt
ADDED
@@ -0,0 +1,195 @@
GitPython==3.1.43
Jinja2==3.1.4
MarkupSafe==2.1.5
PyGObject==3.42.1
PyJWT==2.3.0
PyYAML==6.0.2
Pygments==2.18.0
SecretStorage==3.3.1
Send2Trash==1.8.3
absl-py==2.1.0
accelerate==0.33.0
aiohappyeyeballs==2.4.0
aiohttp==3.10.5
aiosignal==1.3.1
antlr4-python3-runtime==4.9.3
anyio==4.4.0
argon2-cffi-bindings==21.2.0
argon2-cffi==23.1.0
arrow==1.3.0
asttokens==2.4.1
async-lru==2.0.4
attrs==24.2.0
babel==2.16.0
beautifulsoup4==4.12.3
bleach==6.1.0
blinker==1.4
certifi==2024.7.4
cffi==1.17.0
charset-normalizer==3.3.2
click==8.1.7
comm==0.2.2
cryptography==3.4.8
datasets==2.21.0
dbus-python==1.2.18
debugpy==1.8.5
decorator==5.1.1
defusedxml==0.7.1
dill==0.3.8
distro==1.7.0
docker-pycreds==0.4.0
einops==0.8.0
entrypoints==0.4
evaluate==0.4.2
executing==2.0.1
fancycompleter==0.9.1
fastjsonschema==2.20.0
filelock==3.15.4
flash-attn==2.6.3
fqdn==1.5.1
frozenlist==1.4.1
fsspec==2024.6.1
gitdb==4.0.11
h11==0.14.0
httpcore==1.0.5
httplib2==0.20.2
httpx==0.27.0
huggingface-hub==0.24.6
hydra-core==1.3.2
idna==3.7
importlib-metadata==4.6.4
ipykernel==6.29.5
ipython-genutils==0.2.0
ipython==8.26.0
ipywidgets==8.1.3
isoduration==20.11.0
jedi==0.19.1
jeepney==0.7.1
joblib==1.4.2
json5==0.9.25
jsonpointer==3.0.0
jsonschema-specifications==2023.12.1
jsonschema==4.23.0
jupyter-archive==3.4.0
jupyter-events==0.10.0
jupyter-highlight-selected-word==0.2.0
jupyter-lsp==2.2.5
jupyter_client==7.4.9
jupyter_contrib_core==0.4.2
jupyter_contrib_nbextensions==0.7.0
jupyter_core==5.7.2
jupyter_nbextensions_configurator==0.6.4
jupyter_server==2.14.2
jupyter_server_terminals==0.5.3
jupyterlab==4.2.4
jupyterlab_pygments==0.3.0
jupyterlab_server==2.27.3
jupyterlab_widgets==3.0.11
keyring==23.5.0
launchpadlib==1.10.16
lazr.restfulclient==0.14.4
lazr.uri==1.0.6
liger-kernel==0.2.1
lxml==5.3.0
matplotlib-inline==0.1.7
mistune==3.0.2
more-itertools==8.10.0
mpmath==1.3.0
multidict==6.0.5
multiprocess==0.70.16
nbclassic==1.1.0
nbclient==0.10.0
nbconvert==7.16.4
nbformat==5.10.4
nest-asyncio==1.6.0
networkx==3.3
nltk==3.9.1
notebook==6.5.5
notebook_shim==0.2.4
numpy==1.26.4
nvidia-cublas-cu12==12.1.3.1
nvidia-cuda-cupti-cu12==12.1.105
nvidia-cuda-nvrtc-cu12==12.1.105
nvidia-cuda-runtime-cu12==12.1.105
nvidia-cudnn-cu12==9.1.0.70
nvidia-cufft-cu12==11.0.2.54
nvidia-curand-cu12==10.3.2.106
nvidia-cusolver-cu12==11.4.5.107
nvidia-cusparse-cu12==12.1.0.106
nvidia-nccl-cu12==2.20.5
nvidia-nvjitlink-cu12==12.6.20
nvidia-nvtx-cu12==12.1.105
oauthlib==3.2.0
omegaconf==2.3.0
overrides==7.7.0
packaging==24.1
pandas==2.2.2
pandocfilters==1.5.1
parso==0.8.4
pdbpp==0.10.3
pexpect==4.9.0
pillow==10.4.0
pip==24.2
platformdirs==4.2.2
prometheus_client==0.20.0
prompt_toolkit==3.0.47
protobuf==3.20.3
psutil==6.0.0
ptyprocess==0.7.0
pure_eval==0.2.3
pyarrow==17.0.0
pycparser==2.22
pynvml==11.5.3
pyparsing==2.4.7
pyrepl==0.9.0
python-apt==2.4.0+ubuntu3
python-dateutil==2.9.0.post0
python-json-logger==2.0.7
pytz==2024.1
pyzmq==24.0.1
referencing==0.35.1
regex==2024.7.24
requests==2.32.3
rfc3339-validator==0.1.4
rfc3986-validator==0.1.1
rouge_score==0.1.2
rpds-py==0.20.0
safetensors==0.4.4
sentencepiece==0.2.0
sentry-sdk==2.13.0
setproctitle==1.3.3
setuptools==73.0.1
six==1.16.0
smmap==5.0.1
sniffio==1.3.1
soupsieve==2.6
stack-data==0.6.3
sympy==1.13.2
terminado==0.18.1
tinycss2==1.3.0
tokenizers==0.19.1
torch==2.4.0
torchaudio==2.4.0
torchvision==0.19.0
tornado==6.4.1
tqdm==4.66.5
traitlets==5.14.3
transformers==4.44.2
triton==3.0.0
types-python-dateutil==2.9.0.20240821
typing_extensions==4.12.2
tzdata==2024.1
uri-template==1.3.0
urllib3==2.2.2
wadllib==1.3.6
wandb==0.17.8
wcwidth==0.2.13
webcolors==24.8.0
webencodings==0.5.1
websocket-client==1.8.0
wheel==0.44.0
widgetsnbextension==4.0.11
wmctrl==0.5
xxhash==3.5.0
yarl==1.9.7
zipp==1.0.0
checkpoints/wandb/run-20240902_170304-v43qltex/files/wandb-metadata.json
ADDED
@@ -0,0 +1,527 @@
{
    "os": "Linux-5.4.0-171-generic-x86_64-with-glibc2.35",
    "python": "3.11.9",
    "heartbeatAt": "2024-09-02T17:03:05.625568",
    "startedAt": "2024-09-02T17:03:04.729800",
    "docker": null,
    "cuda": null,
    "args": [],
    "state": "running",
    "program": "-m nanoT5.main",
    "codePathLocal": null,
    "git": {
        "remote": "https://github.com/pszemraj/nanoT5.git",
        "commit": "7e55b4db2270303afebba4e0d389b68979943c0c"
    },
    "email": null,
    "root": "/workspace/nanoT5",
    "host": "f8d7d6f6310f",
    "username": "root",
    "executable": "/usr/bin/python",
    "cpu_count": 48,
    "cpu_count_logical": 96,
    "cpu_freq": {"current": 1001.9331562499998, "min": 800.0, "max": 2801.0},
    "cpu_freq_per_core": [
        {"current": 900.0, "min": 800.0, "max": 2801.0},
        {"current": 799.402, "min": 800.0, "max": 2801.0},
        {"current": 800.0, "min": 800.0, "max": 2801.0},
        {"current": 799.398, "min": 800.0, "max": 2801.0},
        {"current": 800.0, "min": 800.0, "max": 2801.0},
        {"current": 1037.385, "min": 800.0, "max": 2801.0},
        {"current": 800.0, "min": 800.0, "max": 2801.0},
        {"current": 872.196, "min": 800.0, "max": 2801.0},
        {"current": 800.0, "min": 800.0, "max": 2801.0},
        {"current": 1072.759, "min": 800.0, "max": 2801.0},
        {"current": 882.854, "min": 800.0, "max": 2801.0},
        {"current": 823.861, "min": 800.0, "max": 2801.0},
        {"current": 823.212, "min": 800.0, "max": 2801.0},
        {"current": 800.0, "min": 800.0, "max": 2801.0},
        {"current": 799.39, "min": 800.0, "max": 2801.0},
        {"current": 823.065, "min": 800.0, "max": 2801.0},
        {"current": 1398.392, "min": 800.0, "max": 2801.0},
        {"current": 799.401, "min": 800.0, "max": 2801.0},
        {"current": 800.0, "min": 800.0, "max": 2801.0},
        {"current": 824.357, "min": 800.0, "max": 2801.0},
        {"current": 800.0, "min": 800.0, "max": 2801.0},
        {"current": 2022.695, "min": 800.0, "max": 2801.0},
        {"current": 3300.0, "min": 800.0, "max": 2801.0},
        {"current": 800.0, "min": 800.0, "max": 2801.0},
        {"current": 823.663, "min": 800.0, "max": 2801.0},
        {"current": 800.0, "min": 800.0, "max": 2801.0},
        {"current": 804.641, "min": 800.0, "max": 2801.0},
        {"current": 873.253, "min": 800.0, "max": 2801.0},
        {"current": 872.742, "min": 800.0, "max": 2801.0},
        {"current": 800.0, "min": 800.0, "max": 2801.0},
        {"current": 800.0, "min": 800.0, "max": 2801.0},
        {"current": 799.845, "min": 800.0, "max": 2801.0},
        {"current": 799.422, "min": 800.0, "max": 2801.0},
        {"current": 800.0, "min": 800.0, "max": 2801.0},
        {"current": 900.0, "min": 800.0, "max": 2801.0},
        {"current": 799.957, "min": 800.0, "max": 2801.0},
        {"current": 1100.0, "min": 800.0, "max": 2801.0},
        {"current": 3351.928, "min": 800.0, "max": 2801.0},
        {"current": 800.0, "min": 800.0, "max": 2801.0},
        {"current": 799.426, "min": 800.0, "max": 2801.0},
        {"current": 870.565, "min": 800.0, "max": 2801.0},
        {"current": 800.192, "min": 800.0, "max": 2801.0},
        {"current": 800.057, "min": 800.0, "max": 2801.0},
        {"current": 2799.997, "min": 800.0, "max": 2801.0},
        {"current": 800.0, "min": 800.0, "max": 2801.0},
        {"current": 959.262, "min": 800.0, "max": 2801.0},
        {"current": 2801.291, "min": 800.0, "max": 2801.0},
        {"current": 799.425, "min": 800.0, "max": 2801.0},
        {"current": 900.0, "min": 800.0, "max": 2801.0},
        {"current": 800.683, "min": 800.0, "max": 2801.0},
        {"current": 800.0, "min": 800.0, "max": 2801.0},
        {"current": 800.697, "min": 800.0, "max": 2801.0},
        {"current": 800.876, "min": 800.0, "max": 2801.0},
        {"current": 800.0, "min": 800.0, "max": 2801.0},
        {"current": 800.0, "min": 800.0, "max": 2801.0},
        {"current": 800.0, "min": 800.0, "max": 2801.0},
        {"current": 800.0, "min": 800.0, "max": 2801.0},
        {"current": 800.0, "min": 800.0, "max": 2801.0},
        {"current": 800.741, "min": 800.0, "max": 2801.0},
        {"current": 800.0, "min": 800.0, "max": 2801.0},
        {"current": 800.0, "min": 800.0, "max": 2801.0},
        {"current": 800.0, "min": 800.0, "max": 2801.0},
        {"current": 800.0, "min": 800.0, "max": 2801.0},
        {"current": 800.0, "min": 800.0, "max": 2801.0},
        {"current": 942.364, "min": 800.0, "max": 2801.0},
        {"current": 800.344, "min": 800.0, "max": 2801.0},
        {"current": 799.272, "min": 800.0, "max": 2801.0},
        {"current": 800.0, "min": 800.0, "max": 2801.0},
        {"current": 800.0, "min": 800.0, "max": 2801.0},
        {"current": 3300.0, "min": 800.0, "max": 2801.0},
        {"current": 3304.817, "min": 800.0, "max": 2801.0},
        {"current": 800.103, "min": 800.0, "max": 2801.0},
        {"current": 800.363, "min": 800.0, "max": 2801.0},
        {"current": 800.727, "min": 800.0, "max": 2801.0},
        {"current": 900.0, "min": 800.0, "max": 2801.0},
        {"current": 900.0, "min": 800.0, "max": 2801.0},
        {"current": 900.0, "min": 800.0, "max": 2801.0},
        {"current": 811.831, "min": 800.0, "max": 2801.0},
        {"current": 799.938, "min": 800.0, "max": 2801.0},
        {"current": 800.0, "min": 800.0, "max": 2801.0},
        {"current": 801.226, "min": 800.0, "max": 2801.0},
        {"current": 799.947, "min": 800.0, "max": 2801.0},
        {"current": 900.0, "min": 800.0, "max": 2801.0},
        {"current": 800.0, "min": 800.0, "max": 2801.0},
        {"current": 980.682, "min": 800.0, "max": 2801.0},
        {"current": 3308.926, "min": 800.0, "max": 2801.0},
        {"current": 801.074, "min": 800.0, "max": 2801.0},
        {"current": 800.709, "min": 800.0, "max": 2801.0},
        {"current": 804.122, "min": 800.0, "max": 2801.0},
        {"current": 801.051, "min": 800.0, "max": 2801.0},
        {"current": 805.622, "min": 800.0, "max": 2801.0},
        {"current": 2800.0, "min": 800.0, "max": 2801.0},
        {"current": 799.951, "min": 800.0, "max": 2801.0},
        {"current": 800.0, "min": 800.0, "max": 2801.0},
        {"current": 2802.488, "min": 800.0, "max": 2801.0},
        {"current": 801.049, "min": 800.0, "max": 2801.0}
    ],
    "disk": {
        "/": {"total": 200.0, "used": 1.4021186828613281}
    },
    "gpu": "NVIDIA A40",
    "gpu_count": 1,
    "gpu_devices": [{"name": "NVIDIA A40", "memory_total": 48305799168}],
    "memory": {"total": 503.5313262939453}
}
checkpoints/wandb/run-20240902_170304-v43qltex/files/wandb-summary.json
ADDED
@@ -0,0 +1 @@
{"train/loss": 42.73570467710495, "train/grad_l2": 1292.6591796875, "train/weights_l2": 7206.464049045997, "train/lr": 0.019987049260593165, "train/seconds_per_step": 3.309874153137207, "_timestamp": 1725333143.7322862, "_runtime": 36558.970363140106, "_step": 10900, "_wandb": {"runtime": 36838}}
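
The summary freezes the last values logged before shutdown (step 10900, the diverged loss of ~42.7 after 36,838 seconds of runtime), matching the tail of main.log above. A small inspection sketch, not part of the upload; the relative path assumes the repository is checked out locally:

    import json

    with open("checkpoints/wandb/run-20240902_170304-v43qltex/files/wandb-summary.json") as f:
        summary = json.load(f)

    # e.g. 10900 42.73570467710495
    print(summary["_step"], summary["train/loss"])
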
checkpoints/wandb/run-20240902_170304-v43qltex/logs/debug-internal.log
ADDED
The diff for this file is too large to render.
checkpoints/wandb/run-20240902_170304-v43qltex/logs/debug.log
ADDED
@@ -0,0 +1,28 @@
2024-09-02 17:03:04,741 INFO MainThread:6499 [wandb_setup.py:_flush():77] Current SDK version is 0.17.8
2024-09-02 17:03:04,741 INFO MainThread:6499 [wandb_setup.py:_flush():77] Configure stats pid to 6499
2024-09-02 17:03:04,741 INFO MainThread:6499 [wandb_setup.py:_flush():77] Loading settings from /root/.config/wandb/settings
2024-09-02 17:03:04,741 INFO MainThread:6499 [wandb_setup.py:_flush():77] Loading settings from /workspace/nanoT5/logs/2024-09-02/17-03-02/wandb/settings
2024-09-02 17:03:04,742 INFO MainThread:6499 [wandb_setup.py:_flush():77] Loading settings from environment variables: {}
2024-09-02 17:03:04,742 INFO MainThread:6499 [wandb_setup.py:_flush():77] Applying setup settings: {'_disable_service': False}
2024-09-02 17:03:04,742 WARNING MainThread:6499 [wandb_setup.py:_flush():77] Could not find program at -m nanoT5.main
2024-09-02 17:03:04,743 INFO MainThread:6499 [wandb_setup.py:_flush():77] Inferring run settings from compute environment: {'program_relpath': None, 'program': '-m nanoT5.main'}
2024-09-02 17:03:04,743 INFO MainThread:6499 [wandb_setup.py:_flush():77] Applying login settings: {}
2024-09-02 17:03:04,743 INFO MainThread:6499 [wandb_init.py:_log_setup():524] Logging user logs to /workspace/nanoT5/logs/2024-09-02/17-03-02/wandb/run-20240902_170304-v43qltex/logs/debug.log
2024-09-02 17:03:04,744 INFO MainThread:6499 [wandb_init.py:_log_setup():525] Logging internal logs to /workspace/nanoT5/logs/2024-09-02/17-03-02/wandb/run-20240902_170304-v43qltex/logs/debug-internal.log
2024-09-02 17:03:04,744 INFO MainThread:6499 [wandb_init.py:init():607] calling init triggers
2024-09-02 17:03:04,744 INFO MainThread:6499 [wandb_init.py:init():614] wandb.init called with sweep_config: {}
config: {'mode': 'pt', 'device': 'gpu', 'precision': 'bf16', 'eval_only': False, 'predict_only': False, 'seed': 2137, 'tokenizer': {'name': 'BEE-spoke-data/slimpajama_tok-48128-BPE-forT5'}, 'model': {'klass': 'custom_seq2seq', 'name': 'google/t5-v1_1-base', 'overwrite': None, 'add_config': None, 'checkpoint_path': '', 'random_init': True, 'compile': True}, 'data': {'input_length': 1024, 'mlm_probability': 0.15, 'mean_noise_span_length': 3.0, 'num_workers': 8}, 'optim': {'name': 'adamwscale', 'base_lr': 0.02, 'batch_size': 64, 'total_steps': 65536, 'epochs': -1, 'warmup_steps': 10000, 'lr_scheduler': 'cosine', 'weight_decay': 0.001, 'grad_clip': 1.0, 'grad_acc': 4, 'final_cosine': 1e-05}, 'eval': {'every_steps': 100000, 'steps': 500}, 'checkpoint': {'every_steps': 5000}, 'logging': {'every_steps': 100, 'grad_l2': True, 'weights_l2': True, 'use_wandb': True, 'wandb_config': {'project': 'nano-custom-seq2seq', 'entity': 'amazingvince', 'tags': ['nanoT5', 'my_tag'], 'mode': 'online'}}, 'slurm_id': 'none', 'working_dir': '/workspace/nanoT5/logs/2024-09-02/17-03-02'}
2024-09-02 17:03:04,745 INFO MainThread:6499 [wandb_init.py:init():657] starting backend
2024-09-02 17:03:04,745 INFO MainThread:6499 [wandb_init.py:init():661] setting up manager
2024-09-02 17:03:04,760 INFO MainThread:6499 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
2024-09-02 17:03:04,761 INFO MainThread:6499 [wandb_init.py:init():669] backend started and connected
2024-09-02 17:03:04,776 INFO MainThread:6499 [wandb_init.py:init():767] updated telemetry
2024-09-02 17:03:04,819 INFO MainThread:6499 [wandb_init.py:init():800] communicating run to backend with 90.0 second timeout
2024-09-02 17:03:05,519 INFO MainThread:6499 [wandb_init.py:init():851] starting run threads in backend
2024-09-02 17:03:05,817 INFO MainThread:6499 [wandb_run.py:_console_start():2463] atexit reg
2024-09-02 17:03:05,818 INFO MainThread:6499 [wandb_run.py:_redirect():2309] redirect: wrap_raw
2024-09-02 17:03:05,819 INFO MainThread:6499 [wandb_run.py:_redirect():2374] Wrapping output streams.
2024-09-02 17:03:05,819 INFO MainThread:6499 [wandb_run.py:_redirect():2399] Redirects installed.
2024-09-02 17:03:05,822 INFO MainThread:6499 [wandb_init.py:init():894] run started, returning control to user process
2024-09-02 17:03:35,512 INFO MainThread:6499 [wandb_run.py:_config_callback():1392] config_cb None None {'mode': 'pt', 'device': 'gpu', 'precision': 'bf16', 'eval_only': False, 'predict_only': False, 'seed': 2137, 'tokenizer': {'name': 'BEE-spoke-data/slimpajama_tok-48128-BPE-forT5'}, 'model': {'klass': 'custom_seq2seq', 'name': 'google/t5-v1_1-base', 'overwrite': None, 'add_config': None, 'checkpoint_path': '', 'random_init': True, 'compile': True}, 'data': {'input_length': 1024, 'mlm_probability': 0.15, 'mean_noise_span_length': 3.0, 'num_workers': 8, 'before_mask_input_length': 1137, 'target_length': 229}, 'optim': {'name': 'adamwscale', 'base_lr': 0.02, 'batch_size': 64, 'total_steps': 65536, 'epochs': -1, 'warmup_steps': 10000, 'lr_scheduler': 'cosine', 'weight_decay': 0.001, 'grad_clip': 1.0, 'grad_acc': 4, 'final_cosine': 1e-05}, 'eval': {'every_steps': 100000, 'steps': 500, 'corrected_steps': 500}, 'checkpoint': {'every_steps': 5000}, 'logging': {'every_steps': 100, 'grad_l2': True, 'weights_l2': True, 'use_wandb': True, 'wandb_config': {'project': 'nano-custom-seq2seq', 'entity': 'amazingvince', 'tags': ['nanoT5', 'my_tag'], 'mode': 'online'}}, 'slurm_id': 'none', 'working_dir': '/workspace/nanoT5/logs/2024-09-02/17-03-02', 'n_all_param': 673076736}
2024-09-03 03:17:10,763 WARNING MsgRouterThr:6499 [router.py:message_loop():77] message_loop has been closed
checkpoints/wandb/run-20240902_170304-v43qltex/run-v43qltex.wandb
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c992f1fa0f2152d9b2336fede3ebcc8b26d60d54a1945da3d2cecab44ce3ab70
size 4157001
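
The .wandb run file itself is stored under Git LFS, so the repository keeps only this three-line pointer (spec version, sha256 oid, byte size); the 4,157,001-byte binary lives in LFS storage. A hedged fetch sketch using huggingface_hub; "<namespace>/<repo>" is a placeholder, since the repo id is not stated in the diff itself:

    from huggingface_hub import hf_hub_download

    # Downloads the real binary behind the pointer into the local HF cache.
    path = hf_hub_download(
        repo_id="<namespace>/<repo>",
        filename="checkpoints/wandb/run-20240902_170304-v43qltex/run-v43qltex.wandb",
    )
    print(path)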