koichi12 committed on
Commit 09e5c81 · verified · 1 Parent(s): d3983b6

Add files using upload-large-folder tool

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. wandb/run-20240804_021032-cd2cg2ui/files/output.log +11 -0
  2. wandb/run-20240804_021032-cd2cg2ui/files/wandb-metadata.json +215 -0
  3. wandb/run-20240804_021032-cd2cg2ui/files/wandb-summary.json +1 -0
  4. wandb/run-20240804_021444-pk5j08lr/files/config.yaml +335 -0
  5. wandb/run-20240804_021444-pk5j08lr/files/output.log +103 -0
  6. wandb/run-20240804_021444-pk5j08lr/files/requirements.txt +271 -0
  7. wandb/run-20240804_021444-pk5j08lr/files/wandb-metadata.json +215 -0
  8. wandb/run-20240804_021444-pk5j08lr/files/wandb-summary.json +1 -0
  9. wandb/run-20240804_021444-pk5j08lr/logs/debug-internal.log +191 -0
  10. wandb/run-20240804_021444-pk5j08lr/logs/debug.log +30 -0
  11. wandb/run-20240804_021444-pk5j08lr/run-pk5j08lr.wandb +0 -0
  12. wandb/run-20240804_144007-dds6qqbt/files/config.yaml +335 -0
  13. wandb/run-20240804_144007-dds6qqbt/files/output.log +135 -0
  14. wandb/run-20240804_144007-dds6qqbt/files/requirements.txt +271 -0
  15. wandb/run-20240804_144007-dds6qqbt/files/wandb-metadata.json +215 -0
  16. wandb/run-20240804_144007-dds6qqbt/files/wandb-summary.json +1 -0
  17. wandb/run-20240804_144007-dds6qqbt/logs/debug-internal.log +186 -0
  18. wandb/run-20240804_144007-dds6qqbt/logs/debug.log +30 -0
  19. wandb/run-20240804_144007-dds6qqbt/run-dds6qqbt.wandb +0 -0
  20. wandb/run-20240804_222226-kh5katc1/files/config.yaml +335 -0
  21. wandb/run-20240804_222226-kh5katc1/files/output.log +468 -0
  22. wandb/run-20240804_222226-kh5katc1/files/requirements.txt +271 -0
  23. wandb/run-20240804_222226-kh5katc1/files/wandb-metadata.json +215 -0
  24. wandb/run-20240804_222226-kh5katc1/files/wandb-summary.json +1 -0
  25. wandb/run-20240804_222226-kh5katc1/logs/debug-internal.log +0 -0
  26. wandb/run-20240804_222226-kh5katc1/logs/debug.log +30 -0
  27. wandb/run-20240812_063447-whqmtxyq/files/config.yaml +335 -0
  28. wandb/run-20240812_063447-whqmtxyq/files/output.log +144 -0
  29. wandb/run-20240812_063447-whqmtxyq/files/requirements.txt +271 -0
  30. wandb/run-20240812_063447-whqmtxyq/files/wandb-metadata.json +215 -0
  31. wandb/run-20240812_063447-whqmtxyq/files/wandb-summary.json +1 -0
  32. wandb/run-20240812_063447-whqmtxyq/logs/debug-internal.log +359 -0
  33. wandb/run-20240812_063447-whqmtxyq/logs/debug.log +30 -0
  34. wandb/run-20240812_063447-whqmtxyq/run-whqmtxyq.wandb +0 -0
  35. wandb/run-20240815_031216-0szn78ph/files/config.yaml +335 -0
  36. wandb/run-20240815_031216-0szn78ph/files/output.log +92 -0
  37. wandb/run-20240815_031216-0szn78ph/files/requirements.txt +293 -0
  38. wandb/run-20240815_031216-0szn78ph/files/wandb-metadata.json +215 -0
  39. wandb/run-20240815_031216-0szn78ph/files/wandb-summary.json +1 -0
  40. wandb/run-20240815_031216-0szn78ph/logs/debug-internal.log +260 -0
  41. wandb/run-20240815_031216-0szn78ph/logs/debug.log +29 -0
  42. wandb/run-20240815_031216-0szn78ph/run-0szn78ph.wandb +0 -0
  43. wandb/run-20240823_162543-eroprw00/files/config.yaml +342 -0
  44. wandb/run-20240823_162543-eroprw00/files/output.log +116 -0
  45. wandb/run-20240823_162543-eroprw00/files/requirements.txt +375 -0
  46. wandb/run-20240823_162543-eroprw00/files/wandb-metadata.json +220 -0
  47. wandb/run-20240823_162543-eroprw00/files/wandb-summary.json +1 -0
  48. wandb/run-20240823_162543-eroprw00/logs/debug-internal.log +188 -0
  49. wandb/run-20240823_162543-eroprw00/logs/debug.log +30 -0
  50. wandb/run-20240823_162543-eroprw00/run-eroprw00.wandb +0 -0
wandb/run-20240804_021032-cd2cg2ui/files/output.log ADDED
@@ -0,0 +1,11 @@
+ Created Hugging Face repository with ID koichi12/tiny-mistral-sample.
+ Clearing GPU cache for all ranks
+ --> Running with torch torch_distributed debug set to detail
+ Traceback (most recent call last):
+ File "/project/examples/finetuning.py", line 13, in <module>
+ main()
+ File "/project/src/llama_recipes/finetuning.py", line 103, in main
+ model = get_model(
+ File "/project/src/llama_recipes/get_models.py", line 71, in get_model
+ assert sliding_window == 4096
+ AssertionError
wandb/run-20240804_021032-cd2cg2ui/files/wandb-metadata.json ADDED
@@ -0,0 +1,215 @@
+ {
+ "os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
+ "python": "3.10.12",
+ "heartbeatAt": "2024-08-03T17:10:33.458421",
+ "startedAt": "2024-08-03T17:10:32.395506",
+ "docker": null,
+ "cuda": null,
+ "args": [
+ "--seq-length",
+ "1024",
+ "--sliding-window-size",
+ "8192",
+ "--micro-batch-size",
+ "8",
+ "--global-batch-size",
+ "320",
+ "--train-iters",
+ "20000",
+ "--tokenizer-type",
+ "Llama2Tokenizer",
+ "--tokenizer-model",
+ "/share/pretrained_lm/custom/tiny-mistral/tokenizer.model.v3",
+ "--train-data-path",
+ "4013541",
+ "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
+ "--valid-data-path",
+ "4013541",
+ "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
+ "--test-data-path",
+ "4013541",
+ "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
+ "--lr",
+ "2e-5",
+ "--min-lr",
+ "1e-6",
+ "--lr-decay-style",
+ "cosine",
+ "--lr-warmup-iters",
+ "500",
+ "--lr-decay-iters",
+ "20000",
+ "--weight-decay",
+ "0.1",
+ "--grad-clip-norm",
+ "1.0",
+ "--optimizer",
+ "adam",
+ "--adam-beta1",
+ "0.9",
+ "--adam-beta2",
+ "0.95",
+ "--adam-eps",
+ "1e-6",
+ "--save-interval",
+ "200",
+ "--eval-interval",
+ "200",
+ "--eval-iters",
+ "10",
+ "--bf16",
+ "--mixed-precision",
+ "--base-model",
+ "/share/pretrained_lm/custom/tiny-mistral",
+ "--save",
+ "/work/llm_recipes/models/tiny-mistral-sample",
+ "--load",
+ "/work/llm_recipes/models/tiny-mistral-sample",
+ "--fsdp-activation-checkpointing",
+ "--sharding-strategy",
+ "FULL_SHARD",
+ "--checkpoint-type",
+ "LOCAL_STATE_DICT",
+ "--save-n-checkpoints",
+ "10",
+ "--hf-upload-retry-limit",
+ "2",
+ "--hf-repo-id",
+ "koichi12/tiny-mistral-sample",
+ "--wandb-entity",
+ "iwakawa-koichi-q5-tohoku-nlp6723",
+ "--wandb-project",
+ "llm_tutorial",
+ "--wandb-name",
+ "tiny-mistral-sample_train_2024-08-04-02:10:14"
+ ],
+ "state": "running",
+ "program": "/project/examples/finetuning.py",
+ "codePathLocal": "examples/finetuning.py",
+ "codePath": "examples/finetuning.py",
+ "git": {
+ "remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
+ "commit": "3be5353210a678dc7008f237fa16b99f2bdf36ea"
+ },
+ "email": null,
+ "root": "/project",
+ "host": "gpu-koiwa-00",
+ "username": "koiwa",
+ "executable": "/usr/bin/python",
+ "cpu_count": 18,
+ "cpu_count_logical": 18,
+ "cpu_freq": {
+ "current": 2400.034,
+ "min": 0.0,
+ "max": 0.0
+ },
+ "cpu_freq_per_core": [
+ {
+ "current": 2400.034,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2400.034,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2400.034,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2400.034,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2400.034,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2400.034,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2400.034,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2400.034,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2400.034,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2400.034,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2400.034,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2400.034,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2400.034,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2400.034,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2400.034,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2400.034,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2400.034,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2400.034,
+ "min": 0.0,
+ "max": 0.0
+ }
+ ],
+ "disk": {
+ "/": {
+ "total": 0.0625,
+ "used": 1.1444091796875e-05
+ }
+ },
+ "gpu": "NVIDIA A100-SXM4-40GB",
+ "gpu_count": 1,
+ "gpu_devices": [
+ {
+ "name": "NVIDIA A100-SXM4-40GB",
+ "memory_total": 42949672960
+ }
+ ],
+ "memory": {
+ "total": 56.48782730102539
+ }
+ }
wandb/run-20240804_021032-cd2cg2ui/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
+ {"_wandb": {"runtime": 0}}
wandb/run-20240804_021444-pk5j08lr/files/config.yaml ADDED
@@ -0,0 +1,335 @@
1
+ wandb_version: 1
2
+
3
+ sharding_strategy:
4
+ desc: null
5
+ value: FULL_SHARD
6
+ checkpoint_type:
7
+ desc: null
8
+ value: LOCAL_STATE_DICT
9
+ fsdp_activation_checkpointing:
10
+ desc: null
11
+ value: true
12
+ fsdp_cpu_offload:
13
+ desc: null
14
+ value: false
15
+ low_cpu_fsdp:
16
+ desc: null
17
+ value: false
18
+ no_meta_device:
19
+ desc: null
20
+ value: false
21
+ data_path:
22
+ desc: null
23
+ value: null
24
+ split:
25
+ desc: null
26
+ value: 969, 30, 1
27
+ train_data_path:
28
+ desc: null
29
+ value:
30
+ - '4013541'
31
+ - /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
32
+ valid_data_path:
33
+ desc: null
34
+ value:
35
+ - '4013541'
36
+ - /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
37
+ test_data_path:
38
+ desc: null
39
+ value:
40
+ - '4013541'
41
+ - /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
42
+ data_cache_path:
43
+ desc: null
44
+ value: null
45
+ vocab_size:
46
+ desc: null
47
+ value: null
48
+ vocab_file:
49
+ desc: null
50
+ value: null
51
+ merge_file:
52
+ desc: null
53
+ value: null
54
+ seq_length:
55
+ desc: null
56
+ value: 1024
57
+ num_workers:
58
+ desc: null
59
+ value: 2
60
+ tokenizer_type:
61
+ desc: null
62
+ value: Llama2Tokenizer
63
+ tokenizer_model:
64
+ desc: null
65
+ value: /share/pretrained_lm/custom/tiny-mistral/tokenizer.model.v3
66
+ reset_position_ids:
67
+ desc: null
68
+ value: false
69
+ reset_attention_mask:
70
+ desc: null
71
+ value: false
72
+ eod_mask_loss:
73
+ desc: null
74
+ value: false
75
+ retro_return_doc_ids:
76
+ desc: null
77
+ value: false
78
+ short_seq_prob:
79
+ desc: null
80
+ value: 0.1
81
+ vocab_extra_ids:
82
+ desc: null
83
+ value: 0
84
+ seed:
85
+ desc: null
86
+ value: 1234
87
+ use_mpi:
88
+ desc: null
89
+ value: false
90
+ wandb_entity:
91
+ desc: null
92
+ value: iwakawa-koichi-q5-tohoku-nlp6723
93
+ wandb_name:
94
+ desc: null
95
+ value: tiny-mistral-sample_train_2024-08-04-02:14:34
96
+ wandb_project:
97
+ desc: null
98
+ value: llm_tutorial
99
+ quantization:
100
+ desc: null
101
+ value: false
102
+ use_freeze_layers:
103
+ desc: null
104
+ value: false
105
+ freeze_layers:
106
+ desc: null
107
+ value: null
108
+ bf16:
109
+ desc: null
110
+ value: true
111
+ fp16:
112
+ desc: null
113
+ value: false
114
+ mixed_precision:
115
+ desc: null
116
+ value: true
117
+ param_dtype:
118
+ desc: null
119
+ value: null
120
+ load:
121
+ desc: null
122
+ value: /work/llm_recipes/models/tiny-mistral-sample
123
+ save:
124
+ desc: null
125
+ value: /work/llm_recipes/models/tiny-mistral-sample
126
+ base_model:
127
+ desc: null
128
+ value: /share/pretrained_lm/custom/tiny-mistral
129
+ use_better_transformer:
130
+ desc: null
131
+ value: false
132
+ grad_clip_norm:
133
+ desc: null
134
+ value: 1.0
135
+ eval_interval:
136
+ desc: null
137
+ value: 200
138
+ save_interval:
139
+ desc: null
140
+ value: 200
141
+ eval_iters:
142
+ desc: null
143
+ value: 10
144
+ optimizer:
145
+ desc: null
146
+ value: adam
147
+ lr:
148
+ desc: null
149
+ value: 2.0e-05
150
+ lr_decay_style:
151
+ desc: null
152
+ value: cosine
153
+ lr_decay_iters:
154
+ desc: null
155
+ value: 20000
156
+ lr_warmup_iters:
157
+ desc: null
158
+ value: 500
159
+ min_lr:
160
+ desc: null
161
+ value: 1.0e-06
162
+ train_iters:
163
+ desc: null
164
+ value: 20000
165
+ train_samples:
166
+ desc: null
167
+ value: null
168
+ global_batch_size:
169
+ desc: null
170
+ value: 320
171
+ micro_batch_size:
172
+ desc: null
173
+ value: 8
174
+ make_vocab_size_divisible_by:
175
+ desc: null
176
+ value: 128
177
+ sliding_window_size:
178
+ desc: null
179
+ value: 4096
180
+ skip_batch:
181
+ desc: null
182
+ value: null
183
+ no_save_optimizer_state:
184
+ desc: null
185
+ value: false
186
+ continual_pretraining:
187
+ desc: null
188
+ value: false
189
+ instruction_tuning:
190
+ desc: null
191
+ value: false
192
+ direct_preference_optimization:
193
+ desc: null
194
+ value: false
195
+ attention_dropout:
196
+ desc: null
197
+ value: 0.1
198
+ hidden_dropout:
199
+ desc: null
200
+ value: 0.1
201
+ weight_decay:
202
+ desc: null
203
+ value: 0.1
204
+ adam_beta1:
205
+ desc: null
206
+ value: 0.9
207
+ adam_beta2:
208
+ desc: null
209
+ value: 0.95
210
+ adam_eps:
211
+ desc: null
212
+ value: 1.0e-06
213
+ hf_transformer_model_dir:
214
+ desc: null
215
+ value: null
216
+ instruction_train_data_path:
217
+ desc: null
218
+ value: null
219
+ instruction_valid_data_path:
220
+ desc: null
221
+ value: null
222
+ epoch:
223
+ desc: null
224
+ value: null
225
+ instruction_dataset_size:
226
+ desc: null
227
+ value: null
228
+ save_sampler_state:
229
+ desc: null
230
+ value: false
231
+ label_smoothing:
232
+ desc: null
233
+ value: 0.0
234
+ save_n_checkpoints:
235
+ desc: null
236
+ value: 10
237
+ hf_repo_id:
238
+ desc: null
239
+ value: koichi12/tiny-mistral-sample
240
+ create_public_hf_repo:
241
+ desc: null
242
+ value: false
243
+ upload_all_checkpoints_to_hf:
244
+ desc: null
245
+ value: false
246
+ hf_upload_retry_limit:
247
+ desc: null
248
+ value: 2
249
+ exit_duration_in_mins:
250
+ desc: null
251
+ value: null
252
+ source_key:
253
+ desc: null
254
+ value: null
255
+ target_key:
256
+ desc: null
257
+ value: null
258
+ attn_implementation:
259
+ desc: null
260
+ value: flash_attention_2
261
+ efficient_instruction_tuning:
262
+ desc: null
263
+ value: false
264
+ remove_padding_masking:
265
+ desc: null
266
+ value: false
267
+ save_start_iter:
268
+ desc: null
269
+ value: null
270
+ rank:
271
+ desc: null
272
+ value: 0
273
+ world_size:
274
+ desc: null
275
+ value: 1
276
+ padded_vocab_size:
277
+ desc: null
278
+ value: 32768
279
+ gradient_accumulation_steps:
280
+ desc: null
281
+ value: 40
282
+ _wandb:
283
+ desc: null
284
+ value:
285
+ python_version: 3.10.12
286
+ cli_version: 0.16.3
287
+ framework: huggingface
288
+ huggingface_version: 4.43.3
289
+ is_jupyter_run: false
290
+ is_kaggle_kernel: false
291
+ start_time: 1722705284.714592
292
+ t:
293
+ 1:
294
+ - 1
295
+ - 11
296
+ - 49
297
+ - 55
298
+ - 71
299
+ 2:
300
+ - 1
301
+ - 11
302
+ - 49
303
+ - 55
304
+ - 71
305
+ 3:
306
+ - 13
307
+ - 16
308
+ - 23
309
+ 4: 3.10.12
310
+ 5: 0.16.3
311
+ 6: 4.43.3
312
+ 8:
313
+ - 5
314
+ 13: linux-x86_64
315
+ activation_function:
316
+ desc: null
317
+ value: silu
318
+ hidden_size:
319
+ desc: null
320
+ value: 256
321
+ model_type:
322
+ desc: null
323
+ value: mistral
324
+ max_position_embeddings:
325
+ desc: null
326
+ value: 1024
327
+ num_attention_heads:
328
+ desc: null
329
+ value: 4
330
+ num_hidden_layers:
331
+ desc: null
332
+ value: 4
333
+ model_architecture:
334
+ desc: null
335
+ value: MistralForCausalLM
wandb/run-20240804_021444-pk5j08lr/files/output.log ADDED
@@ -0,0 +1,103 @@
+ Created Hugging Face repository with ID koichi12/tiny-mistral-sample.
+ Clearing GPU cache for all ranks
+ --> Running with torch torch_distributed debug set to detail
+ You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
+ Loading model state dict from /work/llm_recipes/models/tiny-mistral-sample/iter_0020000/model.pt
+ Loaded model state dict from /work/llm_recipes/models/tiny-mistral-sample/iter_0020000/model.pt
+ --> Model /share/pretrained_lm/custom/tiny-mistral
+ --> /share/pretrained_lm/custom/tiny-mistral has 19.925248 Million params
+ BFloat16 enabled for mixed precision - using bfSixteen policy
+ --> applying fsdp activation checkpointing...
+ > datasets target sizes (minimum size):
+ train: 6400000
+ validation: 323200
+ test: 3200
+ > building train, validation, and test datasets for GPT ...
+ /usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_init_utils.py:441: UserWarning: FSDP is switching to use `NO_SHARD` instead of ShardingStrategy.FULL_SHARD since the world size is 1.
+ warnings.warn(
+ Let split = None
+ > finished creating GPT datasets ...
+ Loading optimizer state dict from /work/llm_recipes/models/tiny-mistral-sample/iter_0020000/optimizer.pt
+ Loaded optimizer state dict from /work/llm_recipes/models/tiny-mistral-sample/iter_0020000/optimizer.pt
+ model info: FullyShardedDataParallel(
+ (_fsdp_wrapped_module): MistralForCausalLM(
+ (model): MistralModel(
+ (embed_tokens): Embedding(32768, 256)
+ (layers): ModuleList(
+ (0-3): 4 x FullyShardedDataParallel(
+ (_fsdp_wrapped_module): CheckpointWrapper(
+ (_checkpoint_wrapped_module): MistralDecoderLayer(
+ (self_attn): MistralFlashAttention2(
+ (q_proj): Linear(in_features=256, out_features=512, bias=False)
+ (k_proj): Linear(in_features=256, out_features=256, bias=False)
+ (v_proj): Linear(in_features=256, out_features=256, bias=False)
+ (o_proj): Linear(in_features=512, out_features=256, bias=False)
+ (rotary_emb): MistralRotaryEmbedding()
+ )
+ (mlp): MistralMLP(
+ (gate_proj): Linear(in_features=256, out_features=512, bias=False)
+ (up_proj): Linear(in_features=256, out_features=512, bias=False)
+ (down_proj): Linear(in_features=512, out_features=256, bias=False)
+ (act_fn): SiLU()
+ )
+ (input_layernorm): MistralRMSNorm()
+ (post_attention_layernorm): MistralRMSNorm()
+ )
+ )
+ )
+ )
+ (norm): MistralRMSNorm()
+ )
+ (lm_head): Linear(in_features=256, out_features=32768, bias=False)
+ )
+ )
+ model config: MistralConfig {
+ "_name_or_path": "/share/pretrained_lm/custom/tiny-mistral",
+ "architectures": [
+ "MistralForCausalLM"
+ ],
+ "attention_dropout": 0.0,
+ "bos_token_id": 1,
+ "eos_token_id": 2,
+ "head_dim": 128,
+ "hidden_act": "silu",
+ "hidden_size": 256,
+ "initializer_range": 0.02,
+ "intermediate_size": 512,
+ "label_smoothing": 0.0,
+ "max_position_embeddings": 1024,
+ "model_type": "mistral",
+ "num_attention_heads": 4,
+ "num_hidden_layers": 4,
+ "num_key_value_heads": 2,
+ "rms_norm_eps": 1e-05,
+ "rope_theta": 1000000.0,
+ "sliding_window": 4096,
+ "tie_word_embeddings": false,
+ "torch_dtype": "float32",
+ "transformers_version": "4.43.3",
+ "use_cache": false,
+ "vocab_size": 32768
+ }
+ Saving checkpoint to /work/llm_recipes/models/tiny-mistral-sample/iter_0020000
+ Saving model state dict to /work/llm_recipes/models/tiny-mistral-sample/iter_0020000/model.pt
+ Saved model state dict to /work/llm_recipes/models/tiny-mistral-sample/iter_0020000/model.pt
+ Saving optimizer state dict to /work/llm_recipes/models/tiny-mistral-sample/iter_0020000/optimizer.pt
+ Saved optimizer state dict to /work/llm_recipes/models/tiny-mistral-sample/iter_0020000/optimizer.pt
+ Saving scheduler state dict to /work/llm_recipes/models/tiny-mistral-sample/iter_0020000/scheduler.pt
+ Saved scheduler state dict to /work/llm_recipes/models/tiny-mistral-sample/iter_0020000/scheduler.pt
+ Saving RNG states to /work/llm_recipes/models/tiny-mistral-sample/iter_0020000/rng.pt
+ Saved RNG states to /work/llm_recipes/models/tiny-mistral-sample/iter_0020000/rng.pt
+ Saved checkpoint to /work/llm_recipes/models/tiny-mistral-sample/iter_0020000, took 0.17s
+ Building a BlendedDataset for a single MegatronDataset
+ Unable to save the indexes because path_to_cache is None
+ Building a BlendedDataset for a single MegatronDataset
+ Unable to save the indexes because path_to_cache is None
+ Building a BlendedDataset for a single MegatronDataset
+ Unable to save the indexes because path_to_cache is None
+ [rank0]:[2024-08-04 02:14:50,842] torch.distributed.fsdp._debug_utils: [WARNING] FSDP _flatten_optim_state_dict() profiling: defaultdict(<class 'float'>, {})
+ /usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_state_dict_utils.py:773: UserWarning: When using ``NO_SHARD`` for ``ShardingStrategy``, full_state_dict willbe returned.
+ warnings.warn(
+ /usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_state_dict_utils.py:716: UserWarning: When using ``NO_SHARD`` for ``ShardingStrategy``, full_state_dict willbe returned.
+ warnings.warn(
+ [rank0]:[2024-08-04 02:14:50,959] torch.distributed.fsdp._debug_utils: [WARNING] FSDP _optim_state_dict() profiling: defaultdict(<class 'float'>, {'preprocessing': 0.0010300300018570852, 'preprocessing_with_comm': 0.0005270100009511225, 'state_converting': 0.021121047997439746, <Type.ALL: 'all'>: 0.022993901999143418})
wandb/run-20240804_021444-pk5j08lr/files/requirements.txt ADDED
@@ -0,0 +1,271 @@
1
+ absl-py==2.1.0
2
+ accelerate==0.33.0
3
+ aiohttp==3.9.1
4
+ aiosignal==1.3.1
5
+ annotated-types==0.6.0
6
+ apex==0.1
7
+ appdirs==1.4.4
8
+ argon2-cffi-bindings==21.2.0
9
+ argon2-cffi==23.1.0
10
+ asttokens==2.4.1
11
+ astunparse==1.6.3
12
+ async-timeout==4.0.3
13
+ attrs==23.2.0
14
+ audioread==3.0.1
15
+ beautifulsoup4==4.12.3
16
+ bleach==6.1.0
17
+ blis==0.7.11
18
+ cachetools==5.3.2
19
+ catalogue==2.0.10
20
+ certifi==2024.2.2
21
+ cffi==1.16.0
22
+ charset-normalizer==3.3.2
23
+ click==8.1.7
24
+ cloudpathlib==0.16.0
25
+ cloudpickle==3.0.0
26
+ cmake==3.28.1
27
+ colorama==0.4.6
28
+ comm==0.2.1
29
+ confection==0.1.4
30
+ contourpy==1.2.0
31
+ cubinlinker==0.3.0+2.g405ac64
32
+ cuda-python==12.3.0rc4+9.gdb8c48a.dirty
33
+ cudf==23.12.0
34
+ cugraph-dgl==23.12.0
35
+ cugraph-service-client==23.12.0
36
+ cugraph-service-server==23.12.0
37
+ cugraph==23.12.0
38
+ cuml==23.12.0
39
+ cupy-cuda12x==12.3.0
40
+ cycler==0.12.1
41
+ cymem==2.0.8
42
+ cython==3.0.8
43
+ dask-cuda==23.12.0
44
+ dask-cudf==23.12.0
45
+ dask==2023.11.0
46
+ debugpy==1.8.1
47
+ decorator==5.1.1
48
+ defusedxml==0.7.1
49
+ distributed==2023.11.0
50
+ dm-tree==0.1.8
51
+ docker-pycreds==0.4.0
52
+ einops==0.7.0
53
+ exceptiongroup==1.2.0
54
+ execnet==2.0.2
55
+ executing==2.0.1
56
+ expecttest==0.1.3
57
+ fastjsonschema==2.19.1
58
+ fastrlock==0.8.2
59
+ filelock==3.13.1
60
+ flash-attn==2.4.2
61
+ fonttools==4.48.1
62
+ frozenlist==1.4.1
63
+ fsspec==2023.12.2
64
+ gast==0.5.4
65
+ gitdb==4.0.11
66
+ gitpython==3.1.43
67
+ google-auth-oauthlib==0.4.6
68
+ google-auth==2.27.0
69
+ graphsurgeon==0.4.6
70
+ grpcio==1.60.1
71
+ huggingface-hub==0.24.5
72
+ hypothesis==5.35.1
73
+ idna==3.6
74
+ importlib-metadata==7.0.1
75
+ iniconfig==2.0.0
76
+ intel-openmp==2021.4.0
77
+ ipadic==1.0.0
78
+ ipykernel==6.29.2
79
+ ipython-genutils==0.2.0
80
+ ipython==8.21.0
81
+ jedi==0.19.1
82
+ jinja2==3.1.3
83
+ joblib==1.3.2
84
+ json5==0.9.14
85
+ jsonnet==0.19.1
86
+ jsonschema-specifications==2023.12.1
87
+ jsonschema==4.21.1
88
+ jupyter-client==8.6.0
89
+ jupyter-core==5.7.1
90
+ jupyter-tensorboard==0.2.0
91
+ jupyterlab-pygments==0.3.0
92
+ jupyterlab-server==1.2.0
93
+ jupyterlab==2.3.2
94
+ jupytext==1.16.1
95
+ kiwisolver==1.4.5
96
+ langcodes==3.3.0
97
+ lazy-loader==0.3
98
+ librosa==0.10.1
99
+ llvmlite==0.40.1
100
+ locket==1.0.0
101
+ logzero==1.7.0
102
+ lxml==5.2.2
103
+ markdown-it-py==3.0.0
104
+ markdown==3.5.2
105
+ markupsafe==2.1.4
106
+ matplotlib-inline==0.1.6
107
+ matplotlib==3.8.2
108
+ mdit-py-plugins==0.4.0
109
+ mdurl==0.1.2
110
+ mecab-python3==1.0.6
111
+ mistune==3.0.2
112
+ mkl-devel==2021.1.1
113
+ mkl-include==2021.1.1
114
+ mkl==2021.1.1
115
+ mock==5.1.0
116
+ more-itertools==9.1.0
117
+ mpmath==1.3.0
118
+ msgpack==1.0.7
119
+ multidict==6.0.4
120
+ murmurhash==1.0.10
121
+ nbclient==0.9.0
122
+ nbconvert==7.16.0
123
+ nbformat==5.9.2
124
+ nest-asyncio==1.6.0
125
+ networkx==2.6.3
126
+ ninja==1.11.1.1
127
+ nltk==3.8.1
128
+ notebook==6.4.10
129
+ numba==0.57.1+1.g1ff679645
130
+ numpy==1.24.4
131
+ nvfuser==0.1.4a0+d0bb811
132
+ nvidia-dali-cuda120==1.34.0
133
+ nvidia-pyindex==1.0.9
134
+ nvtx==0.2.5
135
+ oauthlib==3.2.2
136
+ onnx==1.15.0rc2
137
+ opencv==4.7.0
138
+ optree==0.10.0
139
+ packaging==23.2
140
+ pandas==1.5.3
141
+ pandocfilters==1.5.1
142
+ parso==0.8.3
143
+ partd==1.4.1
144
+ peft==0.11.1
145
+ pexpect==4.9.0
146
+ pillow==10.2.0
147
+ pip==24.0
148
+ platformdirs==4.2.0
149
+ pluggy==1.4.0
150
+ ply==3.11
151
+ polygraphy==0.49.4
152
+ pooch==1.8.0
153
+ portalocker==2.10.1
154
+ preshed==3.0.9
155
+ prettytable==3.9.0
156
+ prometheus-client==0.19.0
157
+ prompt-toolkit==3.0.43
158
+ protobuf==4.24.4
159
+ psutil==5.9.4
160
+ ptxcompiler==0.8.1+2.g0d406d6
161
+ ptyprocess==0.7.0
162
+ pure-eval==0.2.2
163
+ pyarrow==14.0.1.dev0+gba5374836.d20240125
164
+ pyasn1-modules==0.3.0
165
+ pyasn1==0.5.1
166
+ pybind11-global==2.11.1
167
+ pybind11==2.11.1
168
+ pycocotools==2.0+nv0.8.0
169
+ pycparser==2.21
170
+ pydantic-core==2.16.2
171
+ pydantic==2.6.1
172
+ pygments==2.17.2
173
+ pylibcugraph==23.12.0
174
+ pylibcugraphops==23.12.0
175
+ pylibraft==23.12.0
176
+ pynvml==11.4.1
177
+ pyparsing==3.1.1
178
+ pytest-flakefinder==1.1.0
179
+ pytest-rerunfailures==13.0
180
+ pytest-shard==0.1.2
181
+ pytest-xdist==3.5.0
182
+ pytest==8.0.0
183
+ python-dateutil==2.8.2
184
+ python-dotenv==1.0.0
185
+ python-hostlist==1.23.0
186
+ pytorch-quantization==2.1.2
187
+ pytz==2023.3.post1
188
+ pyyaml==6.0.1
189
+ pyzmq==25.1.2
190
+ raft-dask==23.12.0
191
+ rapids-dask-dependency==23.12.1
192
+ referencing==0.33.0
193
+ regex==2023.12.25
194
+ requests-oauthlib==1.3.1
195
+ requests==2.31.0
196
+ rich==13.7.0
197
+ rmm==23.12.0
198
+ rpds-py==0.17.1
199
+ rsa==4.9
200
+ sacrebleu==2.4.0
201
+ safetensors==0.4.3
202
+ scikit-learn==1.2.0
203
+ scipy==1.12.0
204
+ send2trash==1.8.2
205
+ sentencepiece==0.1.99
206
+ sentry-sdk==2.12.0
207
+ setproctitle==1.3.3
208
+ setuptools==68.2.2
209
+ six==1.16.0
210
+ smart-open==6.4.0
211
+ smmap==5.0.1
212
+ sortedcontainers==2.4.0
213
+ soundfile==0.12.1
214
+ soupsieve==2.5
215
+ soxr==0.3.7
216
+ spacy-legacy==3.0.12
217
+ spacy-loggers==1.0.5
218
+ spacy==3.7.2
219
+ sphinx-glpi-theme==0.6
220
+ srsly==2.4.8
221
+ stack-data==0.6.3
222
+ sympy==1.12
223
+ tabulate==0.9.0
224
+ tbb==2021.11.0
225
+ tblib==3.0.0
226
+ tensorboard-data-server==0.6.1
227
+ tensorboard-plugin-wit==1.8.1
228
+ tensorboard==2.9.0
229
+ tensorrt==8.6.3
230
+ terminado==0.18.0
231
+ termplotlib==0.3.9
232
+ thinc==8.2.3
233
+ threadpoolctl==3.2.0
234
+ thriftpy2==0.4.17
235
+ tinycss2==1.2.1
236
+ tokenizers==0.19.1
237
+ toml==0.10.2
238
+ tomli==2.0.1
239
+ toolz==0.12.1
240
+ torch-tensorrt==2.3.0a0
241
+ torch==2.3.0a0+ebedce2
242
+ torchdata==0.7.1a0
243
+ torchtext==0.17.0a0
244
+ torchvision==0.18.0a0
245
+ tornado==6.4
246
+ tqdm==4.66.1
247
+ traitlets==5.9.0
248
+ transformer-engine==1.3.0+5b90b7f
249
+ transformers==4.43.3
250
+ treelite-runtime==3.9.1
251
+ treelite==3.9.1
252
+ triton==2.2.0+e28a256
253
+ typer==0.9.0
254
+ types-dataclasses==0.6.6
255
+ typing-extensions==4.9.0
256
+ ucx-py==0.35.0
257
+ uff==0.6.9
258
+ ujson==5.8.0
259
+ urllib3==1.26.18
260
+ wandb==0.16.3
261
+ wasabi==1.1.2
262
+ wcwidth==0.2.13
263
+ weasel==0.3.4
264
+ webencodings==0.5.1
265
+ werkzeug==3.0.1
266
+ wheel==0.42.0
267
+ xdoctest==1.0.2
268
+ xgboost==1.7.6
269
+ yarl==1.9.4
270
+ zict==3.0.0
271
+ zipp==3.17.0
wandb/run-20240804_021444-pk5j08lr/files/wandb-metadata.json ADDED
@@ -0,0 +1,215 @@
1
+ {
2
+ "os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
3
+ "python": "3.10.12",
4
+ "heartbeatAt": "2024-08-03T17:14:45.302596",
5
+ "startedAt": "2024-08-03T17:14:44.702200",
6
+ "docker": null,
7
+ "cuda": null,
8
+ "args": [
9
+ "--seq-length",
10
+ "1024",
11
+ "--sliding-window-size",
12
+ "4096",
13
+ "--micro-batch-size",
14
+ "8",
15
+ "--global-batch-size",
16
+ "320",
17
+ "--train-iters",
18
+ "20000",
19
+ "--tokenizer-type",
20
+ "Llama2Tokenizer",
21
+ "--tokenizer-model",
22
+ "/share/pretrained_lm/custom/tiny-mistral/tokenizer.model.v3",
23
+ "--train-data-path",
24
+ "4013541",
25
+ "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
26
+ "--valid-data-path",
27
+ "4013541",
28
+ "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
29
+ "--test-data-path",
30
+ "4013541",
31
+ "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
32
+ "--lr",
33
+ "2e-5",
34
+ "--min-lr",
35
+ "1e-6",
36
+ "--lr-decay-style",
37
+ "cosine",
38
+ "--lr-warmup-iters",
39
+ "500",
40
+ "--lr-decay-iters",
41
+ "20000",
42
+ "--weight-decay",
43
+ "0.1",
44
+ "--grad-clip-norm",
45
+ "1.0",
46
+ "--optimizer",
47
+ "adam",
48
+ "--adam-beta1",
49
+ "0.9",
50
+ "--adam-beta2",
51
+ "0.95",
52
+ "--adam-eps",
53
+ "1e-6",
54
+ "--save-interval",
55
+ "200",
56
+ "--eval-interval",
57
+ "200",
58
+ "--eval-iters",
59
+ "10",
60
+ "--bf16",
61
+ "--mixed-precision",
62
+ "--base-model",
63
+ "/share/pretrained_lm/custom/tiny-mistral",
64
+ "--save",
65
+ "/work/llm_recipes/models/tiny-mistral-sample",
66
+ "--load",
67
+ "/work/llm_recipes/models/tiny-mistral-sample",
68
+ "--fsdp-activation-checkpointing",
69
+ "--sharding-strategy",
70
+ "FULL_SHARD",
71
+ "--checkpoint-type",
72
+ "LOCAL_STATE_DICT",
73
+ "--save-n-checkpoints",
74
+ "10",
75
+ "--hf-upload-retry-limit",
76
+ "2",
77
+ "--hf-repo-id",
78
+ "koichi12/tiny-mistral-sample",
79
+ "--wandb-entity",
80
+ "iwakawa-koichi-q5-tohoku-nlp6723",
81
+ "--wandb-project",
82
+ "llm_tutorial",
83
+ "--wandb-name",
84
+ "tiny-mistral-sample_train_2024-08-04-02:14:34"
85
+ ],
86
+ "state": "running",
87
+ "program": "/project/examples/finetuning.py",
88
+ "codePathLocal": "examples/finetuning.py",
89
+ "codePath": "examples/finetuning.py",
90
+ "git": {
91
+ "remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
92
+ "commit": "3be5353210a678dc7008f237fa16b99f2bdf36ea"
93
+ },
94
+ "email": null,
95
+ "root": "/project",
96
+ "host": "gpu-koiwa-00",
97
+ "username": "koiwa",
98
+ "executable": "/usr/bin/python",
99
+ "cpu_count": 18,
100
+ "cpu_count_logical": 18,
101
+ "cpu_freq": {
102
+ "current": 2400.034,
103
+ "min": 0.0,
104
+ "max": 0.0
105
+ },
106
+ "cpu_freq_per_core": [
107
+ {
108
+ "current": 2400.034,
109
+ "min": 0.0,
110
+ "max": 0.0
111
+ },
112
+ {
113
+ "current": 2400.034,
114
+ "min": 0.0,
115
+ "max": 0.0
116
+ },
117
+ {
118
+ "current": 2400.034,
119
+ "min": 0.0,
120
+ "max": 0.0
121
+ },
122
+ {
123
+ "current": 2400.034,
124
+ "min": 0.0,
125
+ "max": 0.0
126
+ },
127
+ {
128
+ "current": 2400.034,
129
+ "min": 0.0,
130
+ "max": 0.0
131
+ },
132
+ {
133
+ "current": 2400.034,
134
+ "min": 0.0,
135
+ "max": 0.0
136
+ },
137
+ {
138
+ "current": 2400.034,
139
+ "min": 0.0,
140
+ "max": 0.0
141
+ },
142
+ {
143
+ "current": 2400.034,
144
+ "min": 0.0,
145
+ "max": 0.0
146
+ },
147
+ {
148
+ "current": 2400.034,
149
+ "min": 0.0,
150
+ "max": 0.0
151
+ },
152
+ {
153
+ "current": 2400.034,
154
+ "min": 0.0,
155
+ "max": 0.0
156
+ },
157
+ {
158
+ "current": 2400.034,
159
+ "min": 0.0,
160
+ "max": 0.0
161
+ },
162
+ {
163
+ "current": 2400.034,
164
+ "min": 0.0,
165
+ "max": 0.0
166
+ },
167
+ {
168
+ "current": 2400.034,
169
+ "min": 0.0,
170
+ "max": 0.0
171
+ },
172
+ {
173
+ "current": 2400.034,
174
+ "min": 0.0,
175
+ "max": 0.0
176
+ },
177
+ {
178
+ "current": 2400.034,
179
+ "min": 0.0,
180
+ "max": 0.0
181
+ },
182
+ {
183
+ "current": 2400.034,
184
+ "min": 0.0,
185
+ "max": 0.0
186
+ },
187
+ {
188
+ "current": 2400.034,
189
+ "min": 0.0,
190
+ "max": 0.0
191
+ },
192
+ {
193
+ "current": 2400.034,
194
+ "min": 0.0,
195
+ "max": 0.0
196
+ }
197
+ ],
198
+ "disk": {
199
+ "/": {
200
+ "total": 0.0625,
201
+ "used": 1.1444091796875e-05
202
+ }
203
+ },
204
+ "gpu": "NVIDIA A100-SXM4-40GB",
205
+ "gpu_count": 1,
206
+ "gpu_devices": [
207
+ {
208
+ "name": "NVIDIA A100-SXM4-40GB",
209
+ "memory_total": 42949672960
210
+ }
211
+ ],
212
+ "memory": {
213
+ "total": 56.48782730102539
214
+ }
215
+ }
wandb/run-20240804_021444-pk5j08lr/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
+ {"_wandb": {"runtime": 5}}
wandb/run-20240804_021444-pk5j08lr/logs/debug-internal.log ADDED
@@ -0,0 +1,191 @@
1
+ 2024-08-04 02:14:44,716 INFO StreamThr :11553 [internal.py:wandb_internal():86] W&B internal server running at pid: 11553, started at: 2024-08-04 02:14:44.715209
2
+ 2024-08-04 02:14:44,717 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: status
3
+ 2024-08-04 02:14:44,719 INFO WriterThread:11553 [datastore.py:open_for_write():87] open: /project/wandb/run-20240804_021444-pk5j08lr/run-pk5j08lr.wandb
4
+ 2024-08-04 02:14:44,720 DEBUG SenderThread:11553 [sender.py:send():382] send: header
5
+ 2024-08-04 02:14:44,733 DEBUG SenderThread:11553 [sender.py:send():382] send: run
6
+ 2024-08-04 02:14:45,190 INFO SenderThread:11553 [dir_watcher.py:__init__():211] watching files in: /project/wandb/run-20240804_021444-pk5j08lr/files
7
+ 2024-08-04 02:14:45,190 INFO SenderThread:11553 [sender.py:_start_run_threads():1136] run started: pk5j08lr with start time 1722705284.714592
8
+ 2024-08-04 02:14:45,195 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: check_version
9
+ 2024-08-04 02:14:45,195 DEBUG SenderThread:11553 [sender.py:send_request():409] send_request: check_version
10
+ 2024-08-04 02:14:45,280 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: run_start
11
+ 2024-08-04 02:14:45,286 DEBUG HandlerThread:11553 [system_info.py:__init__():27] System info init
12
+ 2024-08-04 02:14:45,286 DEBUG HandlerThread:11553 [system_info.py:__init__():42] System info init done
13
+ 2024-08-04 02:14:45,286 INFO HandlerThread:11553 [system_monitor.py:start():194] Starting system monitor
14
+ 2024-08-04 02:14:45,286 INFO SystemMonitor:11553 [system_monitor.py:_start():158] Starting system asset monitoring threads
15
+ 2024-08-04 02:14:45,287 INFO HandlerThread:11553 [system_monitor.py:probe():214] Collecting system info
16
+ 2024-08-04 02:14:45,287 INFO SystemMonitor:11553 [interfaces.py:start():190] Started cpu monitoring
17
+ 2024-08-04 02:14:45,288 INFO SystemMonitor:11553 [interfaces.py:start():190] Started disk monitoring
18
+ 2024-08-04 02:14:45,289 INFO SystemMonitor:11553 [interfaces.py:start():190] Started gpu monitoring
19
+ 2024-08-04 02:14:45,289 INFO SystemMonitor:11553 [interfaces.py:start():190] Started memory monitoring
20
+ 2024-08-04 02:14:45,290 INFO SystemMonitor:11553 [interfaces.py:start():190] Started network monitoring
21
+ 2024-08-04 02:14:45,302 DEBUG HandlerThread:11553 [system_info.py:probe():151] Probing system
22
+ 2024-08-04 02:14:45,304 DEBUG HandlerThread:11553 [system_info.py:_probe_git():136] Probing git
23
+ 2024-08-04 02:14:45,315 DEBUG HandlerThread:11553 [system_info.py:_probe_git():144] Probing git done
24
+ 2024-08-04 02:14:45,315 DEBUG HandlerThread:11553 [system_info.py:probe():199] Probing system done
25
+ 2024-08-04 02:14:45,315 DEBUG HandlerThread:11553 [system_monitor.py:probe():223] {'os': 'Linux-5.15.0-91-generic-x86_64-with-glibc2.35', 'python': '3.10.12', 'heartbeatAt': '2024-08-03T17:14:45.302596', 'startedAt': '2024-08-03T17:14:44.702200', 'docker': None, 'cuda': None, 'args': ('--seq-length', '1024', '--sliding-window-size', '4096', '--micro-batch-size', '8', '--global-batch-size', '320', '--train-iters', '20000', '--tokenizer-type', 'Llama2Tokenizer', '--tokenizer-model', '/share/pretrained_lm/custom/tiny-mistral/tokenizer.model.v3', '--train-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--valid-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--test-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--lr', '2e-5', '--min-lr', '1e-6', '--lr-decay-style', 'cosine', '--lr-warmup-iters', '500', '--lr-decay-iters', '20000', '--weight-decay', '0.1', '--grad-clip-norm', '1.0', '--optimizer', 'adam', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-6', '--save-interval', '200', '--eval-interval', '200', '--eval-iters', '10', '--bf16', '--mixed-precision', '--base-model', '/share/pretrained_lm/custom/tiny-mistral', '--save', '/work/llm_recipes/models/tiny-mistral-sample', '--load', '/work/llm_recipes/models/tiny-mistral-sample', '--fsdp-activation-checkpointing', '--sharding-strategy', 'FULL_SHARD', '--checkpoint-type', 'LOCAL_STATE_DICT', '--save-n-checkpoints', '10', '--hf-upload-retry-limit', '2', '--hf-repo-id', 'koichi12/tiny-mistral-sample', '--wandb-entity', 'iwakawa-koichi-q5-tohoku-nlp6723', '--wandb-project', 'llm_tutorial', '--wandb-name', 'tiny-mistral-sample_train_2024-08-04-02:14:34'), 'state': 'running', 'program': '/project/examples/finetuning.py', 'codePathLocal': 'examples/finetuning.py', 'codePath': 'examples/finetuning.py', 'git': {'remote': 'https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git', 'commit': '3be5353210a678dc7008f237fa16b99f2bdf36ea'}, 'email': None, 'root': '/project', 'host': 'gpu-koiwa-00', 'username': 'koiwa', 'executable': '/usr/bin/python', 'cpu_count': 18, 'cpu_count_logical': 18, 'cpu_freq': {'current': 2400.034, 'min': 0.0, 'max': 0.0}, 'cpu_freq_per_core': [{'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}], 'disk': {'/': {'total': 0.0625, 'used': 1.1444091796875e-05}}, 'gpu': 'NVIDIA A100-SXM4-40GB', 'gpu_count': 1, 'gpu_devices': [{'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}], 
'memory': {'total': 56.48782730102539}}
26
+ 2024-08-04 02:14:45,315 INFO HandlerThread:11553 [system_monitor.py:probe():224] Finished collecting system info
27
+ 2024-08-04 02:14:45,316 INFO HandlerThread:11553 [system_monitor.py:probe():227] Publishing system info
28
+ 2024-08-04 02:14:45,317 INFO HandlerThread:11553 [system_monitor.py:probe():229] Finished publishing system info
29
+ 2024-08-04 02:14:45,323 DEBUG SenderThread:11553 [sender.py:send():382] send: files
30
+ 2024-08-04 02:14:45,323 INFO SenderThread:11553 [sender.py:_save_file():1403] saving file wandb-metadata.json with policy now
31
+ 2024-08-04 02:14:45,332 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: python_packages
32
+ 2024-08-04 02:14:45,332 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: stop_status
33
+ 2024-08-04 02:14:45,332 DEBUG SenderThread:11553 [sender.py:send_request():409] send_request: python_packages
34
+ 2024-08-04 02:14:45,333 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: internal_messages
35
+ 2024-08-04 02:14:45,334 DEBUG SenderThread:11553 [sender.py:send_request():409] send_request: stop_status
36
+ 2024-08-04 02:14:45,580 DEBUG SenderThread:11553 [sender.py:send():382] send: telemetry
37
+ 2024-08-04 02:14:46,067 INFO wandb-upload_0:11553 [upload_job.py:push():131] Uploaded file /tmp/tmp8oqwu4dewandb/gzg3ga4a-wandb-metadata.json
38
+ 2024-08-04 02:14:46,191 INFO Thread-12 :11553 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_021444-pk5j08lr/files/wandb-metadata.json
39
+ 2024-08-04 02:14:46,192 INFO Thread-12 :11553 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_021444-pk5j08lr/files/output.log
40
+ 2024-08-04 02:14:46,192 INFO Thread-12 :11553 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_021444-pk5j08lr/files/requirements.txt
41
+ 2024-08-04 02:14:48,192 INFO Thread-12 :11553 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_021444-pk5j08lr/files/output.log
42
+ 2024-08-04 02:14:49,192 INFO Thread-12 :11553 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_021444-pk5j08lr/files/output.log
43
+ 2024-08-04 02:14:50,179 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: status_report
44
+ 2024-08-04 02:14:50,193 INFO Thread-12 :11553 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_021444-pk5j08lr/files/output.log
45
+ 2024-08-04 02:14:50,882 DEBUG SenderThread:11553 [sender.py:send():382] send: config
46
+ 2024-08-04 02:14:50,882 DEBUG SenderThread:11553 [sender.py:send():382] send: config
47
+ 2024-08-04 02:14:51,067 DEBUG SenderThread:11553 [sender.py:send():382] send: exit
48
+ 2024-08-04 02:14:51,067 INFO SenderThread:11553 [sender.py:send_exit():589] handling exit code: 0
49
+ 2024-08-04 02:14:51,067 INFO SenderThread:11553 [sender.py:send_exit():591] handling runtime: 5
50
+ 2024-08-04 02:14:51,068 INFO SenderThread:11553 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
51
+ 2024-08-04 02:14:51,068 INFO SenderThread:11553 [sender.py:send_exit():597] send defer
52
+ 2024-08-04 02:14:51,068 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: defer
53
+ 2024-08-04 02:14:51,068 INFO HandlerThread:11553 [handler.py:handle_request_defer():172] handle defer: 0
54
+ 2024-08-04 02:14:51,069 DEBUG SenderThread:11553 [sender.py:send_request():409] send_request: defer
55
+ 2024-08-04 02:14:51,069 INFO SenderThread:11553 [sender.py:send_request_defer():613] handle sender defer: 0
56
+ 2024-08-04 02:14:51,069 INFO SenderThread:11553 [sender.py:transition_state():617] send defer: 1
57
+ 2024-08-04 02:14:51,069 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: defer
58
+ 2024-08-04 02:14:51,069 INFO HandlerThread:11553 [handler.py:handle_request_defer():172] handle defer: 1
59
+ 2024-08-04 02:14:51,069 DEBUG SenderThread:11553 [sender.py:send_request():409] send_request: defer
60
+ 2024-08-04 02:14:51,069 INFO SenderThread:11553 [sender.py:send_request_defer():613] handle sender defer: 1
61
+ 2024-08-04 02:14:51,069 INFO SenderThread:11553 [sender.py:transition_state():617] send defer: 2
62
+ 2024-08-04 02:14:51,069 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: defer
63
+ 2024-08-04 02:14:51,069 INFO HandlerThread:11553 [handler.py:handle_request_defer():172] handle defer: 2
64
+ 2024-08-04 02:14:51,069 INFO HandlerThread:11553 [system_monitor.py:finish():203] Stopping system monitor
65
+ 2024-08-04 02:14:51,069 DEBUG SystemMonitor:11553 [system_monitor.py:_start():172] Starting system metrics aggregation loop
66
+ 2024-08-04 02:14:51,070 INFO HandlerThread:11553 [interfaces.py:finish():202] Joined cpu monitor
67
+ 2024-08-04 02:14:51,070 DEBUG SystemMonitor:11553 [system_monitor.py:_start():179] Finished system metrics aggregation loop
68
+ 2024-08-04 02:14:51,070 INFO HandlerThread:11553 [interfaces.py:finish():202] Joined disk monitor
69
+ 2024-08-04 02:14:51,070 DEBUG SystemMonitor:11553 [system_monitor.py:_start():183] Publishing last batch of metrics
70
+ 2024-08-04 02:14:51,103 INFO HandlerThread:11553 [interfaces.py:finish():202] Joined gpu monitor
71
+ 2024-08-04 02:14:51,103 INFO HandlerThread:11553 [interfaces.py:finish():202] Joined memory monitor
72
+ 2024-08-04 02:14:51,103 INFO HandlerThread:11553 [interfaces.py:finish():202] Joined network monitor
73
+ 2024-08-04 02:14:51,104 DEBUG SenderThread:11553 [sender.py:send_request():409] send_request: defer
74
+ 2024-08-04 02:14:51,104 INFO SenderThread:11553 [sender.py:send_request_defer():613] handle sender defer: 2
75
+ 2024-08-04 02:14:51,104 INFO SenderThread:11553 [sender.py:transition_state():617] send defer: 3
76
+ 2024-08-04 02:14:51,104 DEBUG SenderThread:11553 [sender.py:send():382] send: stats
77
+ 2024-08-04 02:14:51,104 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: defer
78
+ 2024-08-04 02:14:51,104 INFO HandlerThread:11553 [handler.py:handle_request_defer():172] handle defer: 3
79
+ 2024-08-04 02:14:51,104 DEBUG SenderThread:11553 [sender.py:send_request():409] send_request: defer
80
+ 2024-08-04 02:14:51,104 INFO SenderThread:11553 [sender.py:send_request_defer():613] handle sender defer: 3
81
+ 2024-08-04 02:14:51,105 INFO SenderThread:11553 [sender.py:transition_state():617] send defer: 4
82
+ 2024-08-04 02:14:51,105 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: defer
83
+ 2024-08-04 02:14:51,105 INFO HandlerThread:11553 [handler.py:handle_request_defer():172] handle defer: 4
84
+ 2024-08-04 02:14:51,105 DEBUG SenderThread:11553 [sender.py:send_request():409] send_request: defer
85
+ 2024-08-04 02:14:51,105 INFO SenderThread:11553 [sender.py:send_request_defer():613] handle sender defer: 4
86
+ 2024-08-04 02:14:51,105 INFO SenderThread:11553 [sender.py:transition_state():617] send defer: 5
87
+ 2024-08-04 02:14:51,105 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: defer
88
+ 2024-08-04 02:14:51,105 INFO HandlerThread:11553 [handler.py:handle_request_defer():172] handle defer: 5
89
+ 2024-08-04 02:14:51,105 DEBUG SenderThread:11553 [sender.py:send():382] send: summary
90
+ 2024-08-04 02:14:51,106 INFO SenderThread:11553 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
91
+ 2024-08-04 02:14:51,106 DEBUG SenderThread:11553 [sender.py:send_request():409] send_request: defer
92
+ 2024-08-04 02:14:51,106 INFO SenderThread:11553 [sender.py:send_request_defer():613] handle sender defer: 5
93
+ 2024-08-04 02:14:51,106 INFO SenderThread:11553 [sender.py:transition_state():617] send defer: 6
94
+ 2024-08-04 02:14:51,106 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: defer
95
+ 2024-08-04 02:14:51,106 INFO HandlerThread:11553 [handler.py:handle_request_defer():172] handle defer: 6
96
+ 2024-08-04 02:14:51,107 DEBUG SenderThread:11553 [sender.py:send_request():409] send_request: defer
97
+ 2024-08-04 02:14:51,107 INFO SenderThread:11553 [sender.py:send_request_defer():613] handle sender defer: 6
98
+ 2024-08-04 02:14:51,109 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: status_report
99
+ 2024-08-04 02:14:51,194 INFO Thread-12 :11553 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_021444-pk5j08lr/files/output.log
100
+ 2024-08-04 02:14:51,194 INFO Thread-12 :11553 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_021444-pk5j08lr/files/wandb-summary.json
101
+ 2024-08-04 02:14:51,396 INFO SenderThread:11553 [sender.py:transition_state():617] send defer: 7
102
+ 2024-08-04 02:14:51,396 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: defer
103
+ 2024-08-04 02:14:51,396 INFO HandlerThread:11553 [handler.py:handle_request_defer():172] handle defer: 7
104
+ 2024-08-04 02:14:51,396 DEBUG SenderThread:11553 [sender.py:send_request():409] send_request: defer
105
+ 2024-08-04 02:14:51,396 INFO SenderThread:11553 [sender.py:send_request_defer():613] handle sender defer: 7
106
+ 2024-08-04 02:14:52,066 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: poll_exit
107
+ 2024-08-04 02:14:52,195 INFO Thread-12 :11553 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_021444-pk5j08lr/files/config.yaml
108
+ 2024-08-04 02:14:52,195 INFO Thread-12 :11553 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_021444-pk5j08lr/files/output.log
109
+ 2024-08-04 02:14:52,692 INFO SenderThread:11553 [sender.py:transition_state():617] send defer: 8
110
+ 2024-08-04 02:14:52,692 DEBUG SenderThread:11553 [sender.py:send_request():409] send_request: poll_exit
111
+ 2024-08-04 02:14:52,692 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: defer
112
+ 2024-08-04 02:14:52,692 INFO HandlerThread:11553 [handler.py:handle_request_defer():172] handle defer: 8
113
+ 2024-08-04 02:14:52,692 DEBUG SenderThread:11553 [sender.py:send_request():409] send_request: defer
114
+ 2024-08-04 02:14:52,693 INFO SenderThread:11553 [sender.py:send_request_defer():613] handle sender defer: 8
115
+ 2024-08-04 02:14:52,693 INFO SenderThread:11553 [job_builder.py:build():296] Attempting to build job artifact
116
+ 2024-08-04 02:14:52,693 INFO SenderThread:11553 [job_builder.py:_get_source_type():426] is repo sourced job
117
+ 2024-08-04 02:14:52,707 INFO SenderThread:11553 [job_builder.py:build():402] adding wandb-job metadata file
118
+ 2024-08-04 02:14:52,715 INFO SenderThread:11553 [sender.py:transition_state():617] send defer: 9
119
+ 2024-08-04 02:14:52,716 DEBUG SenderThread:11553 [sender.py:send():382] send: artifact
120
+ 2024-08-04 02:14:52,716 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: defer
121
+ 2024-08-04 02:14:52,717 INFO HandlerThread:11553 [handler.py:handle_request_defer():172] handle defer: 9
122
+ 2024-08-04 02:14:53,067 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: poll_exit
123
+ 2024-08-04 02:14:53,195 INFO Thread-12 :11553 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_021444-pk5j08lr/files/output.log
124
+ 2024-08-04 02:14:53,655 INFO SenderThread:11553 [sender.py:send_artifact():1494] sent artifact job-https___github.com_cl-tohoku_llm-recipes-failab-m1-yans.git_examples_finetuning.py - {'id': 'QXJ0aWZhY3Q6MTA5MTk2NTkzOA==', 'state': 'COMMITTED', 'artifactSequence': {'id': 'QXJ0aWZhY3RDb2xsZWN0aW9uOjM2MjY3MjMzNA==', 'latestArtifact': {'id': 'QXJ0aWZhY3Q6MTA5MzUzODM4NQ==', 'versionIndex': 3}}}
125
+ 2024-08-04 02:14:53,655 DEBUG SenderThread:11553 [sender.py:send_request():409] send_request: defer
126
+ 2024-08-04 02:14:53,655 INFO SenderThread:11553 [sender.py:send_request_defer():613] handle sender defer: 9
127
+ 2024-08-04 02:14:53,655 INFO SenderThread:11553 [dir_watcher.py:finish():358] shutting down directory watcher
128
+ 2024-08-04 02:14:54,196 INFO SenderThread:11553 [dir_watcher.py:finish():388] scan: /project/wandb/run-20240804_021444-pk5j08lr/files
129
+ 2024-08-04 02:14:54,197 INFO SenderThread:11553 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_021444-pk5j08lr/files/requirements.txt requirements.txt
130
+ 2024-08-04 02:14:54,197 INFO SenderThread:11553 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_021444-pk5j08lr/files/config.yaml config.yaml
131
+ 2024-08-04 02:14:54,198 INFO SenderThread:11553 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_021444-pk5j08lr/files/wandb-metadata.json wandb-metadata.json
132
+ 2024-08-04 02:14:54,198 INFO SenderThread:11553 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_021444-pk5j08lr/files/wandb-summary.json wandb-summary.json
133
+ 2024-08-04 02:14:54,200 INFO SenderThread:11553 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_021444-pk5j08lr/files/output.log output.log
134
+ 2024-08-04 02:14:54,200 INFO SenderThread:11553 [sender.py:transition_state():617] send defer: 10
135
+ 2024-08-04 02:14:54,202 DEBUG SenderThread:11553 [sender.py:send_request():409] send_request: poll_exit
136
+ 2024-08-04 02:14:54,202 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: defer
137
+ 2024-08-04 02:14:54,205 INFO HandlerThread:11553 [handler.py:handle_request_defer():172] handle defer: 10
138
+ 2024-08-04 02:14:54,206 DEBUG SenderThread:11553 [sender.py:send_request():409] send_request: defer
139
+ 2024-08-04 02:14:54,206 INFO SenderThread:11553 [sender.py:send_request_defer():613] handle sender defer: 10
140
+ 2024-08-04 02:14:54,206 INFO SenderThread:11553 [file_pusher.py:finish():172] shutting down file pusher
141
+ 2024-08-04 02:14:54,605 INFO wandb-upload_1:11553 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_021444-pk5j08lr/files/config.yaml
142
+ 2024-08-04 02:14:54,711 INFO wandb-upload_0:11553 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_021444-pk5j08lr/files/requirements.txt
143
+ 2024-08-04 02:14:54,762 INFO wandb-upload_2:11553 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_021444-pk5j08lr/files/wandb-summary.json
144
+ 2024-08-04 02:14:54,792 INFO wandb-upload_3:11553 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_021444-pk5j08lr/files/output.log
145
+ 2024-08-04 02:14:54,992 INFO Thread-11 (_thread_body):11553 [sender.py:transition_state():617] send defer: 11
146
+ 2024-08-04 02:14:54,992 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: defer
147
+ 2024-08-04 02:14:54,992 INFO HandlerThread:11553 [handler.py:handle_request_defer():172] handle defer: 11
148
+ 2024-08-04 02:14:54,992 DEBUG SenderThread:11553 [sender.py:send_request():409] send_request: defer
149
+ 2024-08-04 02:14:54,992 INFO SenderThread:11553 [sender.py:send_request_defer():613] handle sender defer: 11
150
+ 2024-08-04 02:14:54,993 INFO SenderThread:11553 [file_pusher.py:join():178] waiting for file pusher
151
+ 2024-08-04 02:14:54,993 INFO SenderThread:11553 [sender.py:transition_state():617] send defer: 12
152
+ 2024-08-04 02:14:54,993 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: defer
153
+ 2024-08-04 02:14:54,993 INFO HandlerThread:11553 [handler.py:handle_request_defer():172] handle defer: 12
154
+ 2024-08-04 02:14:54,993 DEBUG SenderThread:11553 [sender.py:send_request():409] send_request: defer
155
+ 2024-08-04 02:14:54,993 INFO SenderThread:11553 [sender.py:send_request_defer():613] handle sender defer: 12
156
+ 2024-08-04 02:14:54,993 INFO SenderThread:11553 [file_stream.py:finish():595] file stream finish called
157
+ 2024-08-04 02:14:55,067 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: poll_exit
158
+ 2024-08-04 02:14:55,176 INFO SenderThread:11553 [file_stream.py:finish():599] file stream finish is done
159
+ 2024-08-04 02:14:55,176 INFO SenderThread:11553 [sender.py:transition_state():617] send defer: 13
160
+ 2024-08-04 02:14:55,176 DEBUG SenderThread:11553 [sender.py:send_request():409] send_request: poll_exit
161
+ 2024-08-04 02:14:55,176 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: defer
162
+ 2024-08-04 02:14:55,177 INFO HandlerThread:11553 [handler.py:handle_request_defer():172] handle defer: 13
163
+ 2024-08-04 02:14:55,177 DEBUG SenderThread:11553 [sender.py:send_request():409] send_request: defer
164
+ 2024-08-04 02:14:55,177 INFO SenderThread:11553 [sender.py:send_request_defer():613] handle sender defer: 13
165
+ 2024-08-04 02:14:55,177 INFO SenderThread:11553 [sender.py:transition_state():617] send defer: 14
166
+ 2024-08-04 02:14:55,177 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: defer
167
+ 2024-08-04 02:14:55,177 DEBUG SenderThread:11553 [sender.py:send():382] send: final
168
+ 2024-08-04 02:14:55,177 INFO HandlerThread:11553 [handler.py:handle_request_defer():172] handle defer: 14
169
+ 2024-08-04 02:14:55,177 DEBUG SenderThread:11553 [sender.py:send():382] send: footer
170
+ 2024-08-04 02:14:55,178 DEBUG SenderThread:11553 [sender.py:send_request():409] send_request: defer
171
+ 2024-08-04 02:14:55,178 INFO SenderThread:11553 [sender.py:send_request_defer():613] handle sender defer: 14
172
+ 2024-08-04 02:14:55,178 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: poll_exit
173
+ 2024-08-04 02:14:55,178 DEBUG SenderThread:11553 [sender.py:send_request():409] send_request: poll_exit
174
+ 2024-08-04 02:14:55,178 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: poll_exit
175
+ 2024-08-04 02:14:55,179 DEBUG SenderThread:11553 [sender.py:send_request():409] send_request: poll_exit
176
+ 2024-08-04 02:14:55,179 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: server_info
177
+ 2024-08-04 02:14:55,179 DEBUG SenderThread:11553 [sender.py:send_request():409] send_request: server_info
178
+ 2024-08-04 02:14:55,180 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: get_summary
179
+ 2024-08-04 02:14:55,181 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: sampled_history
180
+ 2024-08-04 02:14:55,181 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: internal_messages
181
+ 2024-08-04 02:14:55,181 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: job_info
182
+ 2024-08-04 02:14:55,346 DEBUG SenderThread:11553 [sender.py:send_request():409] send_request: job_info
183
+ 2024-08-04 02:14:55,346 INFO MainThread:11553 [wandb_run.py:_footer_history_summary_info():3866] rendering history
184
+ 2024-08-04 02:14:55,346 INFO MainThread:11553 [wandb_run.py:_footer_history_summary_info():3898] rendering summary
185
+ 2024-08-04 02:14:55,346 INFO MainThread:11553 [wandb_run.py:_footer_sync_info():3825] logging synced files
186
+ 2024-08-04 02:14:55,346 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: shutdown
187
+ 2024-08-04 02:14:55,346 INFO HandlerThread:11553 [handler.py:finish():869] shutting down handler
188
+ 2024-08-04 02:14:56,181 INFO WriterThread:11553 [datastore.py:close():296] close: /project/wandb/run-20240804_021444-pk5j08lr/run-pk5j08lr.wandb
189
+ 2024-08-04 02:14:56,346 INFO SenderThread:11553 [sender.py:finish():1572] shutting down sender
190
+ 2024-08-04 02:14:56,346 INFO SenderThread:11553 [file_pusher.py:finish():172] shutting down file pusher
191
+ 2024-08-04 02:14:56,346 INFO SenderThread:11553 [file_pusher.py:join():178] waiting for file pusher
wandb/run-20240804_021444-pk5j08lr/logs/debug.log ADDED
@@ -0,0 +1,30 @@
1
+ 2024-08-04 02:14:44,708 INFO MainThread:11482 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
2
+ 2024-08-04 02:14:44,708 INFO MainThread:11482 [wandb_setup.py:_flush():76] Configure stats pid to 11482
3
+ 2024-08-04 02:14:44,708 INFO MainThread:11482 [wandb_setup.py:_flush():76] Loading settings from /singularity_home/.config/wandb/settings
4
+ 2024-08-04 02:14:44,708 INFO MainThread:11482 [wandb_setup.py:_flush():76] Loading settings from /project/wandb/settings
5
+ 2024-08-04 02:14:44,708 INFO MainThread:11482 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'api_key': '***REDACTED***', 'run_notes': 'Train tuny llama sample'}
6
+ 2024-08-04 02:14:44,708 INFO MainThread:11482 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
7
+ 2024-08-04 02:14:44,708 INFO MainThread:11482 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'examples/finetuning.py', 'program_abspath': '/project/examples/finetuning.py', 'program': '/project/examples/finetuning.py'}
8
+ 2024-08-04 02:14:44,708 INFO MainThread:11482 [wandb_init.py:_log_setup():526] Logging user logs to /project/wandb/run-20240804_021444-pk5j08lr/logs/debug.log
9
+ 2024-08-04 02:14:44,708 INFO MainThread:11482 [wandb_init.py:_log_setup():527] Logging internal logs to /project/wandb/run-20240804_021444-pk5j08lr/logs/debug-internal.log
10
+ 2024-08-04 02:14:44,708 INFO MainThread:11482 [wandb_init.py:init():566] calling init triggers
11
+ 2024-08-04 02:14:44,708 INFO MainThread:11482 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
12
+ config: {'sharding_strategy': 'FULL_SHARD', 'checkpoint_type': 'LOCAL_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'low_cpu_fsdp': False, 'no_meta_device': False, 'data_path': None, 'split': '969, 30, 1', 'train_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'valid_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'test_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'data_cache_path': None, 'vocab_size': None, 'vocab_file': None, 'merge_file': None, 'seq_length': 1024, 'num_workers': 2, 'tokenizer_type': 'Llama2Tokenizer', 'tokenizer_model': '/share/pretrained_lm/custom/tiny-mistral/tokenizer.model.v3', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'retro_return_doc_ids': False, 'short_seq_prob': 0.1, 'vocab_extra_ids': 0, 'seed': 1234, 'use_mpi': False, 'wandb_entity': 'iwakawa-koichi-q5-tohoku-nlp6723', 'wandb_name': 'tiny-mistral-sample_train_2024-08-04-02:14:34', 'wandb_project': 'llm_tutorial', 'quantization': False, 'use_freeze_layers': False, 'freeze_layers': None, 'bf16': True, 'fp16': False, 'mixed_precision': True, 'param_dtype': None, 'load': '/work/llm_recipes/models/tiny-mistral-sample', 'save': '/work/llm_recipes/models/tiny-mistral-sample', 'base_model': '/share/pretrained_lm/custom/tiny-mistral', 'use_better_transformer': False, 'grad_clip_norm': 1.0, 'eval_interval': 200, 'save_interval': 200, 'eval_iters': 10, 'optimizer': 'adam', 'lr': 2e-05, 'lr_decay_style': 'cosine', 'lr_decay_iters': 20000, 'lr_warmup_iters': 500, 'min_lr': 1e-06, 'train_iters': 20000, 'train_samples': None, 'global_batch_size': 320, 'micro_batch_size': 8, 'make_vocab_size_divisible_by': 128, 'sliding_window_size': 4096, 'skip_batch': None, 'no_save_optimizer_state': False, 'continual_pretraining': False, 'instruction_tuning': False, 'direct_preference_optimization': False, 'attention_dropout': 0.1, 'hidden_dropout': 0.1, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-06, 'hf_transformer_model_dir': None, 'instruction_train_data_path': None, 'instruction_valid_data_path': None, 'epoch': None, 'instruction_dataset_size': None, 'save_sampler_state': False, 'label_smoothing': 0.0, 'save_n_checkpoints': 10, 'hf_repo_id': 'koichi12/tiny-mistral-sample', 'create_public_hf_repo': False, 'upload_all_checkpoints_to_hf': False, 'hf_upload_retry_limit': 2, 'exit_duration_in_mins': None, 'source_key': None, 'target_key': None, 'attn_implementation': 'flash_attention_2', 'efficient_instruction_tuning': False, 'remove_padding_masking': False, 'save_start_iter': None, 'rank': 0, 'world_size': 1, 'padded_vocab_size': 32768, 'gradient_accumulation_steps': 40}
13
+ 2024-08-04 02:14:44,708 INFO MainThread:11482 [wandb_init.py:init():616] starting backend
14
+ 2024-08-04 02:14:44,708 INFO MainThread:11482 [wandb_init.py:init():620] setting up manager
15
+ 2024-08-04 02:14:44,713 INFO MainThread:11482 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
16
+ 2024-08-04 02:14:44,714 INFO MainThread:11482 [wandb_init.py:init():628] backend started and connected
17
+ 2024-08-04 02:14:44,719 INFO MainThread:11482 [wandb_init.py:init():720] updated telemetry
18
+ 2024-08-04 02:14:44,729 INFO MainThread:11482 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
19
+ 2024-08-04 02:14:45,194 INFO MainThread:11482 [wandb_run.py:_on_init():2262] communicating current version
20
+ 2024-08-04 02:14:45,273 INFO MainThread:11482 [wandb_run.py:_on_init():2271] got version response upgrade_message: "wandb version 0.17.5 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
21
+
22
+ 2024-08-04 02:14:45,274 INFO MainThread:11482 [wandb_init.py:init():804] starting run threads in backend
23
+ 2024-08-04 02:14:45,331 INFO MainThread:11482 [wandb_run.py:_console_start():2241] atexit reg
24
+ 2024-08-04 02:14:45,331 INFO MainThread:11482 [wandb_run.py:_redirect():2096] redirect: wrap_raw
25
+ 2024-08-04 02:14:45,332 INFO MainThread:11482 [wandb_run.py:_redirect():2161] Wrapping output streams.
26
+ 2024-08-04 02:14:45,332 INFO MainThread:11482 [wandb_run.py:_redirect():2186] Redirects installed.
27
+ 2024-08-04 02:14:45,333 INFO MainThread:11482 [wandb_init.py:init():847] run started, returning control to user process
28
+ 2024-08-04 02:14:50,881 INFO MainThread:11482 [wandb_run.py:_config_callback():1343] config_cb None None {'activation_function': 'silu', 'hidden_size': 256, 'model_type': 'mistral', 'max_position_embeddings': 1024, 'num_attention_heads': 4, 'num_hidden_layers': 4, 'model_architecture': 'MistralForCausalLM'}
29
+ 2024-08-04 02:14:50,881 INFO MainThread:11482 [wandb_run.py:_config_callback():1343] config_cb None None {'world_size': 1}
30
+ 2024-08-04 02:14:56,347 WARNING MsgRouterThr:11482 [router.py:message_loop():77] message_loop has been closed
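The config dict logged above (debug.log line 12) records global_batch_size 320, micro_batch_size 8, world_size 1 and gradient_accumulation_steps 40. A minimal Python sketch of the bookkeeping these values imply, assuming the conventional relation global = micro-batch x accumulation steps x data-parallel size; the helper name is illustrative and not taken from the repository:

# Sanity-check the batch-size bookkeeping recorded in the config above
# (global_batch_size=320, micro_batch_size=8, world_size=1,
# gradient_accumulation_steps=40). Assumes the usual relation
#   global_batch_size = micro_batch_size * gradient_accumulation_steps * world_size
def infer_grad_accum_steps(global_batch_size: int, micro_batch_size: int, world_size: int) -> int:
    per_step = micro_batch_size * world_size          # samples consumed per micro-step
    assert global_batch_size % per_step == 0, "global batch must divide evenly"
    return global_batch_size // per_step

print(infer_grad_accum_steps(320, 8, 1))  # -> 40, matching the logged value

With world_size 1, the entire 320-sample global batch is accumulated on a single GPU over 40 micro-steps before each optimizer update.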
wandb/run-20240804_021444-pk5j08lr/run-pk5j08lr.wandb ADDED
Binary file (17.1 kB). View file
 
wandb/run-20240804_144007-dds6qqbt/files/config.yaml ADDED
@@ -0,0 +1,335 @@
1
+ wandb_version: 1
2
+
3
+ sharding_strategy:
4
+ desc: null
5
+ value: FULL_SHARD
6
+ checkpoint_type:
7
+ desc: null
8
+ value: LOCAL_STATE_DICT
9
+ fsdp_activation_checkpointing:
10
+ desc: null
11
+ value: true
12
+ fsdp_cpu_offload:
13
+ desc: null
14
+ value: false
15
+ low_cpu_fsdp:
16
+ desc: null
17
+ value: false
18
+ no_meta_device:
19
+ desc: null
20
+ value: false
21
+ data_path:
22
+ desc: null
23
+ value: null
24
+ split:
25
+ desc: null
26
+ value: 969, 30, 1
27
+ train_data_path:
28
+ desc: null
29
+ value:
30
+ - '4013541'
31
+ - /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
32
+ valid_data_path:
33
+ desc: null
34
+ value:
35
+ - '4013541'
36
+ - /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
37
+ test_data_path:
38
+ desc: null
39
+ value:
40
+ - '4013541'
41
+ - /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
42
+ data_cache_path:
43
+ desc: null
44
+ value: null
45
+ vocab_size:
46
+ desc: null
47
+ value: null
48
+ vocab_file:
49
+ desc: null
50
+ value: null
51
+ merge_file:
52
+ desc: null
53
+ value: null
54
+ seq_length:
55
+ desc: null
56
+ value: 512
57
+ num_workers:
58
+ desc: null
59
+ value: 2
60
+ tokenizer_type:
61
+ desc: null
62
+ value: Llama2Tokenizer
63
+ tokenizer_model:
64
+ desc: null
65
+ value: /share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model
66
+ reset_position_ids:
67
+ desc: null
68
+ value: false
69
+ reset_attention_mask:
70
+ desc: null
71
+ value: false
72
+ eod_mask_loss:
73
+ desc: null
74
+ value: false
75
+ retro_return_doc_ids:
76
+ desc: null
77
+ value: false
78
+ short_seq_prob:
79
+ desc: null
80
+ value: 0.1
81
+ vocab_extra_ids:
82
+ desc: null
83
+ value: 0
84
+ seed:
85
+ desc: null
86
+ value: 1234
87
+ use_mpi:
88
+ desc: null
89
+ value: false
90
+ wandb_entity:
91
+ desc: null
92
+ value: iwakawa-koichi-q5-tohoku-nlp6723
93
+ wandb_name:
94
+ desc: null
95
+ value: tiny-llama_train_2024-08-04-14:39:57
96
+ wandb_project:
97
+ desc: null
98
+ value: llm_tutorial
99
+ quantization:
100
+ desc: null
101
+ value: false
102
+ use_freeze_layers:
103
+ desc: null
104
+ value: false
105
+ freeze_layers:
106
+ desc: null
107
+ value: null
108
+ bf16:
109
+ desc: null
110
+ value: true
111
+ fp16:
112
+ desc: null
113
+ value: false
114
+ mixed_precision:
115
+ desc: null
116
+ value: true
117
+ param_dtype:
118
+ desc: null
119
+ value: null
120
+ load:
121
+ desc: null
122
+ value: /work/llm_recipes/models/tiny-llama
123
+ save:
124
+ desc: null
125
+ value: /work/llm_recipes/models/tiny-llama
126
+ base_model:
127
+ desc: null
128
+ value: /share/pretrained_lm/meta-llama/TinyLlama_v1.1
129
+ use_better_transformer:
130
+ desc: null
131
+ value: false
132
+ grad_clip_norm:
133
+ desc: null
134
+ value: 1.0
135
+ eval_interval:
136
+ desc: null
137
+ value: 200
138
+ save_interval:
139
+ desc: null
140
+ value: 200
141
+ eval_iters:
142
+ desc: null
143
+ value: 10
144
+ optimizer:
145
+ desc: null
146
+ value: adam
147
+ lr:
148
+ desc: null
149
+ value: 2.0e-05
150
+ lr_decay_style:
151
+ desc: null
152
+ value: cosine
153
+ lr_decay_iters:
154
+ desc: null
155
+ value: 2000
156
+ lr_warmup_iters:
157
+ desc: null
158
+ value: 500
159
+ min_lr:
160
+ desc: null
161
+ value: 1.0e-06
162
+ train_iters:
163
+ desc: null
164
+ value: 2000
165
+ train_samples:
166
+ desc: null
167
+ value: null
168
+ global_batch_size:
169
+ desc: null
170
+ value: 320
171
+ micro_batch_size:
172
+ desc: null
173
+ value: 8
174
+ make_vocab_size_divisible_by:
175
+ desc: null
176
+ value: 128
177
+ sliding_window_size:
178
+ desc: null
179
+ value: 4096
180
+ skip_batch:
181
+ desc: null
182
+ value: null
183
+ no_save_optimizer_state:
184
+ desc: null
185
+ value: false
186
+ continual_pretraining:
187
+ desc: null
188
+ value: false
189
+ instruction_tuning:
190
+ desc: null
191
+ value: false
192
+ direct_preference_optimization:
193
+ desc: null
194
+ value: false
195
+ attention_dropout:
196
+ desc: null
197
+ value: 0.1
198
+ hidden_dropout:
199
+ desc: null
200
+ value: 0.1
201
+ weight_decay:
202
+ desc: null
203
+ value: 0.1
204
+ adam_beta1:
205
+ desc: null
206
+ value: 0.9
207
+ adam_beta2:
208
+ desc: null
209
+ value: 0.95
210
+ adam_eps:
211
+ desc: null
212
+ value: 1.0e-06
213
+ hf_transformer_model_dir:
214
+ desc: null
215
+ value: null
216
+ instruction_train_data_path:
217
+ desc: null
218
+ value: null
219
+ instruction_valid_data_path:
220
+ desc: null
221
+ value: null
222
+ epoch:
223
+ desc: null
224
+ value: null
225
+ instruction_dataset_size:
226
+ desc: null
227
+ value: null
228
+ save_sampler_state:
229
+ desc: null
230
+ value: false
231
+ label_smoothing:
232
+ desc: null
233
+ value: 0.0
234
+ save_n_checkpoints:
235
+ desc: null
236
+ value: 10
237
+ hf_repo_id:
238
+ desc: null
239
+ value: koichi12/tiny-llama
240
+ create_public_hf_repo:
241
+ desc: null
242
+ value: false
243
+ upload_all_checkpoints_to_hf:
244
+ desc: null
245
+ value: false
246
+ hf_upload_retry_limit:
247
+ desc: null
248
+ value: 2
249
+ exit_duration_in_mins:
250
+ desc: null
251
+ value: null
252
+ source_key:
253
+ desc: null
254
+ value: null
255
+ target_key:
256
+ desc: null
257
+ value: null
258
+ attn_implementation:
259
+ desc: null
260
+ value: flash_attention_2
261
+ efficient_instruction_tuning:
262
+ desc: null
263
+ value: false
264
+ remove_padding_masking:
265
+ desc: null
266
+ value: false
267
+ save_start_iter:
268
+ desc: null
269
+ value: null
270
+ rank:
271
+ desc: null
272
+ value: 0
273
+ world_size:
274
+ desc: null
275
+ value: 1
276
+ padded_vocab_size:
277
+ desc: null
278
+ value: 32000
279
+ gradient_accumulation_steps:
280
+ desc: null
281
+ value: 40
282
+ _wandb:
283
+ desc: null
284
+ value:
285
+ python_version: 3.10.12
286
+ cli_version: 0.16.3
287
+ framework: huggingface
288
+ huggingface_version: 4.43.3
289
+ is_jupyter_run: false
290
+ is_kaggle_kernel: false
291
+ start_time: 1722750007.607754
292
+ t:
293
+ 1:
294
+ - 1
295
+ - 11
296
+ - 49
297
+ - 55
298
+ - 71
299
+ 2:
300
+ - 1
301
+ - 11
302
+ - 49
303
+ - 55
304
+ - 71
305
+ 3:
306
+ - 13
307
+ - 16
308
+ - 23
309
+ 4: 3.10.12
310
+ 5: 0.16.3
311
+ 6: 4.43.3
312
+ 8:
313
+ - 5
314
+ 13: linux-x86_64
315
+ activation_function:
316
+ desc: null
317
+ value: silu
318
+ hidden_size:
319
+ desc: null
320
+ value: 2048
321
+ model_type:
322
+ desc: null
323
+ value: llama
324
+ max_position_embeddings:
325
+ desc: null
326
+ value: 2048
327
+ num_attention_heads:
328
+ desc: null
329
+ value: 32
330
+ num_hidden_layers:
331
+ desc: null
332
+ value: 22
333
+ model_architecture:
334
+ desc: null
335
+ value: LlamaForCausalLM
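The optimizer section of this config (lr 2.0e-05, min_lr 1.0e-06, lr_warmup_iters 500, lr_decay_iters 2000, lr_decay_style cosine) implies a warmup-then-cosine learning-rate curve. A minimal sketch of one common formulation of that schedule using the logged values; the exact shape implemented by the training code may differ in details:

import math

# Warmup + cosine decay using the values recorded in the config above.
# One common formulation, shown for orientation only.
def lr_at(iteration: int, lr: float = 2e-5, min_lr: float = 1e-6,
          warmup_iters: int = 500, decay_iters: int = 2000) -> float:
    if iteration < warmup_iters:
        return lr * iteration / warmup_iters              # linear warmup
    if iteration >= decay_iters:
        return min_lr                                     # floor after the decay window
    progress = (iteration - warmup_iters) / (decay_iters - warmup_iters)
    return min_lr + 0.5 * (lr - min_lr) * (1 + math.cos(math.pi * progress))

for it in (0, 250, 500, 1250, 2000):
    print(it, lr_at(it))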
wandb/run-20240804_144007-dds6qqbt/files/output.log ADDED
@@ -0,0 +1,135 @@
1
+ Created Hugging Face repository with ID koichi12/tiny-llama.
2
+ Clearing GPU cache for all ranks
3
+ --> Running with torch torch_distributed debug set to detail
4
+ File not found: /work/llm_recipes/models/tiny-llama/latest_iteration.txt
5
+ Unable to read latest iteration from /work/llm_recipes/models/tiny-llama/latest_iteration.txt
6
+ File not found: /work/llm_recipes/models/tiny-llama/latest_iteration.txt
7
+ Unable to read latest iteration from /work/llm_recipes/models/tiny-llama/latest_iteration.txt
8
+ File not found: /work/llm_recipes/models/tiny-llama/latest_iteration.txt
9
+ Unable to read latest iteration from /work/llm_recipes/models/tiny-llama/latest_iteration.txt
10
+ No checkpoint found in /work/llm_recipes/models/tiny-llama, skipping model loading
11
+ --> Model /share/pretrained_lm/meta-llama/TinyLlama_v1.1
12
+ --> /share/pretrained_lm/meta-llama/TinyLlama_v1.1 has 1100.048384 Million params
13
+ You are attempting to use Flash Attention 2.0 without specifying a torch dtype. This might lead to unexpected behaviour
14
+ You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
15
+ Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in LlamaForCausalLM is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="flash_attention_2", torch_dtype=torch.float16)`
16
+ Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in LlamaModel is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="flash_attention_2", torch_dtype=torch.float16)`
17
+ /usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_init_utils.py:441: UserWarning: FSDP is switching to use `NO_SHARD` instead of ShardingStrategy.FULL_SHARD since the world size is 1.
18
+ warnings.warn(
19
+ BFloat16 enabled for mixed precision - using bfSixteen policy
20
+ --> applying fsdp activation checkpointing...
21
+ > datasets target sizes (minimum size):
22
+ train: 640000
23
+ validation: 35200
24
+ test: 3200
25
+ > building train, validation, and test datasets for GPT ...
26
+ > finished creating GPT datasets ...
27
+ File not found: /work/llm_recipes/models/tiny-llama/latest_iteration.txt
28
+ Unable to read latest iteration from /work/llm_recipes/models/tiny-llama/latest_iteration.txt
29
+ No checkpoint found in /work/llm_recipes/models/tiny-llama, skipping optimizer loading
30
+ File not found: /work/llm_recipes/models/tiny-llama/latest_iteration.txt
31
+ Unable to read latest iteration from /work/llm_recipes/models/tiny-llama/latest_iteration.txt
32
+ model info: FullyShardedDataParallel(
33
+ (_fsdp_wrapped_module): LlamaForCausalLM(
34
+ (model): LlamaModel(
35
+ (embed_tokens): Embedding(32000, 2048)
36
+ (layers): ModuleList(
37
+ (0-21): 22 x FullyShardedDataParallel(
38
+ (_fsdp_wrapped_module): CheckpointWrapper(
39
+ (_checkpoint_wrapped_module): LlamaDecoderLayer(
40
+ (self_attn): LlamaFlashAttention2(
41
+ (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
42
+ (k_proj): Linear(in_features=2048, out_features=256, bias=False)
43
+ (v_proj): Linear(in_features=2048, out_features=256, bias=False)
44
+ (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
45
+ (rotary_emb): LlamaRotaryEmbedding()
46
+ )
47
+ (mlp): LlamaMLP(
48
+ (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
49
+ (up_proj): Linear(in_features=2048, out_features=5632, bias=False)
50
+ (down_proj): Linear(in_features=5632, out_features=2048, bias=False)
51
+ (act_fn): SiLU()
52
+ )
53
+ (input_layernorm): LlamaRMSNorm()
54
+ (post_attention_layernorm): LlamaRMSNorm()
55
+ )
56
+ )
57
+ )
58
+ )
59
+ (norm): LlamaRMSNorm()
60
+ (rotary_emb): LlamaRotaryEmbedding()
61
+ )
62
+ (lm_head): Linear(in_features=2048, out_features=32000, bias=False)
63
+ )
64
+ )
65
+ model config: LlamaConfig {
66
+ "_name_or_path": "/share/pretrained_lm/meta-llama/TinyLlama_v1.1",
67
+ "architectures": [
68
+ "LlamaForCausalLM"
69
+ ],
70
+ "attention_bias": false,
71
+ "attention_dropout": 0.0,
72
+ "bos_token_id": 1,
73
+ "eos_token_id": 2,
74
+ "hidden_act": "silu",
75
+ "hidden_size": 2048,
76
+ "initializer_range": 0.02,
77
+ "intermediate_size": 5632,
78
+ "label_smoothing": 0.0,
79
+ "max_position_embeddings": 2048,
80
+ "mlp_bias": false,
81
+ "model_type": "llama",
82
+ "num_attention_heads": 32,
83
+ "num_hidden_layers": 22,
84
+ "num_key_value_heads": 4,
85
+ "pretraining_tp": 1,
86
+ "rms_norm_eps": 1e-05,
87
+ "rope_scaling": null,
88
+ "rope_theta": 10000.0,
89
+ "tie_word_embeddings": false,
90
+ "torch_dtype": "float32",
91
+ "transformers_version": "4.43.3",
92
+ "use_cache": false,
93
+ "vocab_size": 32000
94
+ }
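The "--> /share/pretrained_lm/meta-llama/TinyLlama_v1.1 has 1100.048384 Million params" line above can be reproduced arithmetically from the LlamaConfig fields just printed. A back-of-the-envelope sketch (rotary embeddings carry no learnable parameters and are ignored):

# Recompute the reported parameter count from the config dump above:
# hidden_size=2048, intermediate_size=5632, num_hidden_layers=22,
# num_attention_heads=32, num_key_value_heads=4, vocab_size=32000,
# tie_word_embeddings=false (so the lm_head is a separate matrix).
hidden, inter, layers, heads, kv_heads, vocab = 2048, 5632, 22, 32, 4, 32000
head_dim = hidden // heads
kv_dim = kv_heads * head_dim

attn = 2 * hidden * hidden + 2 * kv_dim * hidden   # q/o projections + k/v projections
mlp = 3 * hidden * inter                           # gate, up, down projections
norms = 2 * hidden                                 # input + post-attention RMSNorm
per_layer = attn + mlp + norms

total = layers * per_layer + 2 * vocab * hidden + hidden  # + embeddings, lm_head, final norm
print(total)  # 1100048384 -> 1100.048384 million, matching the log line

The result, 1,100,048,384, matches the reported count exactly.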
95
+ Let split = None
96
+ Building a BlendedDataset for a single MegatronDataset
97
+ Unable to save the indexes because path_to_cache is None
98
+ Building a BlendedDataset for a single MegatronDataset
99
+ Unable to save the indexes because path_to_cache is None
100
+ Building a BlendedDataset for a single MegatronDataset
101
+ Unable to save the indexes because path_to_cache is None
102
+ Traceback (most recent call last):
103
+ File "/project/examples/finetuning.py", line 13, in <module>
104
+ main()
105
+ File "/project/src/llama_recipes/finetuning.py", line 281, in main
106
+ train(
107
+ File "/project/src/llama_recipes/utils/train_utils.py", line 104, in train
108
+ batch = next(train_dataloader)
109
+ File "/project/src/llama_recipes/utils/train_utils.py", line 24, in cyclic_iter
110
+ for x in iter:
111
+ File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 631, in __next__
112
+ data = self._next_data()
113
+ File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1346, in _next_data
114
+ return self._process_data(data)
115
+ File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1372, in _process_data
116
+ data.reraise()
117
+ File "/usr/local/lib/python3.10/dist-packages/torch/_utils.py", line 705, in reraise
118
+ raise exception
119
+ RuntimeError: Caught RuntimeError in DataLoader worker process 0.
120
+ Original Traceback (most recent call last):
121
+ File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/worker.py", line 308, in _worker_loop
122
+ data = fetcher.fetch(index)
123
+ File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/fetch.py", line 54, in fetch
124
+ return self.collate_fn(data)
125
+ File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py", line 277, in default_collate
126
+ return collate(batch, collate_fn_map=default_collate_fn_map)
127
+ File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py", line 129, in collate
128
+ return elem_type({key: collate([d[key] for d in batch], collate_fn_map=collate_fn_map) for key in elem})
129
+ File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py", line 129, in <dictcomp>
130
+ return elem_type({key: collate([d[key] for d in batch], collate_fn_map=collate_fn_map) for key in elem})
131
+ File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py", line 121, in collate
132
+ return collate_fn_map[elem_type](batch, collate_fn_map=collate_fn_map)
133
+ File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py", line 174, in collate_tensor_fn
134
+ return torch.stack(batch, 0, out=out)
135
+ RuntimeError: stack expects each tensor to be equal size, but got [513] at entry 0 and [543] at entry 1
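This run aborted because the default DataLoader collate tried to torch.stack examples of different lengths (513 vs 543 tokens), which happens when the examples handed to the worker are not truncated or padded to a common length. A minimal sketch of a padding collate_fn that would sidestep the stack error; this is illustrative only: the repository's actual remedy (for example, making every example exactly seq_length tokens) may differ, pad_id=0 is an assumption, and label tensors would normally be padded with the loss ignore index (-100) instead of pad_id:

import torch

# Pad every field of every example up to the longest sequence in the batch,
# so torch.stack sees equal-sized tensors. Sketch only; not the repository's code.
def pad_collate(batch, pad_id: int = 0):
    out = {}
    for key in batch[0]:
        tensors = [torch.as_tensor(sample[key]) for sample in batch]
        max_len = max(t.shape[0] for t in tensors)
        padded = [
            torch.cat([t, torch.full((max_len - t.shape[0],), pad_id, dtype=t.dtype)])
            for t in tensors
        ]
        out[key] = torch.stack(padded)
    return out

# Hypothetical usage: DataLoader(dataset, batch_size=8, collate_fn=pad_collate)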
wandb/run-20240804_144007-dds6qqbt/files/requirements.txt ADDED
@@ -0,0 +1,271 @@
1
+ absl-py==2.1.0
2
+ accelerate==0.33.0
3
+ aiohttp==3.9.1
4
+ aiosignal==1.3.1
5
+ annotated-types==0.6.0
6
+ apex==0.1
7
+ appdirs==1.4.4
8
+ argon2-cffi-bindings==21.2.0
9
+ argon2-cffi==23.1.0
10
+ asttokens==2.4.1
11
+ astunparse==1.6.3
12
+ async-timeout==4.0.3
13
+ attrs==23.2.0
14
+ audioread==3.0.1
15
+ beautifulsoup4==4.12.3
16
+ bleach==6.1.0
17
+ blis==0.7.11
18
+ cachetools==5.3.2
19
+ catalogue==2.0.10
20
+ certifi==2024.2.2
21
+ cffi==1.16.0
22
+ charset-normalizer==3.3.2
23
+ click==8.1.7
24
+ cloudpathlib==0.16.0
25
+ cloudpickle==3.0.0
26
+ cmake==3.28.1
27
+ colorama==0.4.6
28
+ comm==0.2.1
29
+ confection==0.1.4
30
+ contourpy==1.2.0
31
+ cubinlinker==0.3.0+2.g405ac64
32
+ cuda-python==12.3.0rc4+9.gdb8c48a.dirty
33
+ cudf==23.12.0
34
+ cugraph-dgl==23.12.0
35
+ cugraph-service-client==23.12.0
36
+ cugraph-service-server==23.12.0
37
+ cugraph==23.12.0
38
+ cuml==23.12.0
39
+ cupy-cuda12x==12.3.0
40
+ cycler==0.12.1
41
+ cymem==2.0.8
42
+ cython==3.0.8
43
+ dask-cuda==23.12.0
44
+ dask-cudf==23.12.0
45
+ dask==2023.11.0
46
+ debugpy==1.8.1
47
+ decorator==5.1.1
48
+ defusedxml==0.7.1
49
+ distributed==2023.11.0
50
+ dm-tree==0.1.8
51
+ docker-pycreds==0.4.0
52
+ einops==0.7.0
53
+ exceptiongroup==1.2.0
54
+ execnet==2.0.2
55
+ executing==2.0.1
56
+ expecttest==0.1.3
57
+ fastjsonschema==2.19.1
58
+ fastrlock==0.8.2
59
+ filelock==3.13.1
60
+ flash-attn==2.4.2
61
+ fonttools==4.48.1
62
+ frozenlist==1.4.1
63
+ fsspec==2023.12.2
64
+ gast==0.5.4
65
+ gitdb==4.0.11
66
+ gitpython==3.1.43
67
+ google-auth-oauthlib==0.4.6
68
+ google-auth==2.27.0
69
+ graphsurgeon==0.4.6
70
+ grpcio==1.60.1
71
+ huggingface-hub==0.24.5
72
+ hypothesis==5.35.1
73
+ idna==3.6
74
+ importlib-metadata==7.0.1
75
+ iniconfig==2.0.0
76
+ intel-openmp==2021.4.0
77
+ ipadic==1.0.0
78
+ ipykernel==6.29.2
79
+ ipython-genutils==0.2.0
80
+ ipython==8.21.0
81
+ jedi==0.19.1
82
+ jinja2==3.1.3
83
+ joblib==1.3.2
84
+ json5==0.9.14
85
+ jsonnet==0.19.1
86
+ jsonschema-specifications==2023.12.1
87
+ jsonschema==4.21.1
88
+ jupyter-client==8.6.0
89
+ jupyter-core==5.7.1
90
+ jupyter-tensorboard==0.2.0
91
+ jupyterlab-pygments==0.3.0
92
+ jupyterlab-server==1.2.0
93
+ jupyterlab==2.3.2
94
+ jupytext==1.16.1
95
+ kiwisolver==1.4.5
96
+ langcodes==3.3.0
97
+ lazy-loader==0.3
98
+ librosa==0.10.1
99
+ llvmlite==0.40.1
100
+ locket==1.0.0
101
+ logzero==1.7.0
102
+ lxml==5.2.2
103
+ markdown-it-py==3.0.0
104
+ markdown==3.5.2
105
+ markupsafe==2.1.4
106
+ matplotlib-inline==0.1.6
107
+ matplotlib==3.8.2
108
+ mdit-py-plugins==0.4.0
109
+ mdurl==0.1.2
110
+ mecab-python3==1.0.6
111
+ mistune==3.0.2
112
+ mkl-devel==2021.1.1
113
+ mkl-include==2021.1.1
114
+ mkl==2021.1.1
115
+ mock==5.1.0
116
+ more-itertools==9.1.0
117
+ mpmath==1.3.0
118
+ msgpack==1.0.7
119
+ multidict==6.0.4
120
+ murmurhash==1.0.10
121
+ nbclient==0.9.0
122
+ nbconvert==7.16.0
123
+ nbformat==5.9.2
124
+ nest-asyncio==1.6.0
125
+ networkx==2.6.3
126
+ ninja==1.11.1.1
127
+ nltk==3.8.1
128
+ notebook==6.4.10
129
+ numba==0.57.1+1.g1ff679645
130
+ numpy==1.24.4
131
+ nvfuser==0.1.4a0+d0bb811
132
+ nvidia-dali-cuda120==1.34.0
133
+ nvidia-pyindex==1.0.9
134
+ nvtx==0.2.5
135
+ oauthlib==3.2.2
136
+ onnx==1.15.0rc2
137
+ opencv==4.7.0
138
+ optree==0.10.0
139
+ packaging==23.2
140
+ pandas==1.5.3
141
+ pandocfilters==1.5.1
142
+ parso==0.8.3
143
+ partd==1.4.1
144
+ peft==0.11.1
145
+ pexpect==4.9.0
146
+ pillow==10.2.0
147
+ pip==24.0
148
+ platformdirs==4.2.0
149
+ pluggy==1.4.0
150
+ ply==3.11
151
+ polygraphy==0.49.4
152
+ pooch==1.8.0
153
+ portalocker==2.10.1
154
+ preshed==3.0.9
155
+ prettytable==3.9.0
156
+ prometheus-client==0.19.0
157
+ prompt-toolkit==3.0.43
158
+ protobuf==4.24.4
159
+ psutil==5.9.4
160
+ ptxcompiler==0.8.1+2.g0d406d6
161
+ ptyprocess==0.7.0
162
+ pure-eval==0.2.2
163
+ pyarrow==14.0.1.dev0+gba5374836.d20240125
164
+ pyasn1-modules==0.3.0
165
+ pyasn1==0.5.1
166
+ pybind11-global==2.11.1
167
+ pybind11==2.11.1
168
+ pycocotools==2.0+nv0.8.0
169
+ pycparser==2.21
170
+ pydantic-core==2.16.2
171
+ pydantic==2.6.1
172
+ pygments==2.17.2
173
+ pylibcugraph==23.12.0
174
+ pylibcugraphops==23.12.0
175
+ pylibraft==23.12.0
176
+ pynvml==11.4.1
177
+ pyparsing==3.1.1
178
+ pytest-flakefinder==1.1.0
179
+ pytest-rerunfailures==13.0
180
+ pytest-shard==0.1.2
181
+ pytest-xdist==3.5.0
182
+ pytest==8.0.0
183
+ python-dateutil==2.8.2
184
+ python-dotenv==1.0.0
185
+ python-hostlist==1.23.0
186
+ pytorch-quantization==2.1.2
187
+ pytz==2023.3.post1
188
+ pyyaml==6.0.1
189
+ pyzmq==25.1.2
190
+ raft-dask==23.12.0
191
+ rapids-dask-dependency==23.12.1
192
+ referencing==0.33.0
193
+ regex==2023.12.25
194
+ requests-oauthlib==1.3.1
195
+ requests==2.31.0
196
+ rich==13.7.0
197
+ rmm==23.12.0
198
+ rpds-py==0.17.1
199
+ rsa==4.9
200
+ sacrebleu==2.4.0
201
+ safetensors==0.4.3
202
+ scikit-learn==1.2.0
203
+ scipy==1.12.0
204
+ send2trash==1.8.2
205
+ sentencepiece==0.1.99
206
+ sentry-sdk==2.12.0
207
+ setproctitle==1.3.3
208
+ setuptools==68.2.2
209
+ six==1.16.0
210
+ smart-open==6.4.0
211
+ smmap==5.0.1
212
+ sortedcontainers==2.4.0
213
+ soundfile==0.12.1
214
+ soupsieve==2.5
215
+ soxr==0.3.7
216
+ spacy-legacy==3.0.12
217
+ spacy-loggers==1.0.5
218
+ spacy==3.7.2
219
+ sphinx-glpi-theme==0.6
220
+ srsly==2.4.8
221
+ stack-data==0.6.3
222
+ sympy==1.12
223
+ tabulate==0.9.0
224
+ tbb==2021.11.0
225
+ tblib==3.0.0
226
+ tensorboard-data-server==0.6.1
227
+ tensorboard-plugin-wit==1.8.1
228
+ tensorboard==2.9.0
229
+ tensorrt==8.6.3
230
+ terminado==0.18.0
231
+ termplotlib==0.3.9
232
+ thinc==8.2.3
233
+ threadpoolctl==3.2.0
234
+ thriftpy2==0.4.17
235
+ tinycss2==1.2.1
236
+ tokenizers==0.19.1
237
+ toml==0.10.2
238
+ tomli==2.0.1
239
+ toolz==0.12.1
240
+ torch-tensorrt==2.3.0a0
241
+ torch==2.3.0a0+ebedce2
242
+ torchdata==0.7.1a0
243
+ torchtext==0.17.0a0
244
+ torchvision==0.18.0a0
245
+ tornado==6.4
246
+ tqdm==4.66.1
247
+ traitlets==5.9.0
248
+ transformer-engine==1.3.0+5b90b7f
249
+ transformers==4.43.3
250
+ treelite-runtime==3.9.1
251
+ treelite==3.9.1
252
+ triton==2.2.0+e28a256
253
+ typer==0.9.0
254
+ types-dataclasses==0.6.6
255
+ typing-extensions==4.9.0
256
+ ucx-py==0.35.0
257
+ uff==0.6.9
258
+ ujson==5.8.0
259
+ urllib3==1.26.18
260
+ wandb==0.16.3
261
+ wasabi==1.1.2
262
+ wcwidth==0.2.13
263
+ weasel==0.3.4
264
+ webencodings==0.5.1
265
+ werkzeug==3.0.1
266
+ wheel==0.42.0
267
+ xdoctest==1.0.2
268
+ xgboost==1.7.6
269
+ yarl==1.9.4
270
+ zict==3.0.0
271
+ zipp==3.17.0
wandb/run-20240804_144007-dds6qqbt/files/wandb-metadata.json ADDED
@@ -0,0 +1,215 @@
1
+ {
2
+ "os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
3
+ "python": "3.10.12",
4
+ "heartbeatAt": "2024-08-04T05:40:08.224323",
5
+ "startedAt": "2024-08-04T05:40:07.595226",
6
+ "docker": null,
7
+ "cuda": null,
8
+ "args": [
9
+ "--seq-length",
10
+ "512",
11
+ "--sliding-window-size",
12
+ "4096",
13
+ "--micro-batch-size",
14
+ "8",
15
+ "--global-batch-size",
16
+ "320",
17
+ "--train-iters",
18
+ "2000",
19
+ "--tokenizer-type",
20
+ "Llama2Tokenizer",
21
+ "--tokenizer-model",
22
+ "/share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model",
23
+ "--train-data-path",
24
+ "4013541",
25
+ "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
26
+ "--valid-data-path",
27
+ "4013541",
28
+ "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
29
+ "--test-data-path",
30
+ "4013541",
31
+ "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
32
+ "--lr",
33
+ "2e-5",
34
+ "--min-lr",
35
+ "1e-6",
36
+ "--lr-decay-style",
37
+ "cosine",
38
+ "--lr-warmup-iters",
39
+ "500",
40
+ "--lr-decay-iters",
41
+ "2000",
42
+ "--weight-decay",
43
+ "0.1",
44
+ "--grad-clip-norm",
45
+ "1.0",
46
+ "--optimizer",
47
+ "adam",
48
+ "--adam-beta1",
49
+ "0.9",
50
+ "--adam-beta2",
51
+ "0.95",
52
+ "--adam-eps",
53
+ "1e-6",
54
+ "--save-interval",
55
+ "200",
56
+ "--eval-interval",
57
+ "200",
58
+ "--eval-iters",
59
+ "10",
60
+ "--bf16",
61
+ "--mixed-precision",
62
+ "--base-model",
63
+ "/share/pretrained_lm/meta-llama/TinyLlama_v1.1",
64
+ "--save",
65
+ "/work/llm_recipes/models/tiny-llama",
66
+ "--load",
67
+ "/work/llm_recipes/models/tiny-llama",
68
+ "--fsdp-activation-checkpointing",
69
+ "--sharding-strategy",
70
+ "FULL_SHARD",
71
+ "--checkpoint-type",
72
+ "LOCAL_STATE_DICT",
73
+ "--save-n-checkpoints",
74
+ "10",
75
+ "--hf-upload-retry-limit",
76
+ "2",
77
+ "--hf-repo-id",
78
+ "koichi12/tiny-llama",
79
+ "--wandb-entity",
80
+ "iwakawa-koichi-q5-tohoku-nlp6723",
81
+ "--wandb-project",
82
+ "llm_tutorial",
83
+ "--wandb-name",
84
+ "tiny-llama_train_2024-08-04-14:39:57"
85
+ ],
86
+ "state": "running",
87
+ "program": "/project/examples/finetuning.py",
88
+ "codePathLocal": "examples/finetuning.py",
89
+ "codePath": "examples/finetuning.py",
90
+ "git": {
91
+ "remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
92
+ "commit": "3be5353210a678dc7008f237fa16b99f2bdf36ea"
93
+ },
94
+ "email": null,
95
+ "root": "/project",
96
+ "host": "gpu-koiwa-00",
97
+ "username": "koiwa",
98
+ "executable": "/usr/bin/python",
99
+ "cpu_count": 18,
100
+ "cpu_count_logical": 18,
101
+ "cpu_freq": {
102
+ "current": 2400.0389999999993,
103
+ "min": 0.0,
104
+ "max": 0.0
105
+ },
106
+ "cpu_freq_per_core": [
107
+ {
108
+ "current": 2400.039,
109
+ "min": 0.0,
110
+ "max": 0.0
111
+ },
112
+ {
113
+ "current": 2400.039,
114
+ "min": 0.0,
115
+ "max": 0.0
116
+ },
117
+ {
118
+ "current": 2400.039,
119
+ "min": 0.0,
120
+ "max": 0.0
121
+ },
122
+ {
123
+ "current": 2400.039,
124
+ "min": 0.0,
125
+ "max": 0.0
126
+ },
127
+ {
128
+ "current": 2400.039,
129
+ "min": 0.0,
130
+ "max": 0.0
131
+ },
132
+ {
133
+ "current": 2400.039,
134
+ "min": 0.0,
135
+ "max": 0.0
136
+ },
137
+ {
138
+ "current": 2400.039,
139
+ "min": 0.0,
140
+ "max": 0.0
141
+ },
142
+ {
143
+ "current": 2400.039,
144
+ "min": 0.0,
145
+ "max": 0.0
146
+ },
147
+ {
148
+ "current": 2400.039,
149
+ "min": 0.0,
150
+ "max": 0.0
151
+ },
152
+ {
153
+ "current": 2400.039,
154
+ "min": 0.0,
155
+ "max": 0.0
156
+ },
157
+ {
158
+ "current": 2400.039,
159
+ "min": 0.0,
160
+ "max": 0.0
161
+ },
162
+ {
163
+ "current": 2400.039,
164
+ "min": 0.0,
165
+ "max": 0.0
166
+ },
167
+ {
168
+ "current": 2400.039,
169
+ "min": 0.0,
170
+ "max": 0.0
171
+ },
172
+ {
173
+ "current": 2400.039,
174
+ "min": 0.0,
175
+ "max": 0.0
176
+ },
177
+ {
178
+ "current": 2400.039,
179
+ "min": 0.0,
180
+ "max": 0.0
181
+ },
182
+ {
183
+ "current": 2400.039,
184
+ "min": 0.0,
185
+ "max": 0.0
186
+ },
187
+ {
188
+ "current": 2400.039,
189
+ "min": 0.0,
190
+ "max": 0.0
191
+ },
192
+ {
193
+ "current": 2400.039,
194
+ "min": 0.0,
195
+ "max": 0.0
196
+ }
197
+ ],
198
+ "disk": {
199
+ "/": {
200
+ "total": 0.0625,
201
+ "used": 1.1444091796875e-05
202
+ }
203
+ },
204
+ "gpu": "NVIDIA A100-SXM4-40GB",
205
+ "gpu_count": 1,
206
+ "gpu_devices": [
207
+ {
208
+ "name": "NVIDIA A100-SXM4-40GB",
209
+ "memory_total": 42949672960
210
+ }
211
+ ],
212
+ "memory": {
213
+ "total": 56.48781967163086
214
+ }
215
+ }
wandb/run-20240804_144007-dds6qqbt/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
1
+ {"_wandb": {"runtime": 2}}
wandb/run-20240804_144007-dds6qqbt/logs/debug-internal.log ADDED
@@ -0,0 +1,186 @@
1
+ 2024-08-04 14:40:07,609 INFO StreamThr :11999 [internal.py:wandb_internal():86] W&B internal server running at pid: 11999, started at: 2024-08-04 14:40:07.608480
2
+ 2024-08-04 14:40:07,610 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: status
3
+ 2024-08-04 14:40:07,612 INFO WriterThread:11999 [datastore.py:open_for_write():87] open: /project/wandb/run-20240804_144007-dds6qqbt/run-dds6qqbt.wandb
4
+ 2024-08-04 14:40:07,613 DEBUG SenderThread:11999 [sender.py:send():382] send: header
5
+ 2024-08-04 14:40:07,627 DEBUG SenderThread:11999 [sender.py:send():382] send: run
6
+ 2024-08-04 14:40:08,110 INFO SenderThread:11999 [dir_watcher.py:__init__():211] watching files in: /project/wandb/run-20240804_144007-dds6qqbt/files
7
+ 2024-08-04 14:40:08,111 INFO SenderThread:11999 [sender.py:_start_run_threads():1136] run started: dds6qqbt with start time 1722750007.607754
8
+ 2024-08-04 14:40:08,116 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: check_version
9
+ 2024-08-04 14:40:08,116 DEBUG SenderThread:11999 [sender.py:send_request():409] send_request: check_version
10
+ 2024-08-04 14:40:08,204 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: run_start
11
+ 2024-08-04 14:40:08,210 DEBUG HandlerThread:11999 [system_info.py:__init__():27] System info init
12
+ 2024-08-04 14:40:08,210 DEBUG HandlerThread:11999 [system_info.py:__init__():42] System info init done
13
+ 2024-08-04 14:40:08,211 INFO HandlerThread:11999 [system_monitor.py:start():194] Starting system monitor
14
+ 2024-08-04 14:40:08,211 INFO SystemMonitor:11999 [system_monitor.py:_start():158] Starting system asset monitoring threads
15
+ 2024-08-04 14:40:08,211 INFO HandlerThread:11999 [system_monitor.py:probe():214] Collecting system info
16
+ 2024-08-04 14:40:08,211 INFO SystemMonitor:11999 [interfaces.py:start():190] Started cpu monitoring
17
+ 2024-08-04 14:40:08,212 INFO SystemMonitor:11999 [interfaces.py:start():190] Started disk monitoring
18
+ 2024-08-04 14:40:08,213 INFO SystemMonitor:11999 [interfaces.py:start():190] Started gpu monitoring
19
+ 2024-08-04 14:40:08,214 INFO SystemMonitor:11999 [interfaces.py:start():190] Started memory monitoring
20
+ 2024-08-04 14:40:08,214 INFO SystemMonitor:11999 [interfaces.py:start():190] Started network monitoring
21
+ 2024-08-04 14:40:08,224 DEBUG HandlerThread:11999 [system_info.py:probe():151] Probing system
22
+ 2024-08-04 14:40:08,226 DEBUG HandlerThread:11999 [system_info.py:_probe_git():136] Probing git
23
+ 2024-08-04 14:40:08,238 DEBUG HandlerThread:11999 [system_info.py:_probe_git():144] Probing git done
24
+ 2024-08-04 14:40:08,238 DEBUG HandlerThread:11999 [system_info.py:probe():199] Probing system done
25
+ 2024-08-04 14:40:08,238 DEBUG HandlerThread:11999 [system_monitor.py:probe():223] {'os': 'Linux-5.15.0-91-generic-x86_64-with-glibc2.35', 'python': '3.10.12', 'heartbeatAt': '2024-08-04T05:40:08.224323', 'startedAt': '2024-08-04T05:40:07.595226', 'docker': None, 'cuda': None, 'args': ('--seq-length', '512', '--sliding-window-size', '4096', '--micro-batch-size', '8', '--global-batch-size', '320', '--train-iters', '2000', '--tokenizer-type', 'Llama2Tokenizer', '--tokenizer-model', '/share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model', '--train-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--valid-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--test-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--lr', '2e-5', '--min-lr', '1e-6', '--lr-decay-style', 'cosine', '--lr-warmup-iters', '500', '--lr-decay-iters', '2000', '--weight-decay', '0.1', '--grad-clip-norm', '1.0', '--optimizer', 'adam', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-6', '--save-interval', '200', '--eval-interval', '200', '--eval-iters', '10', '--bf16', '--mixed-precision', '--base-model', '/share/pretrained_lm/meta-llama/TinyLlama_v1.1', '--save', '/work/llm_recipes/models/tiny-llama', '--load', '/work/llm_recipes/models/tiny-llama', '--fsdp-activation-checkpointing', '--sharding-strategy', 'FULL_SHARD', '--checkpoint-type', 'LOCAL_STATE_DICT', '--save-n-checkpoints', '10', '--hf-upload-retry-limit', '2', '--hf-repo-id', 'koichi12/tiny-llama', '--wandb-entity', 'iwakawa-koichi-q5-tohoku-nlp6723', '--wandb-project', 'llm_tutorial', '--wandb-name', 'tiny-llama_train_2024-08-04-14:39:57'), 'state': 'running', 'program': '/project/examples/finetuning.py', 'codePathLocal': 'examples/finetuning.py', 'codePath': 'examples/finetuning.py', 'git': {'remote': 'https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git', 'commit': '3be5353210a678dc7008f237fa16b99f2bdf36ea'}, 'email': None, 'root': '/project', 'host': 'gpu-koiwa-00', 'username': 'koiwa', 'executable': '/usr/bin/python', 'cpu_count': 18, 'cpu_count_logical': 18, 'cpu_freq': {'current': 2400.0389999999993, 'min': 0.0, 'max': 0.0}, 'cpu_freq_per_core': [{'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}], 'disk': {'/': {'total': 0.0625, 'used': 1.1444091796875e-05}}, 'gpu': 'NVIDIA A100-SXM4-40GB', 'gpu_count': 1, 'gpu_devices': [{'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}], 'memory': {'total': 
56.48781967163086}}
26
+ 2024-08-04 14:40:08,238 INFO HandlerThread:11999 [system_monitor.py:probe():224] Finished collecting system info
27
+ 2024-08-04 14:40:08,238 INFO HandlerThread:11999 [system_monitor.py:probe():227] Publishing system info
28
+ 2024-08-04 14:40:08,239 INFO HandlerThread:11999 [system_monitor.py:probe():229] Finished publishing system info
29
+ 2024-08-04 14:40:08,245 DEBUG SenderThread:11999 [sender.py:send():382] send: files
30
+ 2024-08-04 14:40:08,246 INFO SenderThread:11999 [sender.py:_save_file():1403] saving file wandb-metadata.json with policy now
31
+ 2024-08-04 14:40:08,255 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: python_packages
32
+ 2024-08-04 14:40:08,255 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: stop_status
33
+ 2024-08-04 14:40:08,255 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: internal_messages
34
+ 2024-08-04 14:40:08,255 DEBUG SenderThread:11999 [sender.py:send_request():409] send_request: python_packages
35
+ 2024-08-04 14:40:08,257 DEBUG SenderThread:11999 [sender.py:send_request():409] send_request: stop_status
36
+ 2024-08-04 14:40:08,521 DEBUG SenderThread:11999 [sender.py:send():382] send: telemetry
37
+ 2024-08-04 14:40:08,889 INFO wandb-upload_0:11999 [upload_job.py:push():131] Uploaded file /tmp/tmp5bbx13axwandb/8bl0rtdu-wandb-metadata.json
38
+ 2024-08-04 14:40:09,112 INFO Thread-12 :11999 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_144007-dds6qqbt/files/requirements.txt
39
+ 2024-08-04 14:40:09,113 INFO Thread-12 :11999 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_144007-dds6qqbt/files/output.log
40
+ 2024-08-04 14:40:09,113 INFO Thread-12 :11999 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_144007-dds6qqbt/files/wandb-metadata.json
41
+ 2024-08-04 14:40:10,756 DEBUG SenderThread:11999 [sender.py:send():382] send: config
42
+ 2024-08-04 14:40:10,756 DEBUG SenderThread:11999 [sender.py:send():382] send: config
43
+ 2024-08-04 14:40:10,842 DEBUG SenderThread:11999 [sender.py:send():382] send: exit
44
+ 2024-08-04 14:40:10,842 INFO SenderThread:11999 [sender.py:send_exit():589] handling exit code: 1
45
+ 2024-08-04 14:40:10,842 INFO SenderThread:11999 [sender.py:send_exit():591] handling runtime: 2
46
+ 2024-08-04 14:40:10,843 INFO SenderThread:11999 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
47
+ 2024-08-04 14:40:10,843 INFO SenderThread:11999 [sender.py:send_exit():597] send defer
48
+ 2024-08-04 14:40:10,844 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: defer
49
+ 2024-08-04 14:40:10,844 INFO HandlerThread:11999 [handler.py:handle_request_defer():172] handle defer: 0
50
+ 2024-08-04 14:40:10,844 DEBUG SenderThread:11999 [sender.py:send_request():409] send_request: defer
51
+ 2024-08-04 14:40:10,844 INFO SenderThread:11999 [sender.py:send_request_defer():613] handle sender defer: 0
52
+ 2024-08-04 14:40:10,844 INFO SenderThread:11999 [sender.py:transition_state():617] send defer: 1
53
+ 2024-08-04 14:40:10,844 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: defer
54
+ 2024-08-04 14:40:10,844 INFO HandlerThread:11999 [handler.py:handle_request_defer():172] handle defer: 1
55
+ 2024-08-04 14:40:10,844 DEBUG SenderThread:11999 [sender.py:send_request():409] send_request: defer
56
+ 2024-08-04 14:40:10,844 INFO SenderThread:11999 [sender.py:send_request_defer():613] handle sender defer: 1
57
+ 2024-08-04 14:40:10,844 INFO SenderThread:11999 [sender.py:transition_state():617] send defer: 2
58
+ 2024-08-04 14:40:10,844 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: defer
59
+ 2024-08-04 14:40:10,844 INFO HandlerThread:11999 [handler.py:handle_request_defer():172] handle defer: 2
60
+ 2024-08-04 14:40:10,844 INFO HandlerThread:11999 [system_monitor.py:finish():203] Stopping system monitor
61
+ 2024-08-04 14:40:10,845 DEBUG SystemMonitor:11999 [system_monitor.py:_start():172] Starting system metrics aggregation loop
62
+ 2024-08-04 14:40:10,845 INFO HandlerThread:11999 [interfaces.py:finish():202] Joined cpu monitor
63
+ 2024-08-04 14:40:10,845 DEBUG SystemMonitor:11999 [system_monitor.py:_start():179] Finished system metrics aggregation loop
64
+ 2024-08-04 14:40:10,845 INFO HandlerThread:11999 [interfaces.py:finish():202] Joined disk monitor
65
+ 2024-08-04 14:40:10,845 DEBUG SystemMonitor:11999 [system_monitor.py:_start():183] Publishing last batch of metrics
66
+ 2024-08-04 14:40:10,878 INFO HandlerThread:11999 [interfaces.py:finish():202] Joined gpu monitor
67
+ 2024-08-04 14:40:10,878 INFO HandlerThread:11999 [interfaces.py:finish():202] Joined memory monitor
68
+ 2024-08-04 14:40:10,878 INFO HandlerThread:11999 [interfaces.py:finish():202] Joined network monitor
69
+ 2024-08-04 14:40:10,878 DEBUG SenderThread:11999 [sender.py:send_request():409] send_request: defer
70
+ 2024-08-04 14:40:10,878 INFO SenderThread:11999 [sender.py:send_request_defer():613] handle sender defer: 2
71
+ 2024-08-04 14:40:10,878 INFO SenderThread:11999 [sender.py:transition_state():617] send defer: 3
72
+ 2024-08-04 14:40:10,879 DEBUG SenderThread:11999 [sender.py:send():382] send: stats
73
+ 2024-08-04 14:40:10,879 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: defer
74
+ 2024-08-04 14:40:10,879 INFO HandlerThread:11999 [handler.py:handle_request_defer():172] handle defer: 3
75
+ 2024-08-04 14:40:10,879 DEBUG SenderThread:11999 [sender.py:send_request():409] send_request: defer
76
+ 2024-08-04 14:40:10,879 INFO SenderThread:11999 [sender.py:send_request_defer():613] handle sender defer: 3
77
+ 2024-08-04 14:40:10,879 INFO SenderThread:11999 [sender.py:transition_state():617] send defer: 4
78
+ 2024-08-04 14:40:10,879 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: defer
79
+ 2024-08-04 14:40:10,879 INFO HandlerThread:11999 [handler.py:handle_request_defer():172] handle defer: 4
80
+ 2024-08-04 14:40:10,879 DEBUG SenderThread:11999 [sender.py:send_request():409] send_request: defer
81
+ 2024-08-04 14:40:10,879 INFO SenderThread:11999 [sender.py:send_request_defer():613] handle sender defer: 4
82
+ 2024-08-04 14:40:10,879 INFO SenderThread:11999 [sender.py:transition_state():617] send defer: 5
83
+ 2024-08-04 14:40:10,879 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: defer
84
+ 2024-08-04 14:40:10,880 INFO HandlerThread:11999 [handler.py:handle_request_defer():172] handle defer: 5
85
+ 2024-08-04 14:40:10,880 DEBUG SenderThread:11999 [sender.py:send():382] send: summary
86
+ 2024-08-04 14:40:10,881 INFO SenderThread:11999 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
87
+ 2024-08-04 14:40:10,881 DEBUG SenderThread:11999 [sender.py:send_request():409] send_request: defer
88
+ 2024-08-04 14:40:10,881 INFO SenderThread:11999 [sender.py:send_request_defer():613] handle sender defer: 5
89
+ 2024-08-04 14:40:10,881 INFO SenderThread:11999 [sender.py:transition_state():617] send defer: 6
90
+ 2024-08-04 14:40:10,881 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: defer
91
+ 2024-08-04 14:40:10,881 INFO HandlerThread:11999 [handler.py:handle_request_defer():172] handle defer: 6
92
+ 2024-08-04 14:40:10,881 DEBUG SenderThread:11999 [sender.py:send_request():409] send_request: defer
93
+ 2024-08-04 14:40:10,881 INFO SenderThread:11999 [sender.py:send_request_defer():613] handle sender defer: 6
94
+ 2024-08-04 14:40:10,884 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: status_report
95
+ 2024-08-04 14:40:11,083 INFO SenderThread:11999 [sender.py:transition_state():617] send defer: 7
96
+ 2024-08-04 14:40:11,083 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: defer
97
+ 2024-08-04 14:40:11,083 INFO HandlerThread:11999 [handler.py:handle_request_defer():172] handle defer: 7
98
+ 2024-08-04 14:40:11,083 DEBUG SenderThread:11999 [sender.py:send_request():409] send_request: defer
99
+ 2024-08-04 14:40:11,083 INFO SenderThread:11999 [sender.py:send_request_defer():613] handle sender defer: 7
100
+ 2024-08-04 14:40:11,113 INFO Thread-12 :11999 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_144007-dds6qqbt/files/output.log
101
+ 2024-08-04 14:40:11,114 INFO Thread-12 :11999 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_144007-dds6qqbt/files/config.yaml
102
+ 2024-08-04 14:40:11,114 INFO Thread-12 :11999 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_144007-dds6qqbt/files/wandb-summary.json
103
+ 2024-08-04 14:40:11,842 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: poll_exit
104
+ 2024-08-04 14:40:12,953 INFO SenderThread:11999 [sender.py:transition_state():617] send defer: 8
105
+ 2024-08-04 14:40:12,953 DEBUG SenderThread:11999 [sender.py:send_request():409] send_request: poll_exit
106
+ 2024-08-04 14:40:12,953 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: defer
107
+ 2024-08-04 14:40:12,954 INFO HandlerThread:11999 [handler.py:handle_request_defer():172] handle defer: 8
108
+ 2024-08-04 14:40:12,954 DEBUG SenderThread:11999 [sender.py:send_request():409] send_request: defer
109
+ 2024-08-04 14:40:12,954 INFO SenderThread:11999 [sender.py:send_request_defer():613] handle sender defer: 8
110
+ 2024-08-04 14:40:12,954 INFO SenderThread:11999 [job_builder.py:build():296] Attempting to build job artifact
111
+ 2024-08-04 14:40:12,955 INFO SenderThread:11999 [job_builder.py:_get_source_type():426] is repo sourced job
112
+ 2024-08-04 14:40:12,969 INFO SenderThread:11999 [job_builder.py:build():402] adding wandb-job metadata file
113
+ 2024-08-04 14:40:12,987 INFO SenderThread:11999 [sender.py:transition_state():617] send defer: 9
114
+ 2024-08-04 14:40:12,987 DEBUG SenderThread:11999 [sender.py:send():382] send: artifact
115
+ 2024-08-04 14:40:12,988 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: defer
116
+ 2024-08-04 14:40:12,989 INFO HandlerThread:11999 [handler.py:handle_request_defer():172] handle defer: 9
117
+ 2024-08-04 14:40:13,115 INFO Thread-12 :11999 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_144007-dds6qqbt/files/output.log
118
+ 2024-08-04 14:40:13,842 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: poll_exit
119
+ 2024-08-04 14:40:13,848 INFO SenderThread:11999 [sender.py:send_artifact():1494] sent artifact job-https___github.com_cl-tohoku_llm-recipes-failab-m1-yans.git_examples_finetuning.py - {'id': 'QXJ0aWZhY3Q6MTA5MTk2NTkzOA==', 'state': 'COMMITTED', 'artifactSequence': {'id': 'QXJ0aWZhY3RDb2xsZWN0aW9uOjM2MjY3MjMzNA==', 'latestArtifact': {'id': 'QXJ0aWZhY3Q6MTA5MzUzODM4NQ==', 'versionIndex': 3}}}
120
+ 2024-08-04 14:40:13,848 DEBUG SenderThread:11999 [sender.py:send_request():409] send_request: defer
121
+ 2024-08-04 14:40:13,848 INFO SenderThread:11999 [sender.py:send_request_defer():613] handle sender defer: 9
122
+ 2024-08-04 14:40:13,848 INFO SenderThread:11999 [dir_watcher.py:finish():358] shutting down directory watcher
123
+ 2024-08-04 14:40:14,116 INFO SenderThread:11999 [dir_watcher.py:finish():388] scan: /project/wandb/run-20240804_144007-dds6qqbt/files
124
+ 2024-08-04 14:40:14,116 INFO SenderThread:11999 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_144007-dds6qqbt/files/requirements.txt requirements.txt
125
+ 2024-08-04 14:40:14,116 INFO SenderThread:11999 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_144007-dds6qqbt/files/config.yaml config.yaml
126
+ 2024-08-04 14:40:14,118 INFO SenderThread:11999 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_144007-dds6qqbt/files/wandb-metadata.json wandb-metadata.json
127
+ 2024-08-04 14:40:14,118 INFO SenderThread:11999 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_144007-dds6qqbt/files/wandb-summary.json wandb-summary.json
128
+ 2024-08-04 14:40:14,119 INFO SenderThread:11999 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_144007-dds6qqbt/files/output.log output.log
129
+ 2024-08-04 14:40:14,121 INFO SenderThread:11999 [sender.py:transition_state():617] send defer: 10
130
+ 2024-08-04 14:40:14,121 DEBUG SenderThread:11999 [sender.py:send_request():409] send_request: poll_exit
131
+ 2024-08-04 14:40:14,121 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: defer
132
+ 2024-08-04 14:40:14,121 INFO HandlerThread:11999 [handler.py:handle_request_defer():172] handle defer: 10
133
+ 2024-08-04 14:40:14,123 DEBUG SenderThread:11999 [sender.py:send_request():409] send_request: defer
134
+ 2024-08-04 14:40:14,123 INFO SenderThread:11999 [sender.py:send_request_defer():613] handle sender defer: 10
135
+ 2024-08-04 14:40:14,123 INFO SenderThread:11999 [file_pusher.py:finish():172] shutting down file pusher
136
+ 2024-08-04 14:40:14,515 INFO wandb-upload_0:11999 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_144007-dds6qqbt/files/requirements.txt
137
+ 2024-08-04 14:40:14,617 INFO wandb-upload_1:11999 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_144007-dds6qqbt/files/config.yaml
138
+ 2024-08-04 14:40:14,698 INFO wandb-upload_2:11999 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_144007-dds6qqbt/files/wandb-summary.json
139
+ 2024-08-04 14:40:14,843 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: poll_exit
140
+ 2024-08-04 14:40:14,843 DEBUG SenderThread:11999 [sender.py:send_request():409] send_request: poll_exit
141
+ 2024-08-04 14:40:15,184 INFO wandb-upload_3:11999 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_144007-dds6qqbt/files/output.log
142
+ 2024-08-04 14:40:15,384 INFO Thread-11 (_thread_body):11999 [sender.py:transition_state():617] send defer: 11
143
+ 2024-08-04 14:40:15,384 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: defer
144
+ 2024-08-04 14:40:15,385 INFO HandlerThread:11999 [handler.py:handle_request_defer():172] handle defer: 11
145
+ 2024-08-04 14:40:15,385 DEBUG SenderThread:11999 [sender.py:send_request():409] send_request: defer
146
+ 2024-08-04 14:40:15,385 INFO SenderThread:11999 [sender.py:send_request_defer():613] handle sender defer: 11
147
+ 2024-08-04 14:40:15,385 INFO SenderThread:11999 [file_pusher.py:join():178] waiting for file pusher
148
+ 2024-08-04 14:40:15,385 INFO SenderThread:11999 [sender.py:transition_state():617] send defer: 12
149
+ 2024-08-04 14:40:15,385 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: defer
150
+ 2024-08-04 14:40:15,385 INFO HandlerThread:11999 [handler.py:handle_request_defer():172] handle defer: 12
151
+ 2024-08-04 14:40:15,385 DEBUG SenderThread:11999 [sender.py:send_request():409] send_request: defer
152
+ 2024-08-04 14:40:15,386 INFO SenderThread:11999 [sender.py:send_request_defer():613] handle sender defer: 12
153
+ 2024-08-04 14:40:15,386 INFO SenderThread:11999 [file_stream.py:finish():595] file stream finish called
154
+ 2024-08-04 14:40:15,573 INFO SenderThread:11999 [file_stream.py:finish():599] file stream finish is done
155
+ 2024-08-04 14:40:15,573 INFO SenderThread:11999 [sender.py:transition_state():617] send defer: 13
156
+ 2024-08-04 14:40:15,573 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: defer
157
+ 2024-08-04 14:40:15,573 INFO HandlerThread:11999 [handler.py:handle_request_defer():172] handle defer: 13
158
+ 2024-08-04 14:40:15,573 DEBUG SenderThread:11999 [sender.py:send_request():409] send_request: defer
159
+ 2024-08-04 14:40:15,574 INFO SenderThread:11999 [sender.py:send_request_defer():613] handle sender defer: 13
160
+ 2024-08-04 14:40:15,574 INFO SenderThread:11999 [sender.py:transition_state():617] send defer: 14
161
+ 2024-08-04 14:40:15,574 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: defer
162
+ 2024-08-04 14:40:15,574 DEBUG SenderThread:11999 [sender.py:send():382] send: final
163
+ 2024-08-04 14:40:15,574 INFO HandlerThread:11999 [handler.py:handle_request_defer():172] handle defer: 14
164
+ 2024-08-04 14:40:15,574 DEBUG SenderThread:11999 [sender.py:send():382] send: footer
165
+ 2024-08-04 14:40:15,574 DEBUG SenderThread:11999 [sender.py:send_request():409] send_request: defer
166
+ 2024-08-04 14:40:15,574 INFO SenderThread:11999 [sender.py:send_request_defer():613] handle sender defer: 14
167
+ 2024-08-04 14:40:15,575 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: poll_exit
168
+ 2024-08-04 14:40:15,575 DEBUG SenderThread:11999 [sender.py:send_request():409] send_request: poll_exit
169
+ 2024-08-04 14:40:15,575 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: poll_exit
170
+ 2024-08-04 14:40:15,576 DEBUG SenderThread:11999 [sender.py:send_request():409] send_request: poll_exit
171
+ 2024-08-04 14:40:15,576 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: server_info
172
+ 2024-08-04 14:40:15,576 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: get_summary
173
+ 2024-08-04 14:40:15,576 DEBUG SenderThread:11999 [sender.py:send_request():409] send_request: server_info
174
+ 2024-08-04 14:40:15,578 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: sampled_history
175
+ 2024-08-04 14:40:15,578 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: internal_messages
176
+ 2024-08-04 14:40:15,578 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: job_info
177
+ 2024-08-04 14:40:15,734 DEBUG SenderThread:11999 [sender.py:send_request():409] send_request: job_info
178
+ 2024-08-04 14:40:15,735 INFO MainThread:11999 [wandb_run.py:_footer_history_summary_info():3866] rendering history
179
+ 2024-08-04 14:40:15,735 INFO MainThread:11999 [wandb_run.py:_footer_history_summary_info():3898] rendering summary
180
+ 2024-08-04 14:40:15,735 INFO MainThread:11999 [wandb_run.py:_footer_sync_info():3825] logging synced files
181
+ 2024-08-04 14:40:15,735 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: shutdown
182
+ 2024-08-04 14:40:15,735 INFO HandlerThread:11999 [handler.py:finish():869] shutting down handler
183
+ 2024-08-04 14:40:16,578 INFO WriterThread:11999 [datastore.py:close():296] close: /project/wandb/run-20240804_144007-dds6qqbt/run-dds6qqbt.wandb
184
+ 2024-08-04 14:40:16,735 INFO SenderThread:11999 [sender.py:finish():1572] shutting down sender
185
+ 2024-08-04 14:40:16,735 INFO SenderThread:11999 [file_pusher.py:finish():172] shutting down file pusher
186
+ 2024-08-04 14:40:16,735 INFO SenderThread:11999 [file_pusher.py:join():178] waiting for file pusher
wandb/run-20240804_144007-dds6qqbt/logs/debug.log ADDED
@@ -0,0 +1,30 @@
1
+ 2024-08-04 14:40:07,600 INFO MainThread:11928 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
2
+ 2024-08-04 14:40:07,601 INFO MainThread:11928 [wandb_setup.py:_flush():76] Configure stats pid to 11928
3
+ 2024-08-04 14:40:07,601 INFO MainThread:11928 [wandb_setup.py:_flush():76] Loading settings from /singularity_home/.config/wandb/settings
4
+ 2024-08-04 14:40:07,601 INFO MainThread:11928 [wandb_setup.py:_flush():76] Loading settings from /project/wandb/settings
5
+ 2024-08-04 14:40:07,601 INFO MainThread:11928 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'api_key': '***REDACTED***', 'run_notes': 'Train tiny llama sample'}
6
+ 2024-08-04 14:40:07,601 INFO MainThread:11928 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
7
+ 2024-08-04 14:40:07,601 INFO MainThread:11928 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'examples/finetuning.py', 'program_abspath': '/project/examples/finetuning.py', 'program': '/project/examples/finetuning.py'}
8
+ 2024-08-04 14:40:07,601 INFO MainThread:11928 [wandb_init.py:_log_setup():526] Logging user logs to /project/wandb/run-20240804_144007-dds6qqbt/logs/debug.log
9
+ 2024-08-04 14:40:07,601 INFO MainThread:11928 [wandb_init.py:_log_setup():527] Logging internal logs to /project/wandb/run-20240804_144007-dds6qqbt/logs/debug-internal.log
10
+ 2024-08-04 14:40:07,601 INFO MainThread:11928 [wandb_init.py:init():566] calling init triggers
11
+ 2024-08-04 14:40:07,601 INFO MainThread:11928 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
12
+ config: {'sharding_strategy': 'FULL_SHARD', 'checkpoint_type': 'LOCAL_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'low_cpu_fsdp': False, 'no_meta_device': False, 'data_path': None, 'split': '969, 30, 1', 'train_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'valid_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'test_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'data_cache_path': None, 'vocab_size': None, 'vocab_file': None, 'merge_file': None, 'seq_length': 512, 'num_workers': 2, 'tokenizer_type': 'Llama2Tokenizer', 'tokenizer_model': '/share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'retro_return_doc_ids': False, 'short_seq_prob': 0.1, 'vocab_extra_ids': 0, 'seed': 1234, 'use_mpi': False, 'wandb_entity': 'iwakawa-koichi-q5-tohoku-nlp6723', 'wandb_name': 'tiny-llama_train_2024-08-04-14:39:57', 'wandb_project': 'llm_tutorial', 'quantization': False, 'use_freeze_layers': False, 'freeze_layers': None, 'bf16': True, 'fp16': False, 'mixed_precision': True, 'param_dtype': None, 'load': '/work/llm_recipes/models/tiny-llama', 'save': '/work/llm_recipes/models/tiny-llama', 'base_model': '/share/pretrained_lm/meta-llama/TinyLlama_v1.1', 'use_better_transformer': False, 'grad_clip_norm': 1.0, 'eval_interval': 200, 'save_interval': 200, 'eval_iters': 10, 'optimizer': 'adam', 'lr': 2e-05, 'lr_decay_style': 'cosine', 'lr_decay_iters': 2000, 'lr_warmup_iters': 500, 'min_lr': 1e-06, 'train_iters': 2000, 'train_samples': None, 'global_batch_size': 320, 'micro_batch_size': 8, 'make_vocab_size_divisible_by': 128, 'sliding_window_size': 4096, 'skip_batch': None, 'no_save_optimizer_state': False, 'continual_pretraining': False, 'instruction_tuning': False, 'direct_preference_optimization': False, 'attention_dropout': 0.1, 'hidden_dropout': 0.1, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-06, 'hf_transformer_model_dir': None, 'instruction_train_data_path': None, 'instruction_valid_data_path': None, 'epoch': None, 'instruction_dataset_size': None, 'save_sampler_state': False, 'label_smoothing': 0.0, 'save_n_checkpoints': 10, 'hf_repo_id': 'koichi12/tiny-llama', 'create_public_hf_repo': False, 'upload_all_checkpoints_to_hf': False, 'hf_upload_retry_limit': 2, 'exit_duration_in_mins': None, 'source_key': None, 'target_key': None, 'attn_implementation': 'flash_attention_2', 'efficient_instruction_tuning': False, 'remove_padding_masking': False, 'save_start_iter': None, 'rank': 0, 'world_size': 1, 'padded_vocab_size': 32000, 'gradient_accumulation_steps': 40}
13
+ 2024-08-04 14:40:07,602 INFO MainThread:11928 [wandb_init.py:init():616] starting backend
14
+ 2024-08-04 14:40:07,602 INFO MainThread:11928 [wandb_init.py:init():620] setting up manager
15
+ 2024-08-04 14:40:07,606 INFO MainThread:11928 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
16
+ 2024-08-04 14:40:07,607 INFO MainThread:11928 [wandb_init.py:init():628] backend started and connected
17
+ 2024-08-04 14:40:07,612 INFO MainThread:11928 [wandb_init.py:init():720] updated telemetry
18
+ 2024-08-04 14:40:07,623 INFO MainThread:11928 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
19
+ 2024-08-04 14:40:08,115 INFO MainThread:11928 [wandb_run.py:_on_init():2262] communicating current version
20
+ 2024-08-04 14:40:08,197 INFO MainThread:11928 [wandb_run.py:_on_init():2271] got version response upgrade_message: "wandb version 0.17.5 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
21
+
22
+ 2024-08-04 14:40:08,197 INFO MainThread:11928 [wandb_init.py:init():804] starting run threads in backend
23
+ 2024-08-04 14:40:08,254 INFO MainThread:11928 [wandb_run.py:_console_start():2241] atexit reg
24
+ 2024-08-04 14:40:08,254 INFO MainThread:11928 [wandb_run.py:_redirect():2096] redirect: wrap_raw
25
+ 2024-08-04 14:40:08,254 INFO MainThread:11928 [wandb_run.py:_redirect():2161] Wrapping output streams.
26
+ 2024-08-04 14:40:08,255 INFO MainThread:11928 [wandb_run.py:_redirect():2186] Redirects installed.
27
+ 2024-08-04 14:40:08,255 INFO MainThread:11928 [wandb_init.py:init():847] run started, returning control to user process
28
+ 2024-08-04 14:40:10,755 INFO MainThread:11928 [wandb_run.py:_config_callback():1343] config_cb None None {'activation_function': 'silu', 'hidden_size': 2048, 'model_type': 'llama', 'max_position_embeddings': 2048, 'num_attention_heads': 32, 'num_hidden_layers': 22, 'model_architecture': 'LlamaForCausalLM'}
29
+ 2024-08-04 14:40:10,755 INFO MainThread:11928 [wandb_run.py:_config_callback():1343] config_cb None None {'world_size': 1}
30
+ 2024-08-04 14:40:16,736 WARNING MsgRouterThr:11928 [router.py:message_loop():77] message_loop has been closed
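The debug.log above records wandb.init being called with the run's full training config (entity iwakawa-koichi-q5-tohoku-nlp6723, project llm_tutorial, run name tiny-llama_train_2024-08-04-14:39:57). As a rough illustration only, not the training script itself, here is a minimal Python sketch of starting an equivalent run with the wandb API, using a small subset of the config values shown in the log; everything beyond those logged values is illustrative.

import wandb

# Values copied from the config recorded in debug.log above; the snippet
# itself is a sketch, not code taken from this repository.
config = {
    "seq_length": 512,
    "global_batch_size": 320,
    "micro_batch_size": 8,
    "gradient_accumulation_steps": 40,
    "lr": 2e-05,
    "train_iters": 2000,
    "base_model": "/share/pretrained_lm/meta-llama/TinyLlama_v1.1",
}

run = wandb.init(
    entity="iwakawa-koichi-q5-tohoku-nlp6723",
    project="llm_tutorial",
    name="tiny-llama_train_2024-08-04-14:39:57",
    config=config,
)
run.finish()

entity, project, name, and config are standard wandb.init keyword arguments; the log's "Loading settings from environment variables" line shows the API key and run notes being supplied through the environment rather than in code.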
wandb/run-20240804_144007-dds6qqbt/run-dds6qqbt.wandb ADDED
Binary file (20.5 kB).
 
wandb/run-20240804_222226-kh5katc1/files/config.yaml ADDED
@@ -0,0 +1,335 @@
1
+ wandb_version: 1
2
+
3
+ sharding_strategy:
4
+ desc: null
5
+ value: FULL_SHARD
6
+ checkpoint_type:
7
+ desc: null
8
+ value: LOCAL_STATE_DICT
9
+ fsdp_activation_checkpointing:
10
+ desc: null
11
+ value: true
12
+ fsdp_cpu_offload:
13
+ desc: null
14
+ value: false
15
+ low_cpu_fsdp:
16
+ desc: null
17
+ value: false
18
+ no_meta_device:
19
+ desc: null
20
+ value: false
21
+ data_path:
22
+ desc: null
23
+ value: null
24
+ split:
25
+ desc: null
26
+ value: 969, 30, 1
27
+ train_data_path:
28
+ desc: null
29
+ value:
30
+ - '235289369'
31
+ - /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document
32
+ valid_data_path:
33
+ desc: null
34
+ value:
35
+ - '235289369'
36
+ - /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document
37
+ test_data_path:
38
+ desc: null
39
+ value:
40
+ - '235289369'
41
+ - /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document
42
+ data_cache_path:
43
+ desc: null
44
+ value: null
45
+ vocab_size:
46
+ desc: null
47
+ value: null
48
+ vocab_file:
49
+ desc: null
50
+ value: null
51
+ merge_file:
52
+ desc: null
53
+ value: null
54
+ seq_length:
55
+ desc: null
56
+ value: 4096
57
+ num_workers:
58
+ desc: null
59
+ value: 2
60
+ tokenizer_type:
61
+ desc: null
62
+ value: HFPreTrainedTokenizer
63
+ tokenizer_model:
64
+ desc: null
65
+ value: /share/pretrained_lm/google/gemma-2-2b
66
+ reset_position_ids:
67
+ desc: null
68
+ value: false
69
+ reset_attention_mask:
70
+ desc: null
71
+ value: false
72
+ eod_mask_loss:
73
+ desc: null
74
+ value: false
75
+ retro_return_doc_ids:
76
+ desc: null
77
+ value: false
78
+ short_seq_prob:
79
+ desc: null
80
+ value: 0.1
81
+ vocab_extra_ids:
82
+ desc: null
83
+ value: 0
84
+ seed:
85
+ desc: null
86
+ value: 1234
87
+ use_mpi:
88
+ desc: null
89
+ value: false
90
+ wandb_entity:
91
+ desc: null
92
+ value: iwakawa-koichi-q5-tohoku-nlp6723
93
+ wandb_name:
94
+ desc: null
95
+ value: yans-sample-gemma-2-2b_train_2024-08-04-22:22:15
96
+ wandb_project:
97
+ desc: null
98
+ value: llm_tutorial
99
+ quantization:
100
+ desc: null
101
+ value: false
102
+ use_freeze_layers:
103
+ desc: null
104
+ value: false
105
+ freeze_layers:
106
+ desc: null
107
+ value: null
108
+ bf16:
109
+ desc: null
110
+ value: true
111
+ fp16:
112
+ desc: null
113
+ value: false
114
+ mixed_precision:
115
+ desc: null
116
+ value: true
117
+ param_dtype:
118
+ desc: null
119
+ value: null
120
+ load:
121
+ desc: null
122
+ value: /work/llm_recipes/models/yans-sample-gemma-2-2b
123
+ save:
124
+ desc: null
125
+ value: /work/llm_recipes/models/yans-sample-gemma-2-2b
126
+ base_model:
127
+ desc: null
128
+ value: /share/pretrained_lm/google/gemma-2-2b
129
+ use_better_transformer:
130
+ desc: null
131
+ value: false
132
+ grad_clip_norm:
133
+ desc: null
134
+ value: 1.0
135
+ eval_interval:
136
+ desc: null
137
+ value: 200
138
+ save_interval:
139
+ desc: null
140
+ value: 200
141
+ eval_iters:
142
+ desc: null
143
+ value: 10
144
+ optimizer:
145
+ desc: null
146
+ value: anyprecision
147
+ lr:
148
+ desc: null
149
+ value: 2.0e-05
150
+ lr_decay_style:
151
+ desc: null
152
+ value: cosine
153
+ lr_decay_iters:
154
+ desc: null
155
+ value: 20000
156
+ lr_warmup_iters:
157
+ desc: null
158
+ value: 500
159
+ min_lr:
160
+ desc: null
161
+ value: 1.0e-06
162
+ train_iters:
163
+ desc: null
164
+ value: 20000
165
+ train_samples:
166
+ desc: null
167
+ value: null
168
+ global_batch_size:
169
+ desc: null
170
+ value: 320
171
+ micro_batch_size:
172
+ desc: null
173
+ value: 1
174
+ make_vocab_size_divisible_by:
175
+ desc: null
176
+ value: 128
177
+ sliding_window_size:
178
+ desc: null
179
+ value: 4096
180
+ skip_batch:
181
+ desc: null
182
+ value: null
183
+ no_save_optimizer_state:
184
+ desc: null
185
+ value: false
186
+ continual_pretraining:
187
+ desc: null
188
+ value: false
189
+ instruction_tuning:
190
+ desc: null
191
+ value: false
192
+ direct_preference_optimization:
193
+ desc: null
194
+ value: false
195
+ attention_dropout:
196
+ desc: null
197
+ value: 0.1
198
+ hidden_dropout:
199
+ desc: null
200
+ value: 0.1
201
+ weight_decay:
202
+ desc: null
203
+ value: 0.1
204
+ adam_beta1:
205
+ desc: null
206
+ value: 0.9
207
+ adam_beta2:
208
+ desc: null
209
+ value: 0.95
210
+ adam_eps:
211
+ desc: null
212
+ value: 1.0e-06
213
+ hf_transformer_model_dir:
214
+ desc: null
215
+ value: null
216
+ instruction_train_data_path:
217
+ desc: null
218
+ value: null
219
+ instruction_valid_data_path:
220
+ desc: null
221
+ value: null
222
+ epoch:
223
+ desc: null
224
+ value: null
225
+ instruction_dataset_size:
226
+ desc: null
227
+ value: null
228
+ save_sampler_state:
229
+ desc: null
230
+ value: false
231
+ label_smoothing:
232
+ desc: null
233
+ value: 0.0
234
+ save_n_checkpoints:
235
+ desc: null
236
+ value: 10
237
+ hf_repo_id:
238
+ desc: null
239
+ value: koichi12/yans-sample-gemma-2-2b
240
+ create_public_hf_repo:
241
+ desc: null
242
+ value: false
243
+ upload_all_checkpoints_to_hf:
244
+ desc: null
245
+ value: false
246
+ hf_upload_retry_limit:
247
+ desc: null
248
+ value: 2
249
+ exit_duration_in_mins:
250
+ desc: null
251
+ value: null
252
+ source_key:
253
+ desc: null
254
+ value: null
255
+ target_key:
256
+ desc: null
257
+ value: null
258
+ attn_implementation:
259
+ desc: null
260
+ value: flash_attention_2
261
+ efficient_instruction_tuning:
262
+ desc: null
263
+ value: false
264
+ remove_padding_masking:
265
+ desc: null
266
+ value: false
267
+ save_start_iter:
268
+ desc: null
269
+ value: null
270
+ rank:
271
+ desc: null
272
+ value: 0
273
+ world_size:
274
+ desc: null
275
+ value: 1
276
+ padded_vocab_size:
277
+ desc: null
278
+ value: 256000
279
+ gradient_accumulation_steps:
280
+ desc: null
281
+ value: 320
282
+ _wandb:
283
+ desc: null
284
+ value:
285
+ python_version: 3.10.12
286
+ cli_version: 0.16.3
287
+ framework: huggingface
288
+ huggingface_version: 4.43.3
289
+ is_jupyter_run: false
290
+ is_kaggle_kernel: false
291
+ start_time: 1722777746.267116
292
+ t:
293
+ 1:
294
+ - 1
295
+ - 11
296
+ - 49
297
+ - 55
298
+ - 71
299
+ 2:
300
+ - 1
301
+ - 11
302
+ - 49
303
+ - 55
304
+ - 71
305
+ 3:
306
+ - 13
307
+ - 16
308
+ - 23
309
+ 4: 3.10.12
310
+ 5: 0.16.3
311
+ 6: 4.43.3
312
+ 8:
313
+ - 5
314
+ 13: linux-x86_64
315
+ activation_function:
316
+ desc: null
317
+ value: gelu_pytorch_tanh
318
+ hidden_size:
319
+ desc: null
320
+ value: 2304
321
+ model_type:
322
+ desc: null
323
+ value: gemma2
324
+ max_position_embeddings:
325
+ desc: null
326
+ value: 4096
327
+ num_attention_heads:
328
+ desc: null
329
+ value: 8
330
+ num_hidden_layers:
331
+ desc: null
332
+ value: 26
333
+ model_architecture:
334
+ desc: null
335
+ value: Gemma2ForCausalLM
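This config.yaml records global_batch_size 320, micro_batch_size 1, world_size 1, and gradient_accumulation_steps 320. As a quick consistency check, here is a sketch of the usual batch-size bookkeeping with the values copied from the config above; the formula is the standard relation, not code from this repository.

# Standard relation between global batch size, micro batch size,
# data-parallel world size, and gradient accumulation steps.
global_batch_size = 320
micro_batch_size = 1
world_size = 1  # data-parallel ranks

gradient_accumulation_steps = global_batch_size // (micro_batch_size * world_size)
assert gradient_accumulation_steps == 320  # matches the value recorded above

The earlier tiny-llama run (micro_batch_size 8) logs gradient_accumulation_steps 40 for the same global batch size of 320, which follows from the same relation.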
wandb/run-20240804_222226-kh5katc1/files/output.log ADDED
@@ -0,0 +1,468 @@
1
+ Created Hugging Face repository with ID koichi12/yans-sample-gemma-2-2b.
2
+ Clearing GPU cache for all ranks
3
+ --> Running with torch torch_distributed debug set to detail
4
+ File not found: /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
5
+ Unable to read latest iteration from /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
6
+ File not found: /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
7
+ Unable to read latest iteration from /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
8
+ You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
9
+
10
+ Loading checkpoint shards: 67%|██████▋ | 2/3 [00:03<00:01, 1.62s/it]
11
+ File not found: /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
12
+ Unable to read latest iteration from /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
13
+ No checkpoint found in /work/llm_recipes/models/yans-sample-gemma-2-2b, skipping model loading
14
+ --> Model /share/pretrained_lm/google/gemma-2-2b
15
+ --> /share/pretrained_lm/google/gemma-2-2b has 2614.341888 Million params
16
+ BFloat16 enabled for mixed precision - using bfSixteen policy
17
+ --> applying fsdp activation checkpointing...
18
+ > datasets target sizes (minimum size):
19
+ train: 6400000
20
+ validation: 323200
21
+ test: 3200
22
+ > building train, validation, and test datasets for GPT ...
23
+ > finished creating GPT datasets ...
24
+ File not found: /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
25
+ Unable to read latest iteration from /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
26
+ No checkpoint found in /work/llm_recipes/models/yans-sample-gemma-2-2b, skipping optimizer loading
27
+ File not found: /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
28
+ Unable to read latest iteration from /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
29
+ model info: FullyShardedDataParallel(
30
+ (_fsdp_wrapped_module): Gemma2ForCausalLM(
31
+ (model): Gemma2Model(
32
+ (embed_tokens): Embedding(256000, 2304, padding_idx=0)
33
+ (layers): ModuleList(
34
+ (0-25): 26 x FullyShardedDataParallel(
35
+ (_fsdp_wrapped_module): CheckpointWrapper(
36
+ (_checkpoint_wrapped_module): Gemma2DecoderLayer(
37
+ (self_attn): Gemma2FlashAttention2(
38
+ (q_proj): Linear(in_features=2304, out_features=2048, bias=False)
39
+ (k_proj): Linear(in_features=2304, out_features=1024, bias=False)
40
+ (v_proj): Linear(in_features=2304, out_features=1024, bias=False)
41
+ (o_proj): Linear(in_features=2048, out_features=2304, bias=False)
42
+ (rotary_emb): Gemma2RotaryEmbedding()
43
+ )
44
+ (mlp): Gemma2MLP(
45
+ (gate_proj): Linear(in_features=2304, out_features=9216, bias=False)
46
+ (up_proj): Linear(in_features=2304, out_features=9216, bias=False)
47
+ (down_proj): Linear(in_features=9216, out_features=2304, bias=False)
48
+ (act_fn): PytorchGELUTanh()
49
+ )
50
+ (input_layernorm): Gemma2RMSNorm()
51
+ (post_attention_layernorm): Gemma2RMSNorm()
52
+ (pre_feedforward_layernorm): Gemma2RMSNorm()
53
+ (post_feedforward_layernorm): Gemma2RMSNorm()
54
+ )
55
+ )
56
+ )
57
+ )
58
+ (norm): Gemma2RMSNorm()
59
+ )
60
+ (lm_head): Linear(in_features=2304, out_features=256000, bias=False)
61
+ )
62
+ )
63
+ model config: Gemma2Config {
64
+ "_name_or_path": "/share/pretrained_lm/google/gemma-2-2b",
65
+ "architectures": [
66
+ "Gemma2ForCausalLM"
67
+ ],
68
+ "attention_bias": false,
69
+ "attention_dropout": 0.0,
70
+ "attn_logit_softcapping": 50.0,
71
+ "bos_token_id": 2,
72
+ "cache_implementation": "hybrid",
73
+ "eos_token_id": 1,
74
+ "final_logit_softcapping": 30.0,
75
+ "head_dim": 256,
76
+ "hidden_act": "gelu_pytorch_tanh",
77
+ "hidden_activation": "gelu_pytorch_tanh",
78
+ "hidden_size": 2304,
79
+ "initializer_range": 0.02,
80
+ "intermediate_size": 9216,
81
+ "label_smoothing": 0.0,
82
+ "max_position_embeddings": 4096,
83
+ "model_type": "gemma2",
84
+ "num_attention_heads": 8,
85
+ "num_hidden_layers": 26,
86
+ "num_key_value_heads": 4,
87
+ "pad_token_id": 0,
88
+ "query_pre_attn_scalar": 256,
89
+ "rms_norm_eps": 1e-06,
90
+ "rope_theta": 10000.0,
91
+ "sliding_window": 4096,
92
+ "torch_dtype": "float32",
93
+ "transformers_version": "4.43.3",
94
+ "use_cache": false,
95
+ "vocab_size": 256000
96
+ Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00, 1.16s/it]
97
+ /usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_init_utils.py:441: UserWarning: FSDP is switching to use `NO_SHARD` instead of ShardingStrategy.FULL_SHARD since the world size is 1.
98
+ warnings.warn(
99
+ Let split = None
100
+ Building a BlendedDataset for a single MegatronDataset
101
+ Unable to save the indexes because path_to_cache is None
102
+ Building a BlendedDataset for a single MegatronDataset
103
+ Unable to save the indexes because path_to_cache is None
104
+ Building a BlendedDataset for a single MegatronDataset
105
+ Unable to save the indexes because path_to_cache is None
106
+ It is strongly recommended to train Gemma2 models with the `eager` attention implementation instead of `flash_attention_2`. Use `eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`.
107
+ ------------------------------------------------------------------
108
+ iteration: 1 , TFLOPS: 86.75197547568487, Tokens per sec: 5563.411303067021, Loss: 4.171908378601074
109
+ ------------------------------------------------------------------
110
+ ------------------------------------------------------------------
111
+ iteration: 2 , TFLOPS: 66.89933870537911, Tokens per sec: 4290.26007857923, Loss: 4.01677942276001
112
+ ------------------------------------------------------------------
113
+ ------------------------------------------------------------------
114
+ iteration: 3 , TFLOPS: 67.16726893555325, Tokens per sec: 4307.442466217215, Loss: 3.9401252269744873
115
+ ------------------------------------------------------------------
116
+ ------------------------------------------------------------------
117
+ iteration: 4 , TFLOPS: 67.25290490347041, Tokens per sec: 4312.934307864013, Loss: 3.754024028778076
118
+ ------------------------------------------------------------------
119
+ ------------------------------------------------------------------
120
+ iteration: 5 , TFLOPS: 67.291445985822, Tokens per sec: 4315.405950636545, Loss: 3.8183631896972656
121
+ ------------------------------------------------------------------
122
+ ------------------------------------------------------------------
123
+ iteration: 6 , TFLOPS: 67.19993814817916, Tokens per sec: 4309.537545502599, Loss: 3.913503408432007
124
+ ------------------------------------------------------------------
125
+ ------------------------------------------------------------------
126
+ iteration: 7 , TFLOPS: 67.30122810400093, Tokens per sec: 4316.033278677735, Loss: 3.851064682006836
127
+ ------------------------------------------------------------------
128
+ ------------------------------------------------------------------
129
+ iteration: 8 , TFLOPS: 67.16795653479824, Tokens per sec: 4307.486562013197, Loss: 3.6646127700805664
130
+ ------------------------------------------------------------------
131
+ ------------------------------------------------------------------
132
+ iteration: 9 , TFLOPS: 67.23016958415664, Tokens per sec: 4311.4762899715615, Loss: 3.7966654300689697
133
+ ------------------------------------------------------------------
134
+ ------------------------------------------------------------------
135
+ iteration: 10 , TFLOPS: 67.23271391538408, Tokens per sec: 4311.639458141876, Loss: 3.5526936054229736
136
+ ------------------------------------------------------------------
137
+ ------------------------------------------------------------------
138
+ iteration: 11 , TFLOPS: 67.17798338980677, Tokens per sec: 4308.129585047344, Loss: 3.6002132892608643
139
+ ------------------------------------------------------------------
140
+ ------------------------------------------------------------------
141
+ iteration: 12 , TFLOPS: 67.30360350705875, Tokens per sec: 4316.185613470676, Loss: 3.5705204010009766
142
+ ------------------------------------------------------------------
143
+ ------------------------------------------------------------------
144
+ iteration: 13 , TFLOPS: 67.13811947997524, Tokens per sec: 4305.573109240019, Loss: 3.5447990894317627
145
+ ------------------------------------------------------------------
146
+ ------------------------------------------------------------------
147
+ iteration: 14 , TFLOPS: 67.15854019228757, Tokens per sec: 4306.882691195614, Loss: 3.450416088104248
148
+ ------------------------------------------------------------------
149
+ ------------------------------------------------------------------
150
+ iteration: 15 , TFLOPS: 67.19845754951105, Tokens per sec: 4309.442594588568, Loss: 3.443570613861084
151
+ ------------------------------------------------------------------
152
+ ------------------------------------------------------------------
153
+ iteration: 16 , TFLOPS: 67.23455541812397, Tokens per sec: 4311.757553863634, Loss: 3.3366641998291016
154
+ ------------------------------------------------------------------
155
+ ------------------------------------------------------------------
156
+ iteration: 17 , TFLOPS: 67.30688001895524, Tokens per sec: 4316.395736447352, Loss: 3.332282066345215
157
+ ------------------------------------------------------------------
158
+ ------------------------------------------------------------------
159
+ iteration: 18 , TFLOPS: 67.36120902746241, Tokens per sec: 4319.879860219242, Loss: 3.34403395652771
160
+ ------------------------------------------------------------------
161
+ ------------------------------------------------------------------
162
+ iteration: 19 , TFLOPS: 67.26840440584516, Tokens per sec: 4313.928292222649, Loss: 3.256293773651123
163
+ ------------------------------------------------------------------
164
+ ------------------------------------------------------------------
165
+ iteration: 20 , TFLOPS: 67.17348341042366, Tokens per sec: 4307.8410010003945, Loss: 3.3122451305389404
166
+ ------------------------------------------------------------------
167
+ ------------------------------------------------------------------
168
+ iteration: 21 , TFLOPS: 67.2001168793811, Tokens per sec: 4309.54900754924, Loss: 3.2204227447509766
169
+ ------------------------------------------------------------------
170
+ ------------------------------------------------------------------
171
+ iteration: 22 , TFLOPS: 67.23699865533753, Tokens per sec: 4311.914238866545, Loss: 3.2488620281219482
172
+ ------------------------------------------------------------------
173
+ ------------------------------------------------------------------
174
+ iteration: 23 , TFLOPS: 67.2425865851171, Tokens per sec: 4312.272593261658, Loss: 3.163287401199341
175
+ ------------------------------------------------------------------
176
+ ------------------------------------------------------------------
177
+ iteration: 24 , TFLOPS: 67.21941753377986, Tokens per sec: 4310.786760098965, Loss: 3.2160401344299316
178
+ ------------------------------------------------------------------
179
+ ------------------------------------------------------------------
180
+ iteration: 25 , TFLOPS: 67.09871135713247, Tokens per sec: 4303.04586308967, Loss: 3.0935139656066895
181
+ ------------------------------------------------------------------
182
+ ------------------------------------------------------------------
183
+ iteration: 26 , TFLOPS: 67.20080576079224, Tokens per sec: 4309.593185570642, Loss: 3.047175168991089
184
+ ------------------------------------------------------------------
185
+ ------------------------------------------------------------------
186
+ iteration: 27 , TFLOPS: 67.27441115034365, Tokens per sec: 4314.313505240039, Loss: 3.0304696559906006
187
+ ------------------------------------------------------------------
188
+ ------------------------------------------------------------------
189
+ iteration: 28 , TFLOPS: 67.26365793583362, Tokens per sec: 4313.623900711482, Loss: 3.0319135189056396
190
+ ------------------------------------------------------------------
191
+ ------------------------------------------------------------------
192
+ iteration: 29 , TFLOPS: 67.16464708688589, Tokens per sec: 4307.27432684712, Loss: 2.959254264831543
193
+ ------------------------------------------------------------------
194
+ ------------------------------------------------------------------
195
+ iteration: 30 , TFLOPS: 67.3000542568793, Tokens per sec: 4315.957999765541, Loss: 2.913499116897583
196
+ ------------------------------------------------------------------
197
+ ------------------------------------------------------------------
198
+ iteration: 31 , TFLOPS: 67.18211917043104, Tokens per sec: 4308.3948129980145, Loss: 2.940014362335205
199
+ ------------------------------------------------------------------
200
+ ------------------------------------------------------------------
201
+ iteration: 32 , TFLOPS: 67.25841762372463, Tokens per sec: 4313.287839066096, Loss: 2.8469998836517334
202
+ ------------------------------------------------------------------
203
+ ------------------------------------------------------------------
204
+ iteration: 33 , TFLOPS: 67.33731321073192, Tokens per sec: 4318.347419532266, Loss: 2.829812526702881
205
+ ------------------------------------------------------------------
206
+ ------------------------------------------------------------------
207
+ iteration: 34 , TFLOPS: 67.24161982046462, Tokens per sec: 4312.210594565195, Loss: 2.8521993160247803
208
+ ------------------------------------------------------------------
209
+ ------------------------------------------------------------------
210
+ iteration: 35 , TFLOPS: 67.24248740627992, Tokens per sec: 4312.266232914695, Loss: 2.8338708877563477
211
+ ------------------------------------------------------------------
212
+ ------------------------------------------------------------------
213
+ iteration: 36 , TFLOPS: 67.24777489174788, Tokens per sec: 4312.60531979146, Loss: 2.787545680999756
214
+ ------------------------------------------------------------------
215
+ ------------------------------------------------------------------
216
+ iteration: 37 , TFLOPS: 67.30205154448893, Tokens per sec: 4316.086085983773, Loss: 2.81471848487854
217
+ ------------------------------------------------------------------
218
+ ------------------------------------------------------------------
219
+ iteration: 38 , TFLOPS: 67.13737290861587, Tokens per sec: 4305.525231557506, Loss: 2.7764387130737305
220
+ ------------------------------------------------------------------
221
+ ------------------------------------------------------------------
222
+ iteration: 39 , TFLOPS: 67.22735358248879, Tokens per sec: 4311.295699553621, Loss: 2.7642412185668945
223
+ ------------------------------------------------------------------
224
+ ------------------------------------------------------------------
225
+ iteration: 40 , TFLOPS: 67.26715109677696, Tokens per sec: 4313.847917409303, Loss: 2.7132599353790283
226
+ ------------------------------------------------------------------
227
+ ------------------------------------------------------------------
228
+ iteration: 41 , TFLOPS: 67.23918606123682, Tokens per sec: 4312.054517386288, Loss: 2.668989896774292
229
+ ------------------------------------------------------------------
230
+ ------------------------------------------------------------------
231
+ iteration: 42 , TFLOPS: 67.13128246048267, Tokens per sec: 4305.134650619155, Loss: 2.6973328590393066
232
+ ------------------------------------------------------------------
233
+ ------------------------------------------------------------------
234
+ iteration: 43 , TFLOPS: 67.23091373690416, Tokens per sec: 4311.524012548299, Loss: 2.685912609100342
235
+ ------------------------------------------------------------------
236
+ ------------------------------------------------------------------
237
+ iteration: 44 , TFLOPS: 67.27693115124784, Tokens per sec: 4314.475113104727, Loss: 2.662001371383667
238
+ ------------------------------------------------------------------
239
+ ------------------------------------------------------------------
240
+ iteration: 45 , TFLOPS: 67.27965002709941, Tokens per sec: 4314.649474836105, Loss: 2.6665873527526855
241
+ ------------------------------------------------------------------
242
+ ------------------------------------------------------------------
243
+ iteration: 46 , TFLOPS: 67.15514015419501, Tokens per sec: 4306.664646473851, Loss: 2.6501307487487793
244
+ ------------------------------------------------------------------
245
+ ------------------------------------------------------------------
246
+ iteration: 47 , TFLOPS: 67.2760527329066, Tokens per sec: 4314.418780064453, Loss: 2.6316823959350586
247
+ ------------------------------------------------------------------
248
+ ------------------------------------------------------------------
249
+ iteration: 48 , TFLOPS: 67.25548187637087, Tokens per sec: 4313.099569347494, Loss: 2.6278648376464844
250
+ ------------------------------------------------------------------
251
+ ------------------------------------------------------------------
252
+ iteration: 49 , TFLOPS: 67.35263957774154, Tokens per sec: 4319.330300705736, Loss: 2.6157166957855225
253
+ ------------------------------------------------------------------
254
+ ------------------------------------------------------------------
255
+ iteration: 50 , TFLOPS: 67.32408825677271, Tokens per sec: 4317.499302150089, Loss: 2.5965774059295654
256
+ ------------------------------------------------------------------
257
+ ------------------------------------------------------------------
258
+ iteration: 51 , TFLOPS: 67.1953666892378, Tokens per sec: 4309.244377465717, Loss: 2.578054904937744
259
+ ------------------------------------------------------------------
260
+ ------------------------------------------------------------------
261
+ iteration: 52 , TFLOPS: 67.25156682148656, Tokens per sec: 4312.848496556634, Loss: 2.5468966960906982
262
+ ------------------------------------------------------------------
263
+ ------------------------------------------------------------------
264
+ iteration: 53 , TFLOPS: 67.32404734871982, Tokens per sec: 4317.496678713301, Loss: 2.53428316116333
265
+ ------------------------------------------------------------------
266
+ ------------------------------------------------------------------
267
+ iteration: 54 , TFLOPS: 67.15867426285547, Tokens per sec: 4306.89128915213, Loss: 2.545722246170044
268
+ ------------------------------------------------------------------
269
+ ------------------------------------------------------------------
270
+ iteration: 55 , TFLOPS: 67.27601676163123, Tokens per sec: 4314.416473223611, Loss: 2.5279200077056885
271
+ ------------------------------------------------------------------
272
+ ------------------------------------------------------------------
273
+ iteration: 56 , TFLOPS: 67.19740155918589, Tokens per sec: 4309.374873842397, Loss: 2.534917116165161
274
+ ------------------------------------------------------------------
275
+ ------------------------------------------------------------------
276
+ iteration: 57 , TFLOPS: 67.2461120484207, Tokens per sec: 4312.498681512492, Loss: 2.5658233165740967
277
+ ------------------------------------------------------------------
278
+ ------------------------------------------------------------------
279
+ iteration: 58 , TFLOPS: 67.2920938769174, Tokens per sec: 4315.447499945635, Loss: 2.5472288131713867
280
+ ------------------------------------------------------------------
281
+ ------------------------------------------------------------------
282
+ iteration: 59 , TFLOPS: 67.27804058384706, Tokens per sec: 4314.546261108317, Loss: 2.4994900226593018
283
+ ------------------------------------------------------------------
284
+ ------------------------------------------------------------------
285
+ iteration: 60 , TFLOPS: 67.28150855801171, Tokens per sec: 4314.768662575956, Loss: 2.502976417541504
286
+ ------------------------------------------------------------------
287
+ ------------------------------------------------------------------
288
+ iteration: 61 , TFLOPS: 67.3506410671317, Tokens per sec: 4319.2021360563995, Loss: 2.5281176567077637
289
+ ------------------------------------------------------------------
290
+ ------------------------------------------------------------------
291
+ iteration: 62 , TFLOPS: 67.23894764547772, Tokens per sec: 4312.039227764101, Loss: 2.514285087585449
292
+ ------------------------------------------------------------------
293
+ ------------------------------------------------------------------
294
+ iteration: 63 , TFLOPS: 67.26110814707724, Tokens per sec: 4313.460382549388, Loss: 2.482907772064209
295
+ ------------------------------------------------------------------
296
+ ------------------------------------------------------------------
297
+ iteration: 64 , TFLOPS: 67.16648997644158, Tokens per sec: 4307.39251150549, Loss: 2.4810938835144043
298
+ ------------------------------------------------------------------
299
+ ------------------------------------------------------------------
300
+ iteration: 65 , TFLOPS: 67.13380749324574, Tokens per sec: 4305.2965811773665, Loss: 2.4889049530029297
301
+ ------------------------------------------------------------------
302
+ ------------------------------------------------------------------
303
+ iteration: 66 , TFLOPS: 67.29568135916668, Tokens per sec: 4315.677565476544, Loss: 2.4739832878112793
304
+ ------------------------------------------------------------------
305
+ ------------------------------------------------------------------
306
+ iteration: 67 , TFLOPS: 67.2353824902874, Tokens per sec: 4311.810594069316, Loss: 2.4979248046875
307
+ ------------------------------------------------------------------
308
+ ------------------------------------------------------------------
309
+ iteration: 68 , TFLOPS: 67.16737608801321, Tokens per sec: 4307.449337913261, Loss: 2.4705636501312256
310
+ ------------------------------------------------------------------
311
+ ------------------------------------------------------------------
312
+ iteration: 69 , TFLOPS: 67.17368447741053, Tokens per sec: 4307.853895442756, Loss: 2.431494951248169
313
+ ------------------------------------------------------------------
314
+ ------------------------------------------------------------------
315
+ iteration: 70 , TFLOPS: 67.27513003078525, Tokens per sec: 4314.3596071017255, Loss: 2.4638864994049072
316
+ ------------------------------------------------------------------
317
+ ------------------------------------------------------------------
318
+ iteration: 71 , TFLOPS: 67.13314091760232, Tokens per sec: 4305.253833626679, Loss: 2.4194881916046143
319
+ ------------------------------------------------------------------
320
+ ------------------------------------------------------------------
321
+ iteration: 72 , TFLOPS: 67.35945536468331, Tokens per sec: 4319.767397681375, Loss: 2.4741766452789307
322
+ ------------------------------------------------------------------
323
+ ------------------------------------------------------------------
324
+ iteration: 73 , TFLOPS: 67.22132247798172, Tokens per sec: 4310.908924326882, Loss: 2.438474416732788
325
+ ------------------------------------------------------------------
326
+ ------------------------------------------------------------------
327
+ iteration: 74 , TFLOPS: 67.20619442505729, Tokens per sec: 4309.9387610519625, Loss: 2.466714859008789
328
+ ------------------------------------------------------------------
329
+ ------------------------------------------------------------------
330
+ iteration: 75 , TFLOPS: 67.2254479385552, Tokens per sec: 4311.17349045185, Loss: 2.4174747467041016
331
+ ------------------------------------------------------------------
332
+ ------------------------------------------------------------------
333
+ iteration: 76 , TFLOPS: 67.24521841222351, Tokens per sec: 4312.441372549867, Loss: 2.424267053604126
334
+ ------------------------------------------------------------------
335
+ ------------------------------------------------------------------
336
+ iteration: 77 , TFLOPS: 67.22922395995721, Tokens per sec: 4311.415647014088, Loss: 2.404212474822998
337
+ ------------------------------------------------------------------
338
+ ------------------------------------------------------------------
339
+ iteration: 78 , TFLOPS: 67.23452652330809, Tokens per sec: 4311.755700836721, Loss: 2.450658082962036
340
+ ------------------------------------------------------------------
341
+ ------------------------------------------------------------------
342
+ iteration: 79 , TFLOPS: 67.0846114872016, Tokens per sec: 4302.141637274464, Loss: 2.4231417179107666
343
+ ------------------------------------------------------------------
344
+ ------------------------------------------------------------------
345
+ iteration: 80 , TFLOPS: 67.17704276320255, Tokens per sec: 4308.069262586061, Loss: 2.413994312286377
346
+ ------------------------------------------------------------------
347
+ ------------------------------------------------------------------
348
+ iteration: 81 , TFLOPS: 67.2345689529718, Tokens per sec: 4311.758421854535, Loss: 2.4133667945861816
349
+ ------------------------------------------------------------------
350
+ ------------------------------------------------------------------
351
+ iteration: 82 , TFLOPS: 67.18505033340458, Tokens per sec: 4308.582788719936, Loss: 2.389362335205078
352
+ ------------------------------------------------------------------
353
+ ------------------------------------------------------------------
354
+ iteration: 83 , TFLOPS: 67.28162310992364, Tokens per sec: 4314.776008799464, Loss: 2.4374401569366455
355
+ ------------------------------------------------------------------
356
+ ------------------------------------------------------------------
357
+ iteration: 84 , TFLOPS: 67.2334157092426, Tokens per sec: 4311.684464239587, Loss: 2.3909661769866943
358
+ ------------------------------------------------------------------
359
+ ------------------------------------------------------------------
360
+ iteration: 85 , TFLOPS: 67.31368056601009, Tokens per sec: 4316.831856087792, Loss: 2.411787748336792
361
+ ------------------------------------------------------------------
362
+ ------------------------------------------------------------------
363
+ iteration: 86 , TFLOPS: 67.11865914241415, Tokens per sec: 4304.325116195997, Loss: 2.4398515224456787
364
+ ------------------------------------------------------------------
365
+ ------------------------------------------------------------------
366
+ iteration: 87 , TFLOPS: 67.24083693352927, Tokens per sec: 4312.160387961816, Loss: 2.3902275562286377
367
+ ------------------------------------------------------------------
368
+ ------------------------------------------------------------------
369
+ iteration: 88 , TFLOPS: 67.3222851144248, Tokens per sec: 4317.383666483415, Loss: 2.3877973556518555
370
+ ------------------------------------------------------------------
371
+ ------------------------------------------------------------------
372
+ iteration: 89 , TFLOPS: 67.14511488288893, Tokens per sec: 4306.021725002672, Loss: 2.376176357269287
373
+ ------------------------------------------------------------------
374
+ ------------------------------------------------------------------
375
+ iteration: 90 , TFLOPS: 67.29125521000229, Tokens per sec: 4315.3937161675785, Loss: 2.3973848819732666
376
+ ------------------------------------------------------------------
377
+ ------------------------------------------------------------------
378
+ iteration: 91 , TFLOPS: 67.1356528047859, Tokens per sec: 4305.414921157799, Loss: 2.388991355895996
379
+ ------------------------------------------------------------------
380
+ ------------------------------------------------------------------
381
+ iteration: 92 , TFLOPS: 67.25754211457983, Tokens per sec: 4313.231692592827, Loss: 2.383312463760376
382
+ ------------------------------------------------------------------
383
+ ------------------------------------------------------------------
384
+ iteration: 93 , TFLOPS: 67.15498729921683, Tokens per sec: 4306.654843871562, Loss: 2.3923604488372803
385
+ ------------------------------------------------------------------
386
+ ------------------------------------------------------------------
387
+ iteration: 94 , TFLOPS: 67.32478814446085, Tokens per sec: 4317.544186004938, Loss: 2.3716728687286377
388
+ ------------------------------------------------------------------
389
+ ------------------------------------------------------------------
390
+ iteration: 95 , TFLOPS: 67.3161465459375, Tokens per sec: 4316.989999582809, Loss: 2.405150890350342
391
+ ------------------------------------------------------------------
392
+ ------------------------------------------------------------------
393
+ iteration: 96 , TFLOPS: 67.20162737067454, Tokens per sec: 4309.645875479786, Loss: 2.365361213684082
394
+ ------------------------------------------------------------------
395
+ ------------------------------------------------------------------
396
+ iteration: 97 , TFLOPS: 67.17173577081181, Tokens per sec: 4307.728924728738, Loss: 2.3839645385742188
397
+ ------------------------------------------------------------------
398
+ ------------------------------------------------------------------
399
+ iteration: 98 , TFLOPS: 67.20004987934048, Tokens per sec: 4309.544710831139, Loss: 2.3723373413085938
400
+ ------------------------------------------------------------------
401
+ ------------------------------------------------------------------
402
+ iteration: 99 , TFLOPS: 67.30991336059388, Tokens per sec: 4316.590264895447, Loss: 2.3913819789886475
403
+ ------------------------------------------------------------------
404
+ ------------------------------------------------------------------
405
+ iteration: 100 , TFLOPS: 67.23987549288418, Tokens per sec: 4312.098730694383, Loss: 2.3768458366394043
406
+ ------------------------------------------------------------------
407
+ ------------------------------------------------------------------
408
+ iteration: 101 , TFLOPS: 67.33907694033823, Tokens per sec: 4318.460527656589, Loss: 2.3836305141448975
409
+ ------------------------------------------------------------------
410
+ ------------------------------------------------------------------
411
+ iteration: 102 , TFLOPS: 67.30975607840512, Tokens per sec: 4316.580178375781, Loss: 2.3950178623199463
412
+ ------------------------------------------------------------------
413
+ ------------------------------------------------------------------
414
+ iteration: 103 , TFLOPS: 67.1982354002556, Tokens per sec: 4309.428348138593, Loss: 2.361278772354126
415
+ ------------------------------------------------------------------
416
+ ------------------------------------------------------------------
417
+ iteration: 104 , TFLOPS: 67.20376894334782, Tokens per sec: 4309.783214710986, Loss: 2.3559556007385254
418
+ ------------------------------------------------------------------
419
+ ------------------------------------------------------------------
420
+ iteration: 105 , TFLOPS: 67.23013357946196, Tokens per sec: 4311.47398098754, Loss: 2.349632740020752
421
+ ------------------------------------------------------------------
422
+ ------------------------------------------------------------------
423
+ iteration: 106 , TFLOPS: 67.23129147862021, Tokens per sec: 4311.548237155534, Loss: 2.379448652267456
424
+ ------------------------------------------------------------------
425
+ ------------------------------------------------------------------
426
+ iteration: 107 , TFLOPS: 67.16429762559119, Tokens per sec: 4307.251915865627, Loss: 2.4072415828704834
427
+ ------------------------------------------------------------------
428
+ ------------------------------------------------------------------
429
+ iteration: 108 , TFLOPS: 67.26025670890765, Tokens per sec: 4313.405779749734, Loss: 2.3945987224578857
430
+ ------------------------------------------------------------------
431
+ ------------------------------------------------------------------
432
+ iteration: 109 , TFLOPS: 67.13558642664209, Tokens per sec: 4305.410664321992, Loss: 2.3535115718841553
433
+ ------------------------------------------------------------------
434
+ ------------------------------------------------------------------
435
+ iteration: 110 , TFLOPS: 67.27982379366702, Tokens per sec: 4314.660618500338, Loss: 2.3627665042877197
436
+ ------------------------------------------------------------------
437
+ ------------------------------------------------------------------
438
+ iteration: 111 , TFLOPS: 67.26391532288811, Tokens per sec: 4313.640406964398, Loss: 2.3859591484069824
439
+ ------------------------------------------------------------------
440
+ ------------------------------------------------------------------
441
+ iteration: 112 , TFLOPS: 67.27053505647855, Tokens per sec: 4314.064931022535, Loss: 2.3465442657470703
442
+ ------------------------------------------------------------------
443
+ ------------------------------------------------------------------
444
+ iteration: 113 , TFLOPS: 67.22654753561278, Tokens per sec: 4311.244007701346, Loss: 2.396284818649292
445
+ ------------------------------------------------------------------
446
+ ------------------------------------------------------------------
447
+ iteration: 114 , TFLOPS: 67.12289176484347, Tokens per sec: 4304.596554619569, Loss: 2.3716585636138916
448
+ ------------------------------------------------------------------
449
+ ------------------------------------------------------------------
450
+ iteration: 115 , TFLOPS: 67.13262769476694, Tokens per sec: 4305.220920604149, Loss: 2.3369154930114746
451
+ ------------------------------------------------------------------
452
+ ------------------------------------------------------------------
453
+ iteration: 116 , TFLOPS: 67.17146478693049, Tokens per sec: 4307.711546510201, Loss: 2.302396535873413
454
+ ------------------------------------------------------------------
455
+ Traceback (most recent call last):
456
+ File "/project/examples/finetuning.py", line 13, in <module>
457
+ main()
458
+ File "/project/src/llama_recipes/finetuning.py", line 281, in main
459
+ train(
460
+ File "/project/src/llama_recipes/utils/train_utils.py", line 118, in train
461
+ loss.backward()
462
+ File "/usr/local/lib/python3.10/dist-packages/torch/_tensor.py", line 522, in backward
463
+ torch.autograd.backward(
464
+ File "/usr/local/lib/python3.10/dist-packages/torch/autograd/__init__.py", line 267, in backward
465
+ _engine_run_backward(
466
+ File "/usr/local/lib/python3.10/dist-packages/torch/autograd/graph.py", line 681, in _engine_run_backward
467
+ return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
468
+ KeyboardInterrupt
wandb/run-20240804_222226-kh5katc1/files/requirements.txt ADDED
@@ -0,0 +1,271 @@
1
+ absl-py==2.1.0
2
+ accelerate==0.33.0
3
+ aiohttp==3.9.1
4
+ aiosignal==1.3.1
5
+ annotated-types==0.6.0
6
+ apex==0.1
7
+ appdirs==1.4.4
8
+ argon2-cffi-bindings==21.2.0
9
+ argon2-cffi==23.1.0
10
+ asttokens==2.4.1
11
+ astunparse==1.6.3
12
+ async-timeout==4.0.3
13
+ attrs==23.2.0
14
+ audioread==3.0.1
15
+ beautifulsoup4==4.12.3
16
+ bleach==6.1.0
17
+ blis==0.7.11
18
+ cachetools==5.3.2
19
+ catalogue==2.0.10
20
+ certifi==2024.2.2
21
+ cffi==1.16.0
22
+ charset-normalizer==3.3.2
23
+ click==8.1.7
24
+ cloudpathlib==0.16.0
25
+ cloudpickle==3.0.0
26
+ cmake==3.28.1
27
+ colorama==0.4.6
28
+ comm==0.2.1
29
+ confection==0.1.4
30
+ contourpy==1.2.0
31
+ cubinlinker==0.3.0+2.g405ac64
32
+ cuda-python==12.3.0rc4+9.gdb8c48a.dirty
33
+ cudf==23.12.0
34
+ cugraph-dgl==23.12.0
35
+ cugraph-service-client==23.12.0
36
+ cugraph-service-server==23.12.0
37
+ cugraph==23.12.0
38
+ cuml==23.12.0
39
+ cupy-cuda12x==12.3.0
40
+ cycler==0.12.1
41
+ cymem==2.0.8
42
+ cython==3.0.8
43
+ dask-cuda==23.12.0
44
+ dask-cudf==23.12.0
45
+ dask==2023.11.0
46
+ debugpy==1.8.1
47
+ decorator==5.1.1
48
+ defusedxml==0.7.1
49
+ distributed==2023.11.0
50
+ dm-tree==0.1.8
51
+ docker-pycreds==0.4.0
52
+ einops==0.7.0
53
+ exceptiongroup==1.2.0
54
+ execnet==2.0.2
55
+ executing==2.0.1
56
+ expecttest==0.1.3
57
+ fastjsonschema==2.19.1
58
+ fastrlock==0.8.2
59
+ filelock==3.13.1
60
+ flash-attn==2.4.2
61
+ fonttools==4.48.1
62
+ frozenlist==1.4.1
63
+ fsspec==2023.12.2
64
+ gast==0.5.4
65
+ gitdb==4.0.11
66
+ gitpython==3.1.43
67
+ google-auth-oauthlib==0.4.6
68
+ google-auth==2.27.0
69
+ graphsurgeon==0.4.6
70
+ grpcio==1.60.1
71
+ huggingface-hub==0.24.5
72
+ hypothesis==5.35.1
73
+ idna==3.6
74
+ importlib-metadata==7.0.1
75
+ iniconfig==2.0.0
76
+ intel-openmp==2021.4.0
77
+ ipadic==1.0.0
78
+ ipykernel==6.29.2
79
+ ipython-genutils==0.2.0
80
+ ipython==8.21.0
81
+ jedi==0.19.1
82
+ jinja2==3.1.3
83
+ joblib==1.3.2
84
+ json5==0.9.14
85
+ jsonnet==0.19.1
86
+ jsonschema-specifications==2023.12.1
87
+ jsonschema==4.21.1
88
+ jupyter-client==8.6.0
89
+ jupyter-core==5.7.1
90
+ jupyter-tensorboard==0.2.0
91
+ jupyterlab-pygments==0.3.0
92
+ jupyterlab-server==1.2.0
93
+ jupyterlab==2.3.2
94
+ jupytext==1.16.1
95
+ kiwisolver==1.4.5
96
+ langcodes==3.3.0
97
+ lazy-loader==0.3
98
+ librosa==0.10.1
99
+ llvmlite==0.40.1
100
+ locket==1.0.0
101
+ logzero==1.7.0
102
+ lxml==5.2.2
103
+ markdown-it-py==3.0.0
104
+ markdown==3.5.2
105
+ markupsafe==2.1.4
106
+ matplotlib-inline==0.1.6
107
+ matplotlib==3.8.2
108
+ mdit-py-plugins==0.4.0
109
+ mdurl==0.1.2
110
+ mecab-python3==1.0.6
111
+ mistune==3.0.2
112
+ mkl-devel==2021.1.1
113
+ mkl-include==2021.1.1
114
+ mkl==2021.1.1
115
+ mock==5.1.0
116
+ more-itertools==9.1.0
117
+ mpmath==1.3.0
118
+ msgpack==1.0.7
119
+ multidict==6.0.4
120
+ murmurhash==1.0.10
121
+ nbclient==0.9.0
122
+ nbconvert==7.16.0
123
+ nbformat==5.9.2
124
+ nest-asyncio==1.6.0
125
+ networkx==2.6.3
126
+ ninja==1.11.1.1
127
+ nltk==3.8.1
128
+ notebook==6.4.10
129
+ numba==0.57.1+1.g1ff679645
130
+ numpy==1.24.4
131
+ nvfuser==0.1.4a0+d0bb811
132
+ nvidia-dali-cuda120==1.34.0
133
+ nvidia-pyindex==1.0.9
134
+ nvtx==0.2.5
135
+ oauthlib==3.2.2
136
+ onnx==1.15.0rc2
137
+ opencv==4.7.0
138
+ optree==0.10.0
139
+ packaging==23.2
140
+ pandas==1.5.3
141
+ pandocfilters==1.5.1
142
+ parso==0.8.3
143
+ partd==1.4.1
144
+ peft==0.11.1
145
+ pexpect==4.9.0
146
+ pillow==10.2.0
147
+ pip==24.0
148
+ platformdirs==4.2.0
149
+ pluggy==1.4.0
150
+ ply==3.11
151
+ polygraphy==0.49.4
152
+ pooch==1.8.0
153
+ portalocker==2.10.1
154
+ preshed==3.0.9
155
+ prettytable==3.9.0
156
+ prometheus-client==0.19.0
157
+ prompt-toolkit==3.0.43
158
+ protobuf==4.24.4
159
+ psutil==5.9.4
160
+ ptxcompiler==0.8.1+2.g0d406d6
161
+ ptyprocess==0.7.0
162
+ pure-eval==0.2.2
163
+ pyarrow==14.0.1.dev0+gba5374836.d20240125
164
+ pyasn1-modules==0.3.0
165
+ pyasn1==0.5.1
166
+ pybind11-global==2.11.1
167
+ pybind11==2.11.1
168
+ pycocotools==2.0+nv0.8.0
169
+ pycparser==2.21
170
+ pydantic-core==2.16.2
171
+ pydantic==2.6.1
172
+ pygments==2.17.2
173
+ pylibcugraph==23.12.0
174
+ pylibcugraphops==23.12.0
175
+ pylibraft==23.12.0
176
+ pynvml==11.4.1
177
+ pyparsing==3.1.1
178
+ pytest-flakefinder==1.1.0
179
+ pytest-rerunfailures==13.0
180
+ pytest-shard==0.1.2
181
+ pytest-xdist==3.5.0
182
+ pytest==8.0.0
183
+ python-dateutil==2.8.2
184
+ python-dotenv==1.0.0
185
+ python-hostlist==1.23.0
186
+ pytorch-quantization==2.1.2
187
+ pytz==2023.3.post1
188
+ pyyaml==6.0.1
189
+ pyzmq==25.1.2
190
+ raft-dask==23.12.0
191
+ rapids-dask-dependency==23.12.1
192
+ referencing==0.33.0
193
+ regex==2023.12.25
194
+ requests-oauthlib==1.3.1
195
+ requests==2.31.0
196
+ rich==13.7.0
197
+ rmm==23.12.0
198
+ rpds-py==0.17.1
199
+ rsa==4.9
200
+ sacrebleu==2.4.0
201
+ safetensors==0.4.3
202
+ scikit-learn==1.2.0
203
+ scipy==1.12.0
204
+ send2trash==1.8.2
205
+ sentencepiece==0.1.99
206
+ sentry-sdk==2.12.0
207
+ setproctitle==1.3.3
208
+ setuptools==68.2.2
209
+ six==1.16.0
210
+ smart-open==6.4.0
211
+ smmap==5.0.1
212
+ sortedcontainers==2.4.0
213
+ soundfile==0.12.1
214
+ soupsieve==2.5
215
+ soxr==0.3.7
216
+ spacy-legacy==3.0.12
217
+ spacy-loggers==1.0.5
218
+ spacy==3.7.2
219
+ sphinx-glpi-theme==0.6
220
+ srsly==2.4.8
221
+ stack-data==0.6.3
222
+ sympy==1.12
223
+ tabulate==0.9.0
224
+ tbb==2021.11.0
225
+ tblib==3.0.0
226
+ tensorboard-data-server==0.6.1
227
+ tensorboard-plugin-wit==1.8.1
228
+ tensorboard==2.9.0
229
+ tensorrt==8.6.3
230
+ terminado==0.18.0
231
+ termplotlib==0.3.9
232
+ thinc==8.2.3
233
+ threadpoolctl==3.2.0
234
+ thriftpy2==0.4.17
235
+ tinycss2==1.2.1
236
+ tokenizers==0.19.1
237
+ toml==0.10.2
238
+ tomli==2.0.1
239
+ toolz==0.12.1
240
+ torch-tensorrt==2.3.0a0
241
+ torch==2.3.0a0+ebedce2
242
+ torchdata==0.7.1a0
243
+ torchtext==0.17.0a0
244
+ torchvision==0.18.0a0
245
+ tornado==6.4
246
+ tqdm==4.66.1
247
+ traitlets==5.9.0
248
+ transformer-engine==1.3.0+5b90b7f
249
+ transformers==4.43.3
250
+ treelite-runtime==3.9.1
251
+ treelite==3.9.1
252
+ triton==2.2.0+e28a256
253
+ typer==0.9.0
254
+ types-dataclasses==0.6.6
255
+ typing-extensions==4.9.0
256
+ ucx-py==0.35.0
257
+ uff==0.6.9
258
+ ujson==5.8.0
259
+ urllib3==1.26.18
260
+ wandb==0.16.3
261
+ wasabi==1.1.2
262
+ wcwidth==0.2.13
263
+ weasel==0.3.4
264
+ webencodings==0.5.1
265
+ werkzeug==3.0.1
266
+ wheel==0.42.0
267
+ xdoctest==1.0.2
268
+ xgboost==1.7.6
269
+ yarl==1.9.4
270
+ zict==3.0.0
271
+ zipp==3.17.0
wandb/run-20240804_222226-kh5katc1/files/wandb-metadata.json ADDED
@@ -0,0 +1,215 @@
1
+ {
2
+ "os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
3
+ "python": "3.10.12",
4
+ "heartbeatAt": "2024-08-04T13:22:26.872566",
5
+ "startedAt": "2024-08-04T13:22:26.250232",
6
+ "docker": null,
7
+ "cuda": null,
8
+ "args": [
9
+ "--seq-length",
10
+ "4096",
11
+ "--sliding-window-size",
12
+ "4096",
13
+ "--micro-batch-size",
14
+ "1",
15
+ "--global-batch-size",
16
+ "320",
17
+ "--train-iters",
18
+ "20000",
19
+ "--tokenizer-type",
20
+ "HFPreTrainedTokenizer",
21
+ "--tokenizer-model",
22
+ "/share/pretrained_lm/google/gemma-2-2b",
23
+ "--train-data-path",
24
+ "235289369",
25
+ "/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document",
26
+ "--valid-data-path",
27
+ "235289369",
28
+ "/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document",
29
+ "--test-data-path",
30
+ "235289369",
31
+ "/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document",
32
+ "--lr",
33
+ "2e-5",
34
+ "--min-lr",
35
+ "1e-6",
36
+ "--lr-decay-style",
37
+ "cosine",
38
+ "--lr-warmup-iters",
39
+ "500",
40
+ "--lr-decay-iters",
41
+ "20000",
42
+ "--weight-decay",
43
+ "0.1",
44
+ "--grad-clip-norm",
45
+ "1.0",
46
+ "--optimizer",
47
+ "anyprecision",
48
+ "--adam-beta1",
49
+ "0.9",
50
+ "--adam-beta2",
51
+ "0.95",
52
+ "--adam-eps",
53
+ "1e-6",
54
+ "--save-interval",
55
+ "200",
56
+ "--eval-interval",
57
+ "200",
58
+ "--eval-iters",
59
+ "10",
60
+ "--bf16",
61
+ "--mixed-precision",
62
+ "--base-model",
63
+ "/share/pretrained_lm/google/gemma-2-2b",
64
+ "--save",
65
+ "/work/llm_recipes/models/yans-sample-gemma-2-2b",
66
+ "--load",
67
+ "/work/llm_recipes/models/yans-sample-gemma-2-2b",
68
+ "--fsdp-activation-checkpointing",
69
+ "--sharding-strategy",
70
+ "FULL_SHARD",
71
+ "--checkpoint-type",
72
+ "LOCAL_STATE_DICT",
73
+ "--save-n-checkpoints",
74
+ "10",
75
+ "--hf-upload-retry-limit",
76
+ "2",
77
+ "--hf-repo-id",
78
+ "koichi12/yans-sample-gemma-2-2b",
79
+ "--wandb-entity",
80
+ "iwakawa-koichi-q5-tohoku-nlp6723",
81
+ "--wandb-project",
82
+ "llm_tutorial",
83
+ "--wandb-name",
84
+ "yans-sample-gemma-2-2b_train_2024-08-04-22:22:15"
85
+ ],
86
+ "state": "running",
87
+ "program": "/project/examples/finetuning.py",
88
+ "codePathLocal": "examples/finetuning.py",
89
+ "codePath": "examples/finetuning.py",
90
+ "git": {
91
+ "remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
92
+ "commit": "0336bd6c20fe25d78eda1d14afa66c1ae2e6d687"
93
+ },
94
+ "email": null,
95
+ "root": "/project",
96
+ "host": "gpu-koiwa-00",
97
+ "username": "koiwa",
98
+ "executable": "/usr/bin/python",
99
+ "cpu_count": 18,
100
+ "cpu_count_logical": 18,
101
+ "cpu_freq": {
102
+ "current": 2400.044999999999,
103
+ "min": 0.0,
104
+ "max": 0.0
105
+ },
106
+ "cpu_freq_per_core": [
107
+ {
108
+ "current": 2400.045,
109
+ "min": 0.0,
110
+ "max": 0.0
111
+ },
112
+ {
113
+ "current": 2400.045,
114
+ "min": 0.0,
115
+ "max": 0.0
116
+ },
117
+ {
118
+ "current": 2400.045,
119
+ "min": 0.0,
120
+ "max": 0.0
121
+ },
122
+ {
123
+ "current": 2400.045,
124
+ "min": 0.0,
125
+ "max": 0.0
126
+ },
127
+ {
128
+ "current": 2400.045,
129
+ "min": 0.0,
130
+ "max": 0.0
131
+ },
132
+ {
133
+ "current": 2400.045,
134
+ "min": 0.0,
135
+ "max": 0.0
136
+ },
137
+ {
138
+ "current": 2400.045,
139
+ "min": 0.0,
140
+ "max": 0.0
141
+ },
142
+ {
143
+ "current": 2400.045,
144
+ "min": 0.0,
145
+ "max": 0.0
146
+ },
147
+ {
148
+ "current": 2400.045,
149
+ "min": 0.0,
150
+ "max": 0.0
151
+ },
152
+ {
153
+ "current": 2400.045,
154
+ "min": 0.0,
155
+ "max": 0.0
156
+ },
157
+ {
158
+ "current": 2400.045,
159
+ "min": 0.0,
160
+ "max": 0.0
161
+ },
162
+ {
163
+ "current": 2400.045,
164
+ "min": 0.0,
165
+ "max": 0.0
166
+ },
167
+ {
168
+ "current": 2400.045,
169
+ "min": 0.0,
170
+ "max": 0.0
171
+ },
172
+ {
173
+ "current": 2400.045,
174
+ "min": 0.0,
175
+ "max": 0.0
176
+ },
177
+ {
178
+ "current": 2400.045,
179
+ "min": 0.0,
180
+ "max": 0.0
181
+ },
182
+ {
183
+ "current": 2400.045,
184
+ "min": 0.0,
185
+ "max": 0.0
186
+ },
187
+ {
188
+ "current": 2400.045,
189
+ "min": 0.0,
190
+ "max": 0.0
191
+ },
192
+ {
193
+ "current": 2400.045,
194
+ "min": 0.0,
195
+ "max": 0.0
196
+ }
197
+ ],
198
+ "disk": {
199
+ "/": {
200
+ "total": 0.0625,
201
+ "used": 1.1444091796875e-05
202
+ }
203
+ },
204
+ "gpu": "NVIDIA A100-SXM4-40GB",
205
+ "gpu_count": 1,
206
+ "gpu_devices": [
207
+ {
208
+ "name": "NVIDIA A100-SXM4-40GB",
209
+ "memory_total": 42949672960
210
+ }
211
+ ],
212
+ "memory": {
213
+ "total": 56.48782730102539
214
+ }
215
+ }
wandb/run-20240804_222226-kh5katc1/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
1
+ {"training/loss": 2.302396535873413, "training/perplexity": 9.99811460655144, "utils/batch_size": 1, "utils/global_batch_size": 320, "utils/seq_len": 4097, "utils/gradient_accumulation_steps": 320, "utils/iteration": 116, "optimizer/lr": 5.4080000000000006e-06, "optimizer/variance_l2": 0.0030219239359304895, "optimizer/variance_sqrt_l2": 0.8405880490942215, "optimizer/momentum_l2": 0.36270596473675665, "optimizer/weight_l2": 1167.8420269882395, "optimizer/variance_l1": 0.70648193359375, "optimizer/variance_sqrt_l1": 19948.0, "optimizer/momentum_l1": 5862.0, "optimizer/weight_l1": 29775872.0, "optimizer/variance_abs_max": 0.001068115234375, "optimizer/variance_sqrt_abs_max": 0.03271484375, "optimizer/momentum_abs_max": 0.0250244140625, "optimizer/weight_abs_max": 12.9375, "stats/1_iteration_time": 304.34721216700564, "stats/tokens_per_sec": 4307.711546510201, "stats/tokens_per_sec_per_gpu": 4307.711546510201, "stats/tflops": 67.17146478693049, "_timestamp": 1722812960.6351748, "_runtime": 35214.36805868149, "_step": 116, "_wandb": {"runtime": 35371}}
wandb/run-20240804_222226-kh5katc1/logs/debug-internal.log ADDED
The diff for this file is too large to render. See raw diff
 
wandb/run-20240804_222226-kh5katc1/logs/debug.log ADDED
@@ -0,0 +1,30 @@
1
+ 2024-08-04 22:22:26,260 INFO MainThread:12896 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
2
+ 2024-08-04 22:22:26,261 INFO MainThread:12896 [wandb_setup.py:_flush():76] Configure stats pid to 12896
3
+ 2024-08-04 22:22:26,261 INFO MainThread:12896 [wandb_setup.py:_flush():76] Loading settings from /singularity_home/.config/wandb/settings
4
+ 2024-08-04 22:22:26,261 INFO MainThread:12896 [wandb_setup.py:_flush():76] Loading settings from /project/wandb/settings
5
+ 2024-08-04 22:22:26,261 INFO MainThread:12896 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'api_key': '***REDACTED***', 'run_notes': 'Train sample'}
6
+ 2024-08-04 22:22:26,261 INFO MainThread:12896 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
7
+ 2024-08-04 22:22:26,261 INFO MainThread:12896 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'examples/finetuning.py', 'program_abspath': '/project/examples/finetuning.py', 'program': '/project/examples/finetuning.py'}
8
+ 2024-08-04 22:22:26,261 INFO MainThread:12896 [wandb_init.py:_log_setup():526] Logging user logs to /project/wandb/run-20240804_222226-kh5katc1/logs/debug.log
9
+ 2024-08-04 22:22:26,261 INFO MainThread:12896 [wandb_init.py:_log_setup():527] Logging internal logs to /project/wandb/run-20240804_222226-kh5katc1/logs/debug-internal.log
10
+ 2024-08-04 22:22:26,261 INFO MainThread:12896 [wandb_init.py:init():566] calling init triggers
11
+ 2024-08-04 22:22:26,261 INFO MainThread:12896 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
12
+ config: {'sharding_strategy': 'FULL_SHARD', 'checkpoint_type': 'LOCAL_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'low_cpu_fsdp': False, 'no_meta_device': False, 'data_path': None, 'split': '969, 30, 1', 'train_data_path': ['235289369', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document'], 'valid_data_path': ['235289369', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document'], 'test_data_path': ['235289369', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document'], 'data_cache_path': None, 'vocab_size': None, 'vocab_file': None, 'merge_file': None, 'seq_length': 4096, 'num_workers': 2, 'tokenizer_type': 'HFPreTrainedTokenizer', 'tokenizer_model': '/share/pretrained_lm/google/gemma-2-2b', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'retro_return_doc_ids': False, 'short_seq_prob': 0.1, 'vocab_extra_ids': 0, 'seed': 1234, 'use_mpi': False, 'wandb_entity': 'iwakawa-koichi-q5-tohoku-nlp6723', 'wandb_name': 'yans-sample-gemma-2-2b_train_2024-08-04-22:22:15', 'wandb_project': 'llm_tutorial', 'quantization': False, 'use_freeze_layers': False, 'freeze_layers': None, 'bf16': True, 'fp16': False, 'mixed_precision': True, 'param_dtype': None, 'load': '/work/llm_recipes/models/yans-sample-gemma-2-2b', 'save': '/work/llm_recipes/models/yans-sample-gemma-2-2b', 'base_model': '/share/pretrained_lm/google/gemma-2-2b', 'use_better_transformer': False, 'grad_clip_norm': 1.0, 'eval_interval': 200, 'save_interval': 200, 'eval_iters': 10, 'optimizer': 'anyprecision', 'lr': 2e-05, 'lr_decay_style': 'cosine', 'lr_decay_iters': 20000, 'lr_warmup_iters': 500, 'min_lr': 1e-06, 'train_iters': 20000, 'train_samples': None, 'global_batch_size': 320, 'micro_batch_size': 1, 'make_vocab_size_divisible_by': 128, 'sliding_window_size': 4096, 'skip_batch': None, 'no_save_optimizer_state': False, 'continual_pretraining': False, 'instruction_tuning': False, 'direct_preference_optimization': False, 'attention_dropout': 0.1, 'hidden_dropout': 0.1, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-06, 'hf_transformer_model_dir': None, 'instruction_train_data_path': None, 'instruction_valid_data_path': None, 'epoch': None, 'instruction_dataset_size': None, 'save_sampler_state': False, 'label_smoothing': 0.0, 'save_n_checkpoints': 10, 'hf_repo_id': 'koichi12/yans-sample-gemma-2-2b', 'create_public_hf_repo': False, 'upload_all_checkpoints_to_hf': False, 'hf_upload_retry_limit': 2, 'exit_duration_in_mins': None, 'source_key': None, 'target_key': None, 'attn_implementation': 'flash_attention_2', 'efficient_instruction_tuning': False, 'remove_padding_masking': False, 'save_start_iter': None, 'rank': 0, 'world_size': 1, 'padded_vocab_size': 256000, 'gradient_accumulation_steps': 320}
13
+ 2024-08-04 22:22:26,261 INFO MainThread:12896 [wandb_init.py:init():616] starting backend
14
+ 2024-08-04 22:22:26,261 INFO MainThread:12896 [wandb_init.py:init():620] setting up manager
15
+ 2024-08-04 22:22:26,266 INFO MainThread:12896 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
16
+ 2024-08-04 22:22:26,266 INFO MainThread:12896 [wandb_init.py:init():628] backend started and connected
17
+ 2024-08-04 22:22:26,271 INFO MainThread:12896 [wandb_init.py:init():720] updated telemetry
18
+ 2024-08-04 22:22:26,282 INFO MainThread:12896 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
19
+ 2024-08-04 22:22:26,766 INFO MainThread:12896 [wandb_run.py:_on_init():2262] communicating current version
20
+ 2024-08-04 22:22:26,847 INFO MainThread:12896 [wandb_run.py:_on_init():2271] got version response upgrade_message: "wandb version 0.17.5 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
21
+
22
+ 2024-08-04 22:22:26,847 INFO MainThread:12896 [wandb_init.py:init():804] starting run threads in backend
23
+ 2024-08-04 22:22:26,902 INFO MainThread:12896 [wandb_run.py:_console_start():2241] atexit reg
24
+ 2024-08-04 22:22:26,902 INFO MainThread:12896 [wandb_run.py:_redirect():2096] redirect: wrap_raw
25
+ 2024-08-04 22:22:26,903 INFO MainThread:12896 [wandb_run.py:_redirect():2161] Wrapping output streams.
26
+ 2024-08-04 22:22:26,903 INFO MainThread:12896 [wandb_run.py:_redirect():2186] Redirects installed.
27
+ 2024-08-04 22:22:26,903 INFO MainThread:12896 [wandb_init.py:init():847] run started, returning control to user process
28
+ 2024-08-04 22:22:32,202 INFO MainThread:12896 [wandb_run.py:_config_callback():1343] config_cb None None {'activation_function': 'gelu_pytorch_tanh', 'hidden_size': 2304, 'model_type': 'gemma2', 'max_position_embeddings': 4096, 'num_attention_heads': 8, 'num_hidden_layers': 26, 'model_architecture': 'Gemma2ForCausalLM'}
29
+ 2024-08-04 22:22:32,203 INFO MainThread:12896 [wandb_run.py:_config_callback():1343] config_cb None None {'world_size': 1}
30
+ 2024-08-05 08:12:06,481 WARNING MsgRouterThr:12896 [router.py:message_loop():77] message_loop has been closed
wandb/run-20240812_063447-whqmtxyq/files/config.yaml ADDED
@@ -0,0 +1,335 @@
1
+ wandb_version: 1
2
+
3
+ sharding_strategy:
4
+ desc: null
5
+ value: FULL_SHARD
6
+ checkpoint_type:
7
+ desc: null
8
+ value: LOCAL_STATE_DICT
9
+ fsdp_activation_checkpointing:
10
+ desc: null
11
+ value: true
12
+ fsdp_cpu_offload:
13
+ desc: null
14
+ value: false
15
+ low_cpu_fsdp:
16
+ desc: null
17
+ value: false
18
+ no_meta_device:
19
+ desc: null
20
+ value: false
21
+ data_path:
22
+ desc: null
23
+ value: null
24
+ split:
25
+ desc: null
26
+ value: 969, 30, 1
27
+ train_data_path:
28
+ desc: null
29
+ value:
30
+ - '235289369'
31
+ - /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document
32
+ valid_data_path:
33
+ desc: null
34
+ value:
35
+ - '235289369'
36
+ - /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document
37
+ test_data_path:
38
+ desc: null
39
+ value:
40
+ - '235289369'
41
+ - /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document
42
+ data_cache_path:
43
+ desc: null
44
+ value: null
45
+ vocab_size:
46
+ desc: null
47
+ value: null
48
+ vocab_file:
49
+ desc: null
50
+ value: null
51
+ merge_file:
52
+ desc: null
53
+ value: null
54
+ seq_length:
55
+ desc: null
56
+ value: 1021
57
+ num_workers:
58
+ desc: null
59
+ value: 2
60
+ tokenizer_type:
61
+ desc: null
62
+ value: HFPreTrainedTokenizer
63
+ tokenizer_model:
64
+ desc: null
65
+ value: /share/pretrained_lm/google/gemma-2-2b
66
+ reset_position_ids:
67
+ desc: null
68
+ value: false
69
+ reset_attention_mask:
70
+ desc: null
71
+ value: false
72
+ eod_mask_loss:
73
+ desc: null
74
+ value: false
75
+ retro_return_doc_ids:
76
+ desc: null
77
+ value: false
78
+ short_seq_prob:
79
+ desc: null
80
+ value: 0.1
81
+ vocab_extra_ids:
82
+ desc: null
83
+ value: 0
84
+ seed:
85
+ desc: null
86
+ value: 1234
87
+ use_mpi:
88
+ desc: null
89
+ value: false
90
+ wandb_entity:
91
+ desc: null
92
+ value: iwakawa-koichi-q5-tohoku-nlp6723
93
+ wandb_name:
94
+ desc: null
95
+ value: yans-sample-gemma-2-2b_train_2024-08-12-06:34:36
96
+ wandb_project:
97
+ desc: null
98
+ value: llm_tutorial
99
+ quantization:
100
+ desc: null
101
+ value: false
102
+ use_freeze_layers:
103
+ desc: null
104
+ value: false
105
+ freeze_layers:
106
+ desc: null
107
+ value: null
108
+ bf16:
109
+ desc: null
110
+ value: true
111
+ fp16:
112
+ desc: null
113
+ value: false
114
+ mixed_precision:
115
+ desc: null
116
+ value: true
117
+ param_dtype:
118
+ desc: null
119
+ value: null
120
+ load:
121
+ desc: null
122
+ value: /work/llm_recipes/models/yans-sample-gemma-2-2b
123
+ save:
124
+ desc: null
125
+ value: /work/llm_recipes/models/yans-sample-gemma-2-2b
126
+ base_model:
127
+ desc: null
128
+ value: /share/pretrained_lm/google/gemma-2-2b
129
+ use_better_transformer:
130
+ desc: null
131
+ value: false
132
+ grad_clip_norm:
133
+ desc: null
134
+ value: 1.0
135
+ eval_interval:
136
+ desc: null
137
+ value: 3
138
+ save_interval:
139
+ desc: null
140
+ value: 3
141
+ eval_iters:
142
+ desc: null
143
+ value: 10
144
+ optimizer:
145
+ desc: null
146
+ value: anyprecision
147
+ lr:
148
+ desc: null
149
+ value: 2.0e-05
150
+ lr_decay_style:
151
+ desc: null
152
+ value: cosine
153
+ lr_decay_iters:
154
+ desc: null
155
+ value: 20000
156
+ lr_warmup_iters:
157
+ desc: null
158
+ value: 500
159
+ min_lr:
160
+ desc: null
161
+ value: 1.0e-06
162
+ train_iters:
163
+ desc: null
164
+ value: 20000
165
+ train_samples:
166
+ desc: null
167
+ value: null
168
+ global_batch_size:
169
+ desc: null
170
+ value: 320
171
+ micro_batch_size:
172
+ desc: null
173
+ value: 1
174
+ make_vocab_size_divisible_by:
175
+ desc: null
176
+ value: 128
177
+ sliding_window_size:
178
+ desc: null
179
+ value: 4096
180
+ skip_batch:
181
+ desc: null
182
+ value: null
183
+ no_save_optimizer_state:
184
+ desc: null
185
+ value: false
186
+ continual_pretraining:
187
+ desc: null
188
+ value: false
189
+ instruction_tuning:
190
+ desc: null
191
+ value: false
192
+ direct_preference_optimization:
193
+ desc: null
194
+ value: false
195
+ attention_dropout:
196
+ desc: null
197
+ value: 0.1
198
+ hidden_dropout:
199
+ desc: null
200
+ value: 0.1
201
+ weight_decay:
202
+ desc: null
203
+ value: 0.1
204
+ adam_beta1:
205
+ desc: null
206
+ value: 0.9
207
+ adam_beta2:
208
+ desc: null
209
+ value: 0.95
210
+ adam_eps:
211
+ desc: null
212
+ value: 1.0e-06
213
+ hf_transformer_model_dir:
214
+ desc: null
215
+ value: null
216
+ instruction_train_data_path:
217
+ desc: null
218
+ value: null
219
+ instruction_valid_data_path:
220
+ desc: null
221
+ value: null
222
+ epoch:
223
+ desc: null
224
+ value: null
225
+ instruction_dataset_size:
226
+ desc: null
227
+ value: null
228
+ save_sampler_state:
229
+ desc: null
230
+ value: false
231
+ label_smoothing:
232
+ desc: null
233
+ value: 0.0
234
+ save_n_checkpoints:
235
+ desc: null
236
+ value: 10
237
+ hf_repo_id:
238
+ desc: null
239
+ value: koichi12/yans-sample-gemma-2-2b
240
+ create_public_hf_repo:
241
+ desc: null
242
+ value: false
243
+ upload_all_checkpoints_to_hf:
244
+ desc: null
245
+ value: false
246
+ hf_upload_retry_limit:
247
+ desc: null
248
+ value: 2
249
+ exit_duration_in_mins:
250
+ desc: null
251
+ value: null
252
+ source_key:
253
+ desc: null
254
+ value: null
255
+ target_key:
256
+ desc: null
257
+ value: null
258
+ attn_implementation:
259
+ desc: null
260
+ value: flash_attention_2
261
+ efficient_instruction_tuning:
262
+ desc: null
263
+ value: false
264
+ remove_padding_masking:
265
+ desc: null
266
+ value: false
267
+ save_start_iter:
268
+ desc: null
269
+ value: null
270
+ rank:
271
+ desc: null
272
+ value: 0
273
+ world_size:
274
+ desc: null
275
+ value: 1
276
+ padded_vocab_size:
277
+ desc: null
278
+ value: 256000
279
+ gradient_accumulation_steps:
280
+ desc: null
281
+ value: 320
282
+ _wandb:
283
+ desc: null
284
+ value:
285
+ python_version: 3.10.12
286
+ cli_version: 0.16.3
287
+ framework: huggingface
288
+ huggingface_version: 4.43.3
289
+ is_jupyter_run: false
290
+ is_kaggle_kernel: false
291
+ start_time: 1723412087.358797
292
+ t:
293
+ 1:
294
+ - 1
295
+ - 11
296
+ - 49
297
+ - 55
298
+ - 71
299
+ 2:
300
+ - 1
301
+ - 11
302
+ - 49
303
+ - 55
304
+ - 71
305
+ 3:
306
+ - 13
307
+ - 16
308
+ - 23
309
+ 4: 3.10.12
310
+ 5: 0.16.3
311
+ 6: 4.43.3
312
+ 8:
313
+ - 5
314
+ 13: linux-x86_64
315
+ model_architecture:
316
+ desc: null
317
+ value: Gemma2ForCausalLM
318
+ activation_function:
319
+ desc: null
320
+ value: gelu_pytorch_tanh
321
+ hidden_size:
322
+ desc: null
323
+ value: 2304
324
+ model_type:
325
+ desc: null
326
+ value: gemma2
327
+ max_position_embeddings:
328
+ desc: null
329
+ value: 1021
330
+ num_attention_heads:
331
+ desc: null
332
+ value: 8
333
+ num_hidden_layers:
334
+ desc: null
335
+ value: 26
wandb/run-20240812_063447-whqmtxyq/files/output.log ADDED
@@ -0,0 +1,144 @@
1
+ Created Hugging Face repository with ID koichi12/yans-sample-gemma-2-2b.
2
+ Clearing GPU cache for all ranks
3
+ --> Running with torch torch_distributed debug set to detail
4
+ File not found: /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
5
+ Unable to read latest iteration from /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
6
+ File not found: /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
7
+ Unable to read latest iteration from /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
8
+ You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
9
+
10
+ Loading checkpoint shards: 33%|███▎ | 1/3 [00:01<00:03, 1.92s/it]
11
+ File not found: /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
12
+ Unable to read latest iteration from /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
13
+ No checkpoint found in /work/llm_recipes/models/yans-sample-gemma-2-2b, skipping model loading
14
+ --> Model /share/pretrained_lm/google/gemma-2-2b
15
+ --> /share/pretrained_lm/google/gemma-2-2b has 2614.341888 Million params
16
+ Loading checkpoint shards: 100%|██████████| 3/3 [01:18<00:00, 26.21s/it]
17
+ /usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_init_utils.py:441: UserWarning: FSDP is switching to use `NO_SHARD` instead of ShardingStrategy.FULL_SHARD since the world size is 1.
18
+ warnings.warn(
19
+ Let split = None
20
+ --> applying fsdp activation checkpointing...
21
+ > datasets target sizes (minimum size):
22
+ train: 6400000
23
+ validation: 21334400
24
+ test: 3200
25
+ > building train, validation, and test datasets for GPT ...
26
+ > finished creating GPT datasets ...
27
+ File not found: /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
28
+ Unable to read latest iteration from /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
29
+ No checkpoint found in /work/llm_recipes/models/yans-sample-gemma-2-2b, skipping optimizer loading
30
+ File not found: /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
31
+ Unable to read latest iteration from /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
32
+ model info: FullyShardedDataParallel(
33
+ (_fsdp_wrapped_module): Gemma2ForCausalLM(
34
+ (model): Gemma2Model(
35
+ (embed_tokens): Embedding(256000, 2304, padding_idx=0)
36
+ (layers): ModuleList(
37
+ (0-25): 26 x FullyShardedDataParallel(
38
+ (_fsdp_wrapped_module): CheckpointWrapper(
39
+ (_checkpoint_wrapped_module): Gemma2DecoderLayer(
40
+ (self_attn): Gemma2FlashAttention2(
41
+ (q_proj): Linear(in_features=2304, out_features=2048, bias=False)
42
+ (k_proj): Linear(in_features=2304, out_features=1024, bias=False)
43
+ (v_proj): Linear(in_features=2304, out_features=1024, bias=False)
44
+ (o_proj): Linear(in_features=2048, out_features=2304, bias=False)
45
+ (rotary_emb): Gemma2RotaryEmbedding()
46
+ )
47
+ (mlp): Gemma2MLP(
48
+ (gate_proj): Linear(in_features=2304, out_features=9216, bias=False)
49
+ (up_proj): Linear(in_features=2304, out_features=9216, bias=False)
50
+ (down_proj): Linear(in_features=9216, out_features=2304, bias=False)
51
+ (act_fn): PytorchGELUTanh()
52
+ )
53
+ (input_layernorm): Gemma2RMSNorm()
54
+ (post_attention_layernorm): Gemma2RMSNorm()
55
+ (pre_feedforward_layernorm): Gemma2RMSNorm()
56
+ (post_feedforward_layernorm): Gemma2RMSNorm()
57
+ )
58
+ )
59
+ )
60
+ )
61
+ (norm): Gemma2RMSNorm()
62
+ )
63
+ (lm_head): Linear(in_features=2304, out_features=256000, bias=False)
64
+ )
65
+ )
66
+ model config: Gemma2Config {
67
+ "_name_or_path": "/share/pretrained_lm/google/gemma-2-2b",
68
+ "architectures": [
69
+ "Gemma2ForCausalLM"
70
+ ],
71
+ "attention_bias": false,
72
+ "attention_dropout": 0.0,
73
+ "attn_logit_softcapping": 50.0,
74
+ "bos_token_id": 2,
75
+ "cache_implementation": "hybrid",
76
+ "eos_token_id": 1,
77
+ "final_logit_softcapping": 30.0,
78
+ "head_dim": 256,
79
+ "hidden_act": "gelu_pytorch_tanh",
80
+ "hidden_activation": "gelu_pytorch_tanh",
81
+ "hidden_size": 2304,
82
+ "initializer_range": 0.02,
83
+ "intermediate_size": 9216,
84
+ "label_smoothing": 0.0,
85
+ "max_position_embeddings": 1021,
86
+ "model_type": "gemma2",
87
+ Building a BlendedDataset for a single MegatronDataset
88
+ Unable to save the indexes because path_to_cache is None
89
+ Building a BlendedDataset for a single MegatronDataset
90
+ Unable to save the indexes because path_to_cache is None
91
+ Building a BlendedDataset for a single MegatronDataset
92
+ Unable to save the indexes because path_to_cache is None
93
+ It is strongly recommended to train Gemma2 models with the `eager` attention implementation instead of `flash_attention_2`. Use `eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`.
94
+ "num_attention_heads": 8,
95
+ "num_hidden_layers": 26,
96
+ "num_key_value_heads": 4,
97
+ "pad_token_id": 0,
98
+ "query_pre_attn_scalar": 256,
99
+ "rms_norm_eps": 1e-06,
100
+ "rope_theta": 10000.0,
101
+ "sliding_window": 4096,
102
+ "torch_dtype": "float32",
103
+ "transformers_version": "4.43.3",
104
+ "use_cache": false,
105
+ "vocab_size": 256000
106
+ }
107
+ ------------------------------------------------------------------
108
+ iteration: 1 , TFLOPS: 52.56331460229552, Tokens per sec: 3927.6626762354495, Loss: 16.080825805664062
109
+ ------------------------------------------------------------------
110
+ ------------------------------------------------------------------
111
+ iteration: 2 , TFLOPS: 52.356892101499724, Tokens per sec: 3912.238269345489, Loss: 15.729490280151367
112
+ ------------------------------------------------------------------
113
+ ------------------------------------------------------------------
114
+ iteration: 3 , TFLOPS: 52.39645244456057, Tokens per sec: 3915.194317381553, Loss: 15.54540729522705
115
+ ------------------------------------------------------------------
116
+ eval ppl=4948606.5, eval loss=15.414616584777832
117
+ Saving checkpoint to /work/llm_recipes/models/yans-sample-gemma-2-2b/iter_0000003
118
+ Saving model state dict to /work/llm_recipes/models/yans-sample-gemma-2-2b/iter_0000003/model.pt
119
+ /usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_state_dict_utils.py:773: UserWarning: When using ``NO_SHARD`` for ``ShardingStrategy``, full_state_dict willbe returned.
120
+ warnings.warn(
121
+ /usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_state_dict_utils.py:716: UserWarning: When using ``NO_SHARD`` for ``ShardingStrategy``, full_state_dict willbe returned.
122
+ warnings.warn(
123
+ Saved model state dict to /work/llm_recipes/models/yans-sample-gemma-2-2b/iter_0000003/model.pt
124
+ Saving optimizer state dict to /work/llm_recipes/models/yans-sample-gemma-2-2b/iter_0000003/optimizer.pt
125
+ [rank0]:[2024-08-12 06:40:35,335] torch.distributed.fsdp._debug_utils: [WARNING] FSDP _optim_state_dict() profiling: defaultdict(<class 'float'>, {'preprocessing': 0.008401250000133587, 'preprocessing_with_comm': 0.0009138020004684222, 'state_converting': 5.079375774000255, <Type.ALL: 'all'>: 5.090390497000044})
126
+ Saved optimizer state dict to /work/llm_recipes/models/yans-sample-gemma-2-2b/iter_0000003/optimizer.pt
127
+ Traceback (most recent call last):
128
+ File "/project/examples/finetuning.py", line 13, in <module>
129
+ main()
130
+ File "/project/src/llama_recipes/finetuning.py", line 281, in main
131
+ train(
132
+ File "/project/src/llama_recipes/utils/train_utils.py", line 175, in train
133
+ save_checkpoint(
134
+ File "/project/src/llama_recipes/utils/checkpoint.py", line 168, in save_checkpoint
135
+ tokenizer.tokenizer.save_pretrained(tokenizer_path)
136
+ File "/project/lib/transformers/src/transformers/tokenization_utils_base.py", line 2622, in save_pretrained
137
+ if os.path.isfile(save_directory):
138
+ File "/usr/lib/python3.10/genericpath.py", line 30, in isfile
139
+ st = os.stat(path)
140
+ TypeError: stat: path should be string, bytes, os.PathLike or integer, not NoneType
141
+ Saving scheduler state dict to /work/llm_recipes/models/yans-sample-gemma-2-2b/iter_0000003/scheduler.pt
142
+ Saved scheduler state dict to /work/llm_recipes/models/yans-sample-gemma-2-2b/iter_0000003/scheduler.pt
143
+ Saving RNG states to /work/llm_recipes/models/yans-sample-gemma-2-2b/iter_0000003/rng.pt
144
+ Saved RNG states to /work/llm_recipes/models/yans-sample-gemma-2-2b/iter_0000003/rng.pt
wandb/run-20240812_063447-whqmtxyq/files/requirements.txt ADDED
@@ -0,0 +1,271 @@
1
+ absl-py==2.1.0
2
+ accelerate==0.33.0
3
+ aiohttp==3.9.1
4
+ aiosignal==1.3.1
5
+ annotated-types==0.6.0
6
+ apex==0.1
7
+ appdirs==1.4.4
8
+ argon2-cffi-bindings==21.2.0
9
+ argon2-cffi==23.1.0
10
+ asttokens==2.4.1
11
+ astunparse==1.6.3
12
+ async-timeout==4.0.3
13
+ attrs==23.2.0
14
+ audioread==3.0.1
15
+ beautifulsoup4==4.12.3
16
+ bleach==6.1.0
17
+ blis==0.7.11
18
+ cachetools==5.3.2
19
+ catalogue==2.0.10
20
+ certifi==2024.2.2
21
+ cffi==1.16.0
22
+ charset-normalizer==3.3.2
23
+ click==8.1.7
24
+ cloudpathlib==0.16.0
25
+ cloudpickle==3.0.0
26
+ cmake==3.28.1
27
+ colorama==0.4.6
28
+ comm==0.2.1
29
+ confection==0.1.4
30
+ contourpy==1.2.0
31
+ cubinlinker==0.3.0+2.g405ac64
32
+ cuda-python==12.3.0rc4+9.gdb8c48a.dirty
33
+ cudf==23.12.0
34
+ cugraph-dgl==23.12.0
35
+ cugraph-service-client==23.12.0
36
+ cugraph-service-server==23.12.0
37
+ cugraph==23.12.0
38
+ cuml==23.12.0
39
+ cupy-cuda12x==12.3.0
40
+ cycler==0.12.1
41
+ cymem==2.0.8
42
+ cython==3.0.8
43
+ dask-cuda==23.12.0
44
+ dask-cudf==23.12.0
45
+ dask==2023.11.0
46
+ debugpy==1.8.1
47
+ decorator==5.1.1
48
+ defusedxml==0.7.1
49
+ distributed==2023.11.0
50
+ dm-tree==0.1.8
51
+ docker-pycreds==0.4.0
52
+ einops==0.7.0
53
+ exceptiongroup==1.2.0
54
+ execnet==2.0.2
55
+ executing==2.0.1
56
+ expecttest==0.1.3
57
+ fastjsonschema==2.19.1
58
+ fastrlock==0.8.2
59
+ filelock==3.13.1
60
+ flash-attn==2.4.2
61
+ fonttools==4.48.1
62
+ frozenlist==1.4.1
63
+ fsspec==2023.12.2
64
+ gast==0.5.4
65
+ gitdb==4.0.11
66
+ gitpython==3.1.43
67
+ google-auth-oauthlib==0.4.6
68
+ google-auth==2.27.0
69
+ graphsurgeon==0.4.6
70
+ grpcio==1.60.1
71
+ huggingface-hub==0.24.5
72
+ hypothesis==5.35.1
73
+ idna==3.6
74
+ importlib-metadata==7.0.1
75
+ iniconfig==2.0.0
76
+ intel-openmp==2021.4.0
77
+ ipadic==1.0.0
78
+ ipykernel==6.29.2
79
+ ipython-genutils==0.2.0
80
+ ipython==8.21.0
81
+ jedi==0.19.1
82
+ jinja2==3.1.3
83
+ joblib==1.3.2
84
+ json5==0.9.14
85
+ jsonnet==0.19.1
86
+ jsonschema-specifications==2023.12.1
87
+ jsonschema==4.21.1
88
+ jupyter-client==8.6.0
89
+ jupyter-core==5.7.1
90
+ jupyter-tensorboard==0.2.0
91
+ jupyterlab-pygments==0.3.0
92
+ jupyterlab-server==1.2.0
93
+ jupyterlab==2.3.2
94
+ jupytext==1.16.1
95
+ kiwisolver==1.4.5
96
+ langcodes==3.3.0
97
+ lazy-loader==0.3
98
+ librosa==0.10.1
99
+ llvmlite==0.40.1
100
+ locket==1.0.0
101
+ logzero==1.7.0
102
+ lxml==5.2.2
103
+ markdown-it-py==3.0.0
104
+ markdown==3.5.2
105
+ markupsafe==2.1.4
106
+ matplotlib-inline==0.1.6
107
+ matplotlib==3.8.2
108
+ mdit-py-plugins==0.4.0
109
+ mdurl==0.1.2
110
+ mecab-python3==1.0.6
111
+ mistune==3.0.2
112
+ mkl-devel==2021.1.1
113
+ mkl-include==2021.1.1
114
+ mkl==2021.1.1
115
+ mock==5.1.0
116
+ more-itertools==9.1.0
117
+ mpmath==1.3.0
118
+ msgpack==1.0.7
119
+ multidict==6.0.4
120
+ murmurhash==1.0.10
121
+ nbclient==0.9.0
122
+ nbconvert==7.16.0
123
+ nbformat==5.9.2
124
+ nest-asyncio==1.6.0
125
+ networkx==2.6.3
126
+ ninja==1.11.1.1
127
+ nltk==3.8.1
128
+ notebook==6.4.10
129
+ numba==0.57.1+1.g1ff679645
130
+ numpy==1.24.4
131
+ nvfuser==0.1.4a0+d0bb811
132
+ nvidia-dali-cuda120==1.34.0
133
+ nvidia-pyindex==1.0.9
134
+ nvtx==0.2.5
135
+ oauthlib==3.2.2
136
+ onnx==1.15.0rc2
137
+ opencv==4.7.0
138
+ optree==0.10.0
139
+ packaging==23.2
140
+ pandas==1.5.3
141
+ pandocfilters==1.5.1
142
+ parso==0.8.3
143
+ partd==1.4.1
144
+ peft==0.11.1
145
+ pexpect==4.9.0
146
+ pillow==10.2.0
147
+ pip==24.0
148
+ platformdirs==4.2.0
149
+ pluggy==1.4.0
150
+ ply==3.11
151
+ polygraphy==0.49.4
152
+ pooch==1.8.0
153
+ portalocker==2.10.1
154
+ preshed==3.0.9
155
+ prettytable==3.9.0
156
+ prometheus-client==0.19.0
157
+ prompt-toolkit==3.0.43
158
+ protobuf==4.24.4
159
+ psutil==5.9.4
160
+ ptxcompiler==0.8.1+2.g0d406d6
161
+ ptyprocess==0.7.0
162
+ pure-eval==0.2.2
163
+ pyarrow==14.0.1.dev0+gba5374836.d20240125
164
+ pyasn1-modules==0.3.0
165
+ pyasn1==0.5.1
166
+ pybind11-global==2.11.1
167
+ pybind11==2.11.1
168
+ pycocotools==2.0+nv0.8.0
169
+ pycparser==2.21
170
+ pydantic-core==2.16.2
171
+ pydantic==2.6.1
172
+ pygments==2.17.2
173
+ pylibcugraph==23.12.0
174
+ pylibcugraphops==23.12.0
175
+ pylibraft==23.12.0
176
+ pynvml==11.4.1
177
+ pyparsing==3.1.1
178
+ pytest-flakefinder==1.1.0
179
+ pytest-rerunfailures==13.0
180
+ pytest-shard==0.1.2
181
+ pytest-xdist==3.5.0
182
+ pytest==8.0.0
183
+ python-dateutil==2.8.2
184
+ python-dotenv==1.0.0
185
+ python-hostlist==1.23.0
186
+ pytorch-quantization==2.1.2
187
+ pytz==2023.3.post1
188
+ pyyaml==6.0.1
189
+ pyzmq==25.1.2
190
+ raft-dask==23.12.0
191
+ rapids-dask-dependency==23.12.1
192
+ referencing==0.33.0
193
+ regex==2023.12.25
194
+ requests-oauthlib==1.3.1
195
+ requests==2.31.0
196
+ rich==13.7.0
197
+ rmm==23.12.0
198
+ rpds-py==0.17.1
199
+ rsa==4.9
200
+ sacrebleu==2.4.0
201
+ safetensors==0.4.3
202
+ scikit-learn==1.2.0
203
+ scipy==1.12.0
204
+ send2trash==1.8.2
205
+ sentencepiece==0.1.99
206
+ sentry-sdk==2.12.0
207
+ setproctitle==1.3.3
208
+ setuptools==68.2.2
209
+ six==1.16.0
210
+ smart-open==6.4.0
211
+ smmap==5.0.1
212
+ sortedcontainers==2.4.0
213
+ soundfile==0.12.1
214
+ soupsieve==2.5
215
+ soxr==0.3.7
216
+ spacy-legacy==3.0.12
217
+ spacy-loggers==1.0.5
218
+ spacy==3.7.2
219
+ sphinx-glpi-theme==0.6
220
+ srsly==2.4.8
221
+ stack-data==0.6.3
222
+ sympy==1.12
223
+ tabulate==0.9.0
224
+ tbb==2021.11.0
225
+ tblib==3.0.0
226
+ tensorboard-data-server==0.6.1
227
+ tensorboard-plugin-wit==1.8.1
228
+ tensorboard==2.9.0
229
+ tensorrt==8.6.3
230
+ terminado==0.18.0
231
+ termplotlib==0.3.9
232
+ thinc==8.2.3
233
+ threadpoolctl==3.2.0
234
+ thriftpy2==0.4.17
235
+ tinycss2==1.2.1
236
+ tokenizers==0.19.1
237
+ toml==0.10.2
238
+ tomli==2.0.1
239
+ toolz==0.12.1
240
+ torch-tensorrt==2.3.0a0
241
+ torch==2.3.0a0+ebedce2
242
+ torchdata==0.7.1a0
243
+ torchtext==0.17.0a0
244
+ torchvision==0.18.0a0
245
+ tornado==6.4
246
+ tqdm==4.66.1
247
+ traitlets==5.9.0
248
+ transformer-engine==1.3.0+5b90b7f
249
+ transformers==4.43.3
250
+ treelite-runtime==3.9.1
251
+ treelite==3.9.1
252
+ triton==2.2.0+e28a256
253
+ typer==0.9.0
254
+ types-dataclasses==0.6.6
255
+ typing-extensions==4.9.0
256
+ ucx-py==0.35.0
257
+ uff==0.6.9
258
+ ujson==5.8.0
259
+ urllib3==1.26.18
260
+ wandb==0.16.3
261
+ wasabi==1.1.2
262
+ wcwidth==0.2.13
263
+ weasel==0.3.4
264
+ webencodings==0.5.1
265
+ werkzeug==3.0.1
266
+ wheel==0.42.0
267
+ xdoctest==1.0.2
268
+ xgboost==1.7.6
269
+ yarl==1.9.4
270
+ zict==3.0.0
271
+ zipp==3.17.0
wandb/run-20240812_063447-whqmtxyq/files/wandb-metadata.json ADDED
@@ -0,0 +1,215 @@
1
+ {
2
+ "os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
3
+ "python": "3.10.12",
4
+ "heartbeatAt": "2024-08-11T21:34:47.942238",
5
+ "startedAt": "2024-08-11T21:34:47.345817",
6
+ "docker": null,
7
+ "cuda": null,
8
+ "args": [
9
+ "--seq-length",
10
+ "1021",
11
+ "--sliding-window-size",
12
+ "4096",
13
+ "--micro-batch-size",
14
+ "1",
15
+ "--global-batch-size",
16
+ "320",
17
+ "--train-iters",
18
+ "20000",
19
+ "--tokenizer-type",
20
+ "HFPreTrainedTokenizer",
21
+ "--tokenizer-model",
22
+ "/share/pretrained_lm/google/gemma-2-2b",
23
+ "--train-data-path",
24
+ "235289369",
25
+ "/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document",
26
+ "--valid-data-path",
27
+ "235289369",
28
+ "/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document",
29
+ "--test-data-path",
30
+ "235289369",
31
+ "/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document",
32
+ "--lr",
33
+ "2e-5",
34
+ "--min-lr",
35
+ "1e-6",
36
+ "--lr-decay-style",
37
+ "cosine",
38
+ "--lr-warmup-iters",
39
+ "500",
40
+ "--lr-decay-iters",
41
+ "20000",
42
+ "--weight-decay",
43
+ "0.1",
44
+ "--grad-clip-norm",
45
+ "1.0",
46
+ "--optimizer",
47
+ "anyprecision",
48
+ "--adam-beta1",
49
+ "0.9",
50
+ "--adam-beta2",
51
+ "0.95",
52
+ "--adam-eps",
53
+ "1e-6",
54
+ "--save-interval",
55
+ "3",
56
+ "--eval-interval",
57
+ "3",
58
+ "--eval-iters",
59
+ "10",
60
+ "--bf16",
61
+ "--mixed-precision",
62
+ "--base-model",
63
+ "/share/pretrained_lm/google/gemma-2-2b",
64
+ "--save",
65
+ "/work/llm_recipes/models/yans-sample-gemma-2-2b",
66
+ "--load",
67
+ "/work/llm_recipes/models/yans-sample-gemma-2-2b",
68
+ "--fsdp-activation-checkpointing",
69
+ "--sharding-strategy",
70
+ "FULL_SHARD",
71
+ "--checkpoint-type",
72
+ "LOCAL_STATE_DICT",
73
+ "--save-n-checkpoints",
74
+ "10",
75
+ "--hf-upload-retry-limit",
76
+ "2",
77
+ "--hf-repo-id",
78
+ "koichi12/yans-sample-gemma-2-2b",
79
+ "--wandb-entity",
80
+ "iwakawa-koichi-q5-tohoku-nlp6723",
81
+ "--wandb-project",
82
+ "llm_tutorial",
83
+ "--wandb-name",
84
+ "yans-sample-gemma-2-2b_train_2024-08-12-06:34:36"
85
+ ],
86
+ "state": "running",
87
+ "program": "/project/examples/finetuning.py",
88
+ "codePathLocal": "examples/finetuning.py",
89
+ "codePath": "examples/finetuning.py",
90
+ "git": {
91
+ "remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
92
+ "commit": "6da01327e78c302bc0cfdb335f3ca297e2a19c8c"
93
+ },
94
+ "email": null,
95
+ "root": "/project",
96
+ "host": "gpu-koiwa-00",
97
+ "username": "koiwa",
98
+ "executable": "/usr/bin/python",
99
+ "cpu_count": 18,
100
+ "cpu_count_logical": 18,
101
+ "cpu_freq": {
102
+ "current": 2400.0429999999997,
103
+ "min": 0.0,
104
+ "max": 0.0
105
+ },
106
+ "cpu_freq_per_core": [
107
+ {
108
+ "current": 2400.043,
109
+ "min": 0.0,
110
+ "max": 0.0
111
+ },
112
+ {
113
+ "current": 2400.043,
114
+ "min": 0.0,
115
+ "max": 0.0
116
+ },
117
+ {
118
+ "current": 2400.043,
119
+ "min": 0.0,
120
+ "max": 0.0
121
+ },
122
+ {
123
+ "current": 2400.043,
124
+ "min": 0.0,
125
+ "max": 0.0
126
+ },
127
+ {
128
+ "current": 2400.043,
129
+ "min": 0.0,
130
+ "max": 0.0
131
+ },
132
+ {
133
+ "current": 2400.043,
134
+ "min": 0.0,
135
+ "max": 0.0
136
+ },
137
+ {
138
+ "current": 2400.043,
139
+ "min": 0.0,
140
+ "max": 0.0
141
+ },
142
+ {
143
+ "current": 2400.043,
144
+ "min": 0.0,
145
+ "max": 0.0
146
+ },
147
+ {
148
+ "current": 2400.043,
149
+ "min": 0.0,
150
+ "max": 0.0
151
+ },
152
+ {
153
+ "current": 2400.043,
154
+ "min": 0.0,
155
+ "max": 0.0
156
+ },
157
+ {
158
+ "current": 2400.043,
159
+ "min": 0.0,
160
+ "max": 0.0
161
+ },
162
+ {
163
+ "current": 2400.043,
164
+ "min": 0.0,
165
+ "max": 0.0
166
+ },
167
+ {
168
+ "current": 2400.043,
169
+ "min": 0.0,
170
+ "max": 0.0
171
+ },
172
+ {
173
+ "current": 2400.043,
174
+ "min": 0.0,
175
+ "max": 0.0
176
+ },
177
+ {
178
+ "current": 2400.043,
179
+ "min": 0.0,
180
+ "max": 0.0
181
+ },
182
+ {
183
+ "current": 2400.043,
184
+ "min": 0.0,
185
+ "max": 0.0
186
+ },
187
+ {
188
+ "current": 2400.043,
189
+ "min": 0.0,
190
+ "max": 0.0
191
+ },
192
+ {
193
+ "current": 2400.043,
194
+ "min": 0.0,
195
+ "max": 0.0
196
+ }
197
+ ],
198
+ "disk": {
199
+ "/": {
200
+ "total": 0.0625,
201
+ "used": 1.1444091796875e-05
202
+ }
203
+ },
204
+ "gpu": "NVIDIA A100-SXM4-40GB",
205
+ "gpu_count": 1,
206
+ "gpu_devices": [
207
+ {
208
+ "name": "NVIDIA A100-SXM4-40GB",
209
+ "memory_total": 42949672960
210
+ }
211
+ ],
212
+ "memory": {
213
+ "total": 56.487823486328125
214
+ }
215
+ }
wandb/run-20240812_063447-whqmtxyq/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
1
+ {"training/loss": 15.54540729522705, "training/perplexity": 5640071.469138662, "utils/batch_size": 1, "utils/global_batch_size": 320, "utils/seq_len": 1022, "utils/gradient_accumulation_steps": 320, "utils/iteration": 3, "optimizer/lr": 1.114e-06, "optimizer/variance_l2": 0.0003583679885385243, "optimizer/variance_sqrt_l2": 0.3777214531330342, "optimizer/momentum_l2": 0.26258589724268894, "optimizer/weight_l2": 1167.8420269882395, "optimizer/variance_l1": 0.14256858825683594, "optimizer/variance_sqrt_l1": 5085.8125, "optimizer/momentum_l1": 3147.65625, "optimizer/weight_l1": 29773824.0, "optimizer/variance_abs_max": 7.009506225585938e-05, "optimizer/variance_sqrt_abs_max": 0.00836181640625, "optimizer/momentum_abs_max": 0.005950927734375, "optimizer/weight_abs_max": 12.9375, "stats/1_iteration_time": 83.53097534600056, "stats/tokens_per_sec": 3915.194317381553, "stats/tokens_per_sec_per_gpu": 3915.194317381553, "stats/tflops": 52.39645244456057, "_timestamp": 1723412421.3049276, "_runtime": 333.9461305141449, "_step": 3, "_wandb": {"runtime": 356}, "evaluation/val_loss": 15.414616584777832, "evaluation/val_ppl": 4948606.5}
wandb/run-20240812_063447-whqmtxyq/logs/debug-internal.log ADDED
@@ -0,0 +1,359 @@
1
+ 2024-08-12 06:34:47,360 INFO StreamThr :13101 [internal.py:wandb_internal():86] W&B internal server running at pid: 13101, started at: 2024-08-12 06:34:47.359620
2
+ 2024-08-12 06:34:47,362 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status
3
+ 2024-08-12 06:34:47,363 INFO WriterThread:13101 [datastore.py:open_for_write():87] open: /project/wandb/run-20240812_063447-whqmtxyq/run-whqmtxyq.wandb
4
+ 2024-08-12 06:34:47,364 DEBUG SenderThread:13101 [sender.py:send():382] send: header
5
+ 2024-08-12 06:34:47,378 DEBUG SenderThread:13101 [sender.py:send():382] send: run
6
+ 2024-08-12 06:34:47,829 INFO SenderThread:13101 [dir_watcher.py:__init__():211] watching files in: /project/wandb/run-20240812_063447-whqmtxyq/files
7
+ 2024-08-12 06:34:47,829 INFO SenderThread:13101 [sender.py:_start_run_threads():1136] run started: whqmtxyq with start time 1723412087.358797
8
+ 2024-08-12 06:34:47,835 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: check_version
9
+ 2024-08-12 06:34:47,835 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: check_version
10
+ 2024-08-12 06:34:47,922 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: run_start
11
+ 2024-08-12 06:34:47,929 DEBUG HandlerThread:13101 [system_info.py:__init__():27] System info init
12
+ 2024-08-12 06:34:47,929 DEBUG HandlerThread:13101 [system_info.py:__init__():42] System info init done
13
+ 2024-08-12 06:34:47,929 INFO HandlerThread:13101 [system_monitor.py:start():194] Starting system monitor
14
+ 2024-08-12 06:34:47,929 INFO SystemMonitor:13101 [system_monitor.py:_start():158] Starting system asset monitoring threads
15
+ 2024-08-12 06:34:47,929 INFO HandlerThread:13101 [system_monitor.py:probe():214] Collecting system info
16
+ 2024-08-12 06:34:47,930 INFO SystemMonitor:13101 [interfaces.py:start():190] Started cpu monitoring
17
+ 2024-08-12 06:34:47,930 INFO SystemMonitor:13101 [interfaces.py:start():190] Started disk monitoring
18
+ 2024-08-12 06:34:47,930 INFO SystemMonitor:13101 [interfaces.py:start():190] Started gpu monitoring
19
+ 2024-08-12 06:34:47,931 INFO SystemMonitor:13101 [interfaces.py:start():190] Started memory monitoring
20
+ 2024-08-12 06:34:47,932 INFO SystemMonitor:13101 [interfaces.py:start():190] Started network monitoring
21
+ 2024-08-12 06:34:47,942 DEBUG HandlerThread:13101 [system_info.py:probe():151] Probing system
22
+ 2024-08-12 06:34:47,944 DEBUG HandlerThread:13101 [system_info.py:_probe_git():136] Probing git
23
+ 2024-08-12 06:34:47,957 DEBUG HandlerThread:13101 [system_info.py:_probe_git():144] Probing git done
24
+ 2024-08-12 06:34:47,957 DEBUG HandlerThread:13101 [system_info.py:probe():199] Probing system done
25
+ 2024-08-12 06:34:47,957 DEBUG HandlerThread:13101 [system_monitor.py:probe():223] {'os': 'Linux-5.15.0-91-generic-x86_64-with-glibc2.35', 'python': '3.10.12', 'heartbeatAt': '2024-08-11T21:34:47.942238', 'startedAt': '2024-08-11T21:34:47.345817', 'docker': None, 'cuda': None, 'args': ('--seq-length', '1021', '--sliding-window-size', '4096', '--micro-batch-size', '1', '--global-batch-size', '320', '--train-iters', '20000', '--tokenizer-type', 'HFPreTrainedTokenizer', '--tokenizer-model', '/share/pretrained_lm/google/gemma-2-2b', '--train-data-path', '235289369', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document', '--valid-data-path', '235289369', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document', '--test-data-path', '235289369', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document', '--lr', '2e-5', '--min-lr', '1e-6', '--lr-decay-style', 'cosine', '--lr-warmup-iters', '500', '--lr-decay-iters', '20000', '--weight-decay', '0.1', '--grad-clip-norm', '1.0', '--optimizer', 'anyprecision', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-6', '--save-interval', '3', '--eval-interval', '3', '--eval-iters', '10', '--bf16', '--mixed-precision', '--base-model', '/share/pretrained_lm/google/gemma-2-2b', '--save', '/work/llm_recipes/models/yans-sample-gemma-2-2b', '--load', '/work/llm_recipes/models/yans-sample-gemma-2-2b', '--fsdp-activation-checkpointing', '--sharding-strategy', 'FULL_SHARD', '--checkpoint-type', 'LOCAL_STATE_DICT', '--save-n-checkpoints', '10', '--hf-upload-retry-limit', '2', '--hf-repo-id', 'koichi12/yans-sample-gemma-2-2b', '--wandb-entity', 'iwakawa-koichi-q5-tohoku-nlp6723', '--wandb-project', 'llm_tutorial', '--wandb-name', 'yans-sample-gemma-2-2b_train_2024-08-12-06:34:36'), 'state': 'running', 'program': '/project/examples/finetuning.py', 'codePathLocal': 'examples/finetuning.py', 'codePath': 'examples/finetuning.py', 'git': {'remote': 'https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git', 'commit': '6da01327e78c302bc0cfdb335f3ca297e2a19c8c'}, 'email': None, 'root': '/project', 'host': 'gpu-koiwa-00', 'username': 'koiwa', 'executable': '/usr/bin/python', 'cpu_count': 18, 'cpu_count_logical': 18, 'cpu_freq': {'current': 2400.0429999999997, 'min': 0.0, 'max': 0.0}, 'cpu_freq_per_core': [{'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}], 'disk': {'/': {'total': 0.0625, 'used': 1.1444091796875e-05}}, 'gpu': 'NVIDIA A100-SXM4-40GB', 'gpu_count': 1, 'gpu_devices': [{'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}], 'memory': {'total': 56.487823486328125}}
26
+ 2024-08-12 06:34:47,957 INFO HandlerThread:13101 [system_monitor.py:probe():224] Finished collecting system info
27
+ 2024-08-12 06:34:47,957 INFO HandlerThread:13101 [system_monitor.py:probe():227] Publishing system info
28
+ 2024-08-12 06:34:47,958 INFO HandlerThread:13101 [system_monitor.py:probe():229] Finished publishing system info
29
+ 2024-08-12 06:34:47,964 DEBUG SenderThread:13101 [sender.py:send():382] send: files
30
+ 2024-08-12 06:34:47,964 INFO SenderThread:13101 [sender.py:_save_file():1403] saving file wandb-metadata.json with policy now
31
+ 2024-08-12 06:34:47,974 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: python_packages
32
+ 2024-08-12 06:34:47,974 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: python_packages
33
+ 2024-08-12 06:34:47,975 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: stop_status
34
+ 2024-08-12 06:34:47,975 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: internal_messages
35
+ 2024-08-12 06:34:47,976 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: stop_status
36
+ 2024-08-12 06:34:48,281 DEBUG SenderThread:13101 [sender.py:send():382] send: telemetry
37
+ 2024-08-12 06:34:48,615 INFO wandb-upload_0:13101 [upload_job.py:push():131] Uploaded file /tmp/tmpxyme_qqmwandb/cck49p4b-wandb-metadata.json
38
+ 2024-08-12 06:34:48,831 INFO Thread-12 :13101 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240812_063447-whqmtxyq/files/output.log
39
+ 2024-08-12 06:34:48,831 INFO Thread-12 :13101 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240812_063447-whqmtxyq/files/requirements.txt
40
+ 2024-08-12 06:34:48,832 INFO Thread-12 :13101 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240812_063447-whqmtxyq/files/wandb-metadata.json
41
+ 2024-08-12 06:34:50,832 INFO Thread-12 :13101 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_063447-whqmtxyq/files/output.log
42
+ 2024-08-12 06:34:52,543 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
43
+ 2024-08-12 06:34:52,833 INFO Thread-12 :13101 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_063447-whqmtxyq/files/output.log
44
+ 2024-08-12 06:34:57,543 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
45
+ 2024-08-12 06:35:02,544 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
46
+ 2024-08-12 06:35:02,974 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: internal_messages
47
+ 2024-08-12 06:35:02,974 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: stop_status
48
+ 2024-08-12 06:35:02,975 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: stop_status
49
+ 2024-08-12 06:35:08,234 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
50
+ 2024-08-12 06:35:13,235 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
51
+ 2024-08-12 06:35:17,973 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: stop_status
52
+ 2024-08-12 06:35:17,974 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: stop_status
53
+ 2024-08-12 06:35:18,016 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: internal_messages
54
+ 2024-08-12 06:35:18,247 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
55
+ 2024-08-12 06:35:18,849 INFO Thread-12 :13101 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_063447-whqmtxyq/files/config.yaml
56
+ 2024-08-12 06:35:23,452 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
57
+ 2024-08-12 06:35:28,453 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
58
+ 2024-08-12 06:35:32,974 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: stop_status
59
+ 2024-08-12 06:35:32,974 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: stop_status
60
+ 2024-08-12 06:35:33,016 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: internal_messages
61
+ 2024-08-12 06:35:34,202 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
62
+ 2024-08-12 06:35:39,202 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
63
+ 2024-08-12 06:35:44,203 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
64
+ 2024-08-12 06:35:47,932 DEBUG SystemMonitor:13101 [system_monitor.py:_start():172] Starting system metrics aggregation loop
65
+ 2024-08-12 06:35:47,934 DEBUG SenderThread:13101 [sender.py:send():382] send: stats
66
+ 2024-08-12 06:35:47,973 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: stop_status
67
+ 2024-08-12 06:35:47,974 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: stop_status
68
+ 2024-08-12 06:35:48,016 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: internal_messages
69
+ 2024-08-12 06:35:49,237 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
70
+ 2024-08-12 06:35:54,238 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
71
+ 2024-08-12 06:35:59,239 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
72
+ 2024-08-12 06:36:02,974 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: stop_status
73
+ 2024-08-12 06:36:02,974 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: stop_status
74
+ 2024-08-12 06:36:03,016 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: internal_messages
75
+ 2024-08-12 06:36:05,234 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
76
+ 2024-08-12 06:36:08,884 INFO Thread-12 :13101 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_063447-whqmtxyq/files/output.log
77
+ 2024-08-12 06:36:09,915 DEBUG SenderThread:13101 [sender.py:send():382] send: config
78
+ 2024-08-12 06:36:09,915 DEBUG SenderThread:13101 [sender.py:send():382] send: config
79
+ 2024-08-12 06:36:10,885 INFO Thread-12 :13101 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_063447-whqmtxyq/files/output.log
80
+ 2024-08-12 06:36:11,119 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
81
+ 2024-08-12 06:36:16,120 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
82
+ 2024-08-12 06:36:17,935 DEBUG SenderThread:13101 [sender.py:send():382] send: stats
83
+ 2024-08-12 06:36:17,974 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: stop_status
84
+ 2024-08-12 06:36:17,974 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: stop_status
85
+ 2024-08-12 06:36:17,975 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: internal_messages
86
+ 2024-08-12 06:36:21,237 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
87
+ 2024-08-12 06:36:21,893 INFO Thread-12 :13101 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_063447-whqmtxyq/files/config.yaml
88
+ 2024-08-12 06:36:26,451 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
89
+ 2024-08-12 06:36:31,452 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
90
+ 2024-08-12 06:36:32,974 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: stop_status
91
+ 2024-08-12 06:36:32,974 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: stop_status
92
+ 2024-08-12 06:36:33,016 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: internal_messages
93
+ 2024-08-12 06:36:37,174 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
94
+ 2024-08-12 06:36:42,174 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
95
+ 2024-08-12 06:36:47,175 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
96
+ 2024-08-12 06:36:47,936 DEBUG SenderThread:13101 [sender.py:send():382] send: stats
97
+ 2024-08-12 06:36:47,975 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: stop_status
98
+ 2024-08-12 06:36:47,975 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: stop_status
99
+ 2024-08-12 06:36:48,016 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: internal_messages
100
+ 2024-08-12 06:36:52,199 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
101
+ 2024-08-12 06:36:57,199 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
102
+ 2024-08-12 06:37:02,200 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
103
+ 2024-08-12 06:37:02,975 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: stop_status
104
+ 2024-08-12 06:37:02,975 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: stop_status
105
+ 2024-08-12 06:37:03,016 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: internal_messages
106
+ 2024-08-12 06:37:08,177 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
107
+ 2024-08-12 06:37:13,178 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
108
+ 2024-08-12 06:37:17,937 DEBUG SenderThread:13101 [sender.py:send():382] send: stats
109
+ 2024-08-12 06:37:17,975 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: stop_status
110
+ 2024-08-12 06:37:17,975 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: stop_status
111
+ 2024-08-12 06:37:18,016 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: internal_messages
112
+ 2024-08-12 06:37:18,239 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
113
+ 2024-08-12 06:37:23,240 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
114
+ 2024-08-12 06:37:28,240 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
115
+ 2024-08-12 06:37:32,975 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: stop_status
116
+ 2024-08-12 06:37:32,975 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: stop_status
117
+ 2024-08-12 06:37:33,016 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: internal_messages
118
+ 2024-08-12 06:37:33,471 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: partial_history
119
+ 2024-08-12 06:37:33,513 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
120
+ 2024-08-12 06:37:34,938 INFO Thread-12 :13101 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_063447-whqmtxyq/files/output.log
121
+ 2024-08-12 06:37:38,514 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
122
+ 2024-08-12 06:37:43,515 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
123
+ 2024-08-12 06:37:47,938 DEBUG SenderThread:13101 [sender.py:send():382] send: stats
124
+ 2024-08-12 06:37:47,975 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: stop_status
125
+ 2024-08-12 06:37:47,975 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: stop_status
126
+ 2024-08-12 06:37:47,977 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: internal_messages
127
+ 2024-08-12 06:37:49,236 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
128
+ 2024-08-12 06:37:54,236 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
129
+ 2024-08-12 06:37:59,237 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
130
+ 2024-08-12 06:38:02,975 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: stop_status
131
+ 2024-08-12 06:38:02,975 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: stop_status
132
+ 2024-08-12 06:38:03,016 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: internal_messages
133
+ 2024-08-12 06:38:05,173 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
134
+ 2024-08-12 06:38:10,174 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
135
+ 2024-08-12 06:38:15,175 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
136
+ 2024-08-12 06:38:17,940 DEBUG SenderThread:13101 [sender.py:send():382] send: stats
137
+ 2024-08-12 06:38:17,975 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: stop_status
138
+ 2024-08-12 06:38:17,975 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: stop_status
139
+ 2024-08-12 06:38:18,016 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: internal_messages
140
+ 2024-08-12 06:38:20,189 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
141
+ 2024-08-12 06:38:25,189 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
142
+ 2024-08-12 06:38:30,190 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
143
+ 2024-08-12 06:38:32,975 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: stop_status
144
+ 2024-08-12 06:38:32,976 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: stop_status
145
+ 2024-08-12 06:38:33,016 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: internal_messages
146
+ 2024-08-12 06:38:36,181 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
147
+ 2024-08-12 06:38:41,181 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
148
+ 2024-08-12 06:38:46,182 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
149
+ 2024-08-12 06:38:47,941 DEBUG SenderThread:13101 [sender.py:send():382] send: stats
150
+ 2024-08-12 06:38:47,975 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: stop_status
151
+ 2024-08-12 06:38:47,976 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: stop_status
152
+ 2024-08-12 06:38:48,016 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: internal_messages
153
+ 2024-08-12 06:38:52,158 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
154
+ 2024-08-12 06:38:57,068 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: partial_history
155
+ 2024-08-12 06:38:57,070 DEBUG SenderThread:13101 [sender.py:send():382] send: history
156
+ 2024-08-12 06:38:57,071 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: summary_record
157
+ 2024-08-12 06:38:57,072 INFO SenderThread:13101 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
158
+ 2024-08-12 06:38:57,991 INFO Thread-12 :13101 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240812_063447-whqmtxyq/files/wandb-summary.json
159
+ 2024-08-12 06:38:58,109 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
160
+ 2024-08-12 06:38:58,991 INFO Thread-12 :13101 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_063447-whqmtxyq/files/output.log
161
+ 2024-08-12 06:39:02,976 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: stop_status
162
+ 2024-08-12 06:39:02,977 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: internal_messages
163
+ 2024-08-12 06:39:02,977 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: stop_status
164
+ 2024-08-12 06:39:03,220 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
165
+ 2024-08-12 06:39:08,220 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
166
+ 2024-08-12 06:39:13,221 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
167
+ 2024-08-12 06:39:17,942 DEBUG SenderThread:13101 [sender.py:send():382] send: stats
168
+ 2024-08-12 06:39:17,976 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: stop_status
169
+ 2024-08-12 06:39:17,977 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: stop_status
170
+ 2024-08-12 06:39:18,020 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: internal_messages
171
+ 2024-08-12 06:39:19,166 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
172
+ 2024-08-12 06:39:24,167 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
173
+ 2024-08-12 06:39:29,167 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
174
+ 2024-08-12 06:39:32,976 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: stop_status
175
+ 2024-08-12 06:39:32,976 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: stop_status
176
+ 2024-08-12 06:39:33,016 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: internal_messages
177
+ 2024-08-12 06:39:34,262 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
178
+ 2024-08-12 06:39:39,263 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
179
+ 2024-08-12 06:39:44,264 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
180
+ 2024-08-12 06:39:47,943 DEBUG SenderThread:13101 [sender.py:send():382] send: stats
181
+ 2024-08-12 06:39:47,976 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: stop_status
182
+ 2024-08-12 06:39:47,976 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: stop_status
183
+ 2024-08-12 06:39:48,016 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: internal_messages
184
+ 2024-08-12 06:39:50,213 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
185
+ 2024-08-12 06:39:55,214 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
186
+ 2024-08-12 06:40:00,215 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
187
+ 2024-08-12 06:40:02,976 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: stop_status
188
+ 2024-08-12 06:40:02,977 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: stop_status
189
+ 2024-08-12 06:40:03,016 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: internal_messages
190
+ 2024-08-12 06:40:05,253 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
191
+ 2024-08-12 06:40:10,254 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
192
+ 2024-08-12 06:40:15,254 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
193
+ 2024-08-12 06:40:17,944 DEBUG SenderThread:13101 [sender.py:send():382] send: stats
194
+ 2024-08-12 06:40:17,976 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: stop_status
195
+ 2024-08-12 06:40:17,977 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: stop_status
196
+ 2024-08-12 06:40:18,016 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: internal_messages
197
+ 2024-08-12 06:40:20,601 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: partial_history
198
+ 2024-08-12 06:40:20,603 DEBUG SenderThread:13101 [sender.py:send():382] send: history
199
+ 2024-08-12 06:40:20,604 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: summary_record
200
+ 2024-08-12 06:40:20,604 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
201
+ 2024-08-12 06:40:20,605 INFO SenderThread:13101 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
202
+ 2024-08-12 06:40:21,044 INFO Thread-12 :13101 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_063447-whqmtxyq/files/output.log
203
+ 2024-08-12 06:40:21,045 INFO Thread-12 :13101 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_063447-whqmtxyq/files/wandb-summary.json
204
+ 2024-08-12 06:40:21,305 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: partial_history
205
+ 2024-08-12 06:40:23,046 INFO Thread-12 :13101 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_063447-whqmtxyq/files/output.log
206
+ 2024-08-12 06:40:26,337 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
207
+ 2024-08-12 06:40:31,051 INFO Thread-12 :13101 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_063447-whqmtxyq/files/output.log
208
+ 2024-08-12 06:40:32,226 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
209
+ 2024-08-12 06:40:32,977 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: stop_status
210
+ 2024-08-12 06:40:32,977 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: stop_status
211
+ 2024-08-12 06:40:32,979 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: internal_messages
212
+ 2024-08-12 06:40:37,055 INFO Thread-12 :13101 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_063447-whqmtxyq/files/output.log
213
+ 2024-08-12 06:40:37,381 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
214
+ 2024-08-12 06:40:42,382 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
215
+ 2024-08-12 06:40:44,855 DEBUG SenderThread:13101 [sender.py:send():382] send: exit
216
+ 2024-08-12 06:40:44,856 INFO SenderThread:13101 [sender.py:send_exit():589] handling exit code: 1
217
+ 2024-08-12 06:40:44,856 INFO SenderThread:13101 [sender.py:send_exit():591] handling runtime: 356
218
+ 2024-08-12 06:40:44,857 INFO SenderThread:13101 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
219
+ 2024-08-12 06:40:44,857 INFO SenderThread:13101 [sender.py:send_exit():597] send defer
220
+ 2024-08-12 06:40:44,857 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: defer
221
+ 2024-08-12 06:40:44,857 INFO HandlerThread:13101 [handler.py:handle_request_defer():172] handle defer: 0
222
+ 2024-08-12 06:40:44,857 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: defer
223
+ 2024-08-12 06:40:44,857 INFO SenderThread:13101 [sender.py:send_request_defer():613] handle sender defer: 0
224
+ 2024-08-12 06:40:44,857 INFO SenderThread:13101 [sender.py:transition_state():617] send defer: 1
225
+ 2024-08-12 06:40:44,858 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: defer
226
+ 2024-08-12 06:40:44,858 INFO HandlerThread:13101 [handler.py:handle_request_defer():172] handle defer: 1
227
+ 2024-08-12 06:40:44,858 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: defer
228
+ 2024-08-12 06:40:44,858 INFO SenderThread:13101 [sender.py:send_request_defer():613] handle sender defer: 1
229
+ 2024-08-12 06:40:44,858 INFO SenderThread:13101 [sender.py:transition_state():617] send defer: 2
230
+ 2024-08-12 06:40:44,858 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: defer
231
+ 2024-08-12 06:40:44,858 INFO HandlerThread:13101 [handler.py:handle_request_defer():172] handle defer: 2
232
+ 2024-08-12 06:40:44,858 INFO HandlerThread:13101 [system_monitor.py:finish():203] Stopping system monitor
233
+ 2024-08-12 06:40:44,858 DEBUG SystemMonitor:13101 [system_monitor.py:_start():179] Finished system metrics aggregation loop
234
+ 2024-08-12 06:40:44,858 INFO HandlerThread:13101 [interfaces.py:finish():202] Joined cpu monitor
235
+ 2024-08-12 06:40:44,859 DEBUG SystemMonitor:13101 [system_monitor.py:_start():183] Publishing last batch of metrics
236
+ 2024-08-12 06:40:44,859 INFO HandlerThread:13101 [interfaces.py:finish():202] Joined disk monitor
237
+ 2024-08-12 06:40:44,893 INFO HandlerThread:13101 [interfaces.py:finish():202] Joined gpu monitor
238
+ 2024-08-12 06:40:44,893 INFO HandlerThread:13101 [interfaces.py:finish():202] Joined memory monitor
239
+ 2024-08-12 06:40:44,893 INFO HandlerThread:13101 [interfaces.py:finish():202] Joined network monitor
240
+ 2024-08-12 06:40:44,894 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: defer
241
+ 2024-08-12 06:40:44,894 INFO SenderThread:13101 [sender.py:send_request_defer():613] handle sender defer: 2
242
+ 2024-08-12 06:40:44,894 INFO SenderThread:13101 [sender.py:transition_state():617] send defer: 3
243
+ 2024-08-12 06:40:44,894 DEBUG SenderThread:13101 [sender.py:send():382] send: stats
244
+ 2024-08-12 06:40:44,894 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: defer
245
+ 2024-08-12 06:40:44,894 INFO HandlerThread:13101 [handler.py:handle_request_defer():172] handle defer: 3
246
+ 2024-08-12 06:40:44,896 DEBUG SenderThread:13101 [sender.py:send():382] send: history
247
+ 2024-08-12 06:40:44,896 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: summary_record
248
+ 2024-08-12 06:40:44,897 INFO SenderThread:13101 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
249
+ 2024-08-12 06:40:44,898 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: defer
250
+ 2024-08-12 06:40:44,898 INFO SenderThread:13101 [sender.py:send_request_defer():613] handle sender defer: 3
251
+ 2024-08-12 06:40:44,898 INFO SenderThread:13101 [sender.py:transition_state():617] send defer: 4
252
+ 2024-08-12 06:40:44,898 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: defer
253
+ 2024-08-12 06:40:44,898 INFO HandlerThread:13101 [handler.py:handle_request_defer():172] handle defer: 4
254
+ 2024-08-12 06:40:44,898 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: defer
255
+ 2024-08-12 06:40:44,898 INFO SenderThread:13101 [sender.py:send_request_defer():613] handle sender defer: 4
256
+ 2024-08-12 06:40:44,898 INFO SenderThread:13101 [sender.py:transition_state():617] send defer: 5
257
+ 2024-08-12 06:40:44,898 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: defer
258
+ 2024-08-12 06:40:44,898 INFO HandlerThread:13101 [handler.py:handle_request_defer():172] handle defer: 5
259
+ 2024-08-12 06:40:44,899 DEBUG SenderThread:13101 [sender.py:send():382] send: summary
260
+ 2024-08-12 06:40:44,900 INFO SenderThread:13101 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
261
+ 2024-08-12 06:40:44,900 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: defer
262
+ 2024-08-12 06:40:44,900 INFO SenderThread:13101 [sender.py:send_request_defer():613] handle sender defer: 5
263
+ 2024-08-12 06:40:44,900 INFO SenderThread:13101 [sender.py:transition_state():617] send defer: 6
264
+ 2024-08-12 06:40:44,900 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: defer
265
+ 2024-08-12 06:40:44,900 INFO HandlerThread:13101 [handler.py:handle_request_defer():172] handle defer: 6
266
+ 2024-08-12 06:40:44,900 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: defer
267
+ 2024-08-12 06:40:44,900 INFO SenderThread:13101 [sender.py:send_request_defer():613] handle sender defer: 6
268
+ 2024-08-12 06:40:44,901 INFO SenderThread:13101 [sender.py:transition_state():617] send defer: 7
269
+ 2024-08-12 06:40:44,901 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
270
+ 2024-08-12 06:40:44,901 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: defer
271
+ 2024-08-12 06:40:44,901 INFO HandlerThread:13101 [handler.py:handle_request_defer():172] handle defer: 7
272
+ 2024-08-12 06:40:44,901 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: defer
273
+ 2024-08-12 06:40:44,901 INFO SenderThread:13101 [sender.py:send_request_defer():613] handle sender defer: 7
274
+ 2024-08-12 06:40:45,060 INFO Thread-12 :13101 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_063447-whqmtxyq/files/output.log
275
+ 2024-08-12 06:40:45,061 INFO Thread-12 :13101 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_063447-whqmtxyq/files/wandb-summary.json
276
+ 2024-08-12 06:40:45,855 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: poll_exit
277
+ 2024-08-12 06:40:47,007 INFO SenderThread:13101 [sender.py:transition_state():617] send defer: 8
278
+ 2024-08-12 06:40:47,007 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: poll_exit
279
+ 2024-08-12 06:40:47,007 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: defer
280
+ 2024-08-12 06:40:47,008 INFO HandlerThread:13101 [handler.py:handle_request_defer():172] handle defer: 8
281
+ 2024-08-12 06:40:47,008 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: defer
282
+ 2024-08-12 06:40:47,008 INFO SenderThread:13101 [sender.py:send_request_defer():613] handle sender defer: 8
283
+ 2024-08-12 06:40:47,008 INFO SenderThread:13101 [job_builder.py:build():296] Attempting to build job artifact
284
+ 2024-08-12 06:40:47,009 INFO SenderThread:13101 [job_builder.py:_get_source_type():426] is repo sourced job
285
+ 2024-08-12 06:40:47,023 INFO SenderThread:13101 [job_builder.py:build():402] adding wandb-job metadata file
286
+ 2024-08-12 06:40:47,031 INFO SenderThread:13101 [sender.py:transition_state():617] send defer: 9
287
+ 2024-08-12 06:40:47,032 DEBUG SenderThread:13101 [sender.py:send():382] send: artifact
288
+ 2024-08-12 06:40:47,032 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: defer
289
+ 2024-08-12 06:40:47,033 INFO HandlerThread:13101 [handler.py:handle_request_defer():172] handle defer: 9
290
+ 2024-08-12 06:40:47,062 INFO Thread-12 :13101 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_063447-whqmtxyq/files/output.log
291
+ 2024-08-12 06:40:47,856 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: poll_exit
292
+ 2024-08-12 06:40:47,912 INFO SenderThread:13101 [sender.py:send_artifact():1494] sent artifact job-https___github.com_cl-tohoku_llm-recipes-failab-m1-yans.git_examples_finetuning.py - {'id': 'QXJ0aWZhY3Q6MTE0MDA5NDY1MQ==', 'state': 'COMMITTED', 'artifactSequence': {'id': 'QXJ0aWZhY3RDb2xsZWN0aW9uOjM2MjY3MjMzNA==', 'latestArtifact': {'id': 'QXJ0aWZhY3Q6MTE0MDA5NDY1MQ==', 'versionIndex': 9}}}
293
+ 2024-08-12 06:40:47,912 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: defer
294
+ 2024-08-12 06:40:47,912 INFO SenderThread:13101 [sender.py:send_request_defer():613] handle sender defer: 9
295
+ 2024-08-12 06:40:47,913 INFO SenderThread:13101 [dir_watcher.py:finish():358] shutting down directory watcher
296
+ 2024-08-12 06:40:48,063 INFO SenderThread:13101 [dir_watcher.py:finish():388] scan: /project/wandb/run-20240812_063447-whqmtxyq/files
297
+ 2024-08-12 06:40:48,063 INFO SenderThread:13101 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_063447-whqmtxyq/files/requirements.txt requirements.txt
298
+ 2024-08-12 06:40:48,063 INFO SenderThread:13101 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_063447-whqmtxyq/files/config.yaml config.yaml
299
+ 2024-08-12 06:40:48,065 INFO SenderThread:13101 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_063447-whqmtxyq/files/wandb-metadata.json wandb-metadata.json
300
+ 2024-08-12 06:40:48,065 INFO SenderThread:13101 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_063447-whqmtxyq/files/wandb-summary.json wandb-summary.json
301
+ 2024-08-12 06:40:48,067 INFO SenderThread:13101 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_063447-whqmtxyq/files/output.log output.log
302
+ 2024-08-12 06:40:48,067 INFO SenderThread:13101 [sender.py:transition_state():617] send defer: 10
303
+ 2024-08-12 06:40:48,068 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: poll_exit
304
+ 2024-08-12 06:40:48,069 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: defer
305
+ 2024-08-12 06:40:48,069 INFO HandlerThread:13101 [handler.py:handle_request_defer():172] handle defer: 10
306
+ 2024-08-12 06:40:48,070 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: defer
307
+ 2024-08-12 06:40:48,071 INFO SenderThread:13101 [sender.py:send_request_defer():613] handle sender defer: 10
308
+ 2024-08-12 06:40:48,071 INFO SenderThread:13101 [file_pusher.py:finish():172] shutting down file pusher
309
+ 2024-08-12 06:40:48,555 INFO wandb-upload_1:13101 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240812_063447-whqmtxyq/files/config.yaml
310
+ 2024-08-12 06:40:48,607 INFO wandb-upload_0:13101 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240812_063447-whqmtxyq/files/requirements.txt
311
+ 2024-08-12 06:40:48,857 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: poll_exit
312
+ 2024-08-12 06:40:48,857 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: poll_exit
313
+ 2024-08-12 06:40:49,047 INFO wandb-upload_2:13101 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240812_063447-whqmtxyq/files/wandb-summary.json
314
+ 2024-08-12 06:40:49,065 INFO wandb-upload_3:13101 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240812_063447-whqmtxyq/files/output.log
315
+ 2024-08-12 06:40:49,265 INFO Thread-11 (_thread_body):13101 [sender.py:transition_state():617] send defer: 11
316
+ 2024-08-12 06:40:49,266 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: defer
317
+ 2024-08-12 06:40:49,266 INFO HandlerThread:13101 [handler.py:handle_request_defer():172] handle defer: 11
318
+ 2024-08-12 06:40:49,266 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: defer
319
+ 2024-08-12 06:40:49,266 INFO SenderThread:13101 [sender.py:send_request_defer():613] handle sender defer: 11
320
+ 2024-08-12 06:40:49,266 INFO SenderThread:13101 [file_pusher.py:join():178] waiting for file pusher
321
+ 2024-08-12 06:40:49,266 INFO SenderThread:13101 [sender.py:transition_state():617] send defer: 12
322
+ 2024-08-12 06:40:49,267 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: defer
323
+ 2024-08-12 06:40:49,267 INFO HandlerThread:13101 [handler.py:handle_request_defer():172] handle defer: 12
324
+ 2024-08-12 06:40:49,267 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: defer
325
+ 2024-08-12 06:40:49,267 INFO SenderThread:13101 [sender.py:send_request_defer():613] handle sender defer: 12
326
+ 2024-08-12 06:40:49,267 INFO SenderThread:13101 [file_stream.py:finish():595] file stream finish called
327
+ 2024-08-12 06:40:49,435 INFO SenderThread:13101 [file_stream.py:finish():599] file stream finish is done
328
+ 2024-08-12 06:40:49,435 INFO SenderThread:13101 [sender.py:transition_state():617] send defer: 13
329
+ 2024-08-12 06:40:49,436 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: defer
330
+ 2024-08-12 06:40:49,436 INFO HandlerThread:13101 [handler.py:handle_request_defer():172] handle defer: 13
331
+ 2024-08-12 06:40:49,436 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: defer
332
+ 2024-08-12 06:40:49,436 INFO SenderThread:13101 [sender.py:send_request_defer():613] handle sender defer: 13
333
+ 2024-08-12 06:40:49,436 INFO SenderThread:13101 [sender.py:transition_state():617] send defer: 14
334
+ 2024-08-12 06:40:49,436 DEBUG SenderThread:13101 [sender.py:send():382] send: final
335
+ 2024-08-12 06:40:49,436 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: defer
336
+ 2024-08-12 06:40:49,436 DEBUG SenderThread:13101 [sender.py:send():382] send: footer
337
+ 2024-08-12 06:40:49,436 INFO HandlerThread:13101 [handler.py:handle_request_defer():172] handle defer: 14
338
+ 2024-08-12 06:40:49,437 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: defer
339
+ 2024-08-12 06:40:49,437 INFO SenderThread:13101 [sender.py:send_request_defer():613] handle sender defer: 14
340
+ 2024-08-12 06:40:49,437 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: poll_exit
341
+ 2024-08-12 06:40:49,437 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: poll_exit
342
+ 2024-08-12 06:40:49,438 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: poll_exit
343
+ 2024-08-12 06:40:49,438 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: poll_exit
344
+ 2024-08-12 06:40:49,438 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: server_info
345
+ 2024-08-12 06:40:49,438 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: server_info
346
+ 2024-08-12 06:40:49,439 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: get_summary
347
+ 2024-08-12 06:40:49,440 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: sampled_history
348
+ 2024-08-12 06:40:49,442 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: internal_messages
349
+ 2024-08-12 06:40:49,442 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: job_info
350
+ 2024-08-12 06:40:49,609 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: job_info
351
+ 2024-08-12 06:40:49,610 INFO MainThread:13101 [wandb_run.py:_footer_history_summary_info():3866] rendering history
352
+ 2024-08-12 06:40:49,610 INFO MainThread:13101 [wandb_run.py:_footer_history_summary_info():3898] rendering summary
353
+ 2024-08-12 06:40:49,611 INFO MainThread:13101 [wandb_run.py:_footer_sync_info():3825] logging synced files
354
+ 2024-08-12 06:40:49,611 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: shutdown
355
+ 2024-08-12 06:40:49,611 INFO HandlerThread:13101 [handler.py:finish():869] shutting down handler
356
+ 2024-08-12 06:40:50,442 INFO WriterThread:13101 [datastore.py:close():296] close: /project/wandb/run-20240812_063447-whqmtxyq/run-whqmtxyq.wandb
357
+ 2024-08-12 06:40:50,610 INFO SenderThread:13101 [sender.py:finish():1572] shutting down sender
358
+ 2024-08-12 06:40:50,610 INFO SenderThread:13101 [file_pusher.py:finish():172] shutting down file pusher
359
+ 2024-08-12 06:40:50,610 INFO SenderThread:13101 [file_pusher.py:join():178] waiting for file pusher
wandb/run-20240812_063447-whqmtxyq/logs/debug.log ADDED
@@ -0,0 +1,30 @@
+ 2024-08-12 06:34:47,351 INFO MainThread:13030 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
+ 2024-08-12 06:34:47,352 INFO MainThread:13030 [wandb_setup.py:_flush():76] Configure stats pid to 13030
+ 2024-08-12 06:34:47,352 INFO MainThread:13030 [wandb_setup.py:_flush():76] Loading settings from /singularity_home/.config/wandb/settings
+ 2024-08-12 06:34:47,352 INFO MainThread:13030 [wandb_setup.py:_flush():76] Loading settings from /project/wandb/settings
+ 2024-08-12 06:34:47,352 INFO MainThread:13030 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'api_key': '***REDACTED***', 'run_notes': 'Train sample'}
+ 2024-08-12 06:34:47,352 INFO MainThread:13030 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
+ 2024-08-12 06:34:47,352 INFO MainThread:13030 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'examples/finetuning.py', 'program_abspath': '/project/examples/finetuning.py', 'program': '/project/examples/finetuning.py'}
+ 2024-08-12 06:34:47,352 INFO MainThread:13030 [wandb_init.py:_log_setup():526] Logging user logs to /project/wandb/run-20240812_063447-whqmtxyq/logs/debug.log
+ 2024-08-12 06:34:47,352 INFO MainThread:13030 [wandb_init.py:_log_setup():527] Logging internal logs to /project/wandb/run-20240812_063447-whqmtxyq/logs/debug-internal.log
+ 2024-08-12 06:34:47,352 INFO MainThread:13030 [wandb_init.py:init():566] calling init triggers
+ 2024-08-12 06:34:47,352 INFO MainThread:13030 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
+ config: {'sharding_strategy': 'FULL_SHARD', 'checkpoint_type': 'LOCAL_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'low_cpu_fsdp': False, 'no_meta_device': False, 'data_path': None, 'split': '969, 30, 1', 'train_data_path': ['235289369', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document'], 'valid_data_path': ['235289369', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document'], 'test_data_path': ['235289369', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document'], 'data_cache_path': None, 'vocab_size': None, 'vocab_file': None, 'merge_file': None, 'seq_length': 1021, 'num_workers': 2, 'tokenizer_type': 'HFPreTrainedTokenizer', 'tokenizer_model': '/share/pretrained_lm/google/gemma-2-2b', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'retro_return_doc_ids': False, 'short_seq_prob': 0.1, 'vocab_extra_ids': 0, 'seed': 1234, 'use_mpi': False, 'wandb_entity': 'iwakawa-koichi-q5-tohoku-nlp6723', 'wandb_name': 'yans-sample-gemma-2-2b_train_2024-08-12-06:34:36', 'wandb_project': 'llm_tutorial', 'quantization': False, 'use_freeze_layers': False, 'freeze_layers': None, 'bf16': True, 'fp16': False, 'mixed_precision': True, 'param_dtype': None, 'load': '/work/llm_recipes/models/yans-sample-gemma-2-2b', 'save': '/work/llm_recipes/models/yans-sample-gemma-2-2b', 'base_model': '/share/pretrained_lm/google/gemma-2-2b', 'use_better_transformer': False, 'grad_clip_norm': 1.0, 'eval_interval': 3, 'save_interval': 3, 'eval_iters': 10, 'optimizer': 'anyprecision', 'lr': 2e-05, 'lr_decay_style': 'cosine', 'lr_decay_iters': 20000, 'lr_warmup_iters': 500, 'min_lr': 1e-06, 'train_iters': 20000, 'train_samples': None, 'global_batch_size': 320, 'micro_batch_size': 1, 'make_vocab_size_divisible_by': 128, 'sliding_window_size': 4096, 'skip_batch': None, 'no_save_optimizer_state': False, 'continual_pretraining': False, 'instruction_tuning': False, 'direct_preference_optimization': False, 'attention_dropout': 0.1, 'hidden_dropout': 0.1, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-06, 'hf_transformer_model_dir': None, 'instruction_train_data_path': None, 'instruction_valid_data_path': None, 'epoch': None, 'instruction_dataset_size': None, 'save_sampler_state': False, 'label_smoothing': 0.0, 'save_n_checkpoints': 10, 'hf_repo_id': 'koichi12/yans-sample-gemma-2-2b', 'create_public_hf_repo': False, 'upload_all_checkpoints_to_hf': False, 'hf_upload_retry_limit': 2, 'exit_duration_in_mins': None, 'source_key': None, 'target_key': None, 'attn_implementation': 'flash_attention_2', 'efficient_instruction_tuning': False, 'remove_padding_masking': False, 'save_start_iter': None, 'rank': 0, 'world_size': 1, 'padded_vocab_size': 256000, 'gradient_accumulation_steps': 320}
+ 2024-08-12 06:34:47,353 INFO MainThread:13030 [wandb_init.py:init():616] starting backend
+ 2024-08-12 06:34:47,353 INFO MainThread:13030 [wandb_init.py:init():620] setting up manager
+ 2024-08-12 06:34:47,357 INFO MainThread:13030 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+ 2024-08-12 06:34:47,358 INFO MainThread:13030 [wandb_init.py:init():628] backend started and connected
+ 2024-08-12 06:34:47,363 INFO MainThread:13030 [wandb_init.py:init():720] updated telemetry
+ 2024-08-12 06:34:47,374 INFO MainThread:13030 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
+ 2024-08-12 06:34:47,834 INFO MainThread:13030 [wandb_run.py:_on_init():2262] communicating current version
+ 2024-08-12 06:34:47,915 INFO MainThread:13030 [wandb_run.py:_on_init():2271] got version response upgrade_message: "wandb version 0.17.6 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
+
+ 2024-08-12 06:34:47,915 INFO MainThread:13030 [wandb_init.py:init():804] starting run threads in backend
+ 2024-08-12 06:34:47,973 INFO MainThread:13030 [wandb_run.py:_console_start():2241] atexit reg
+ 2024-08-12 06:34:47,973 INFO MainThread:13030 [wandb_run.py:_redirect():2096] redirect: wrap_raw
+ 2024-08-12 06:34:47,973 INFO MainThread:13030 [wandb_run.py:_redirect():2161] Wrapping output streams.
+ 2024-08-12 06:34:47,974 INFO MainThread:13030 [wandb_run.py:_redirect():2186] Redirects installed.
+ 2024-08-12 06:34:47,975 INFO MainThread:13030 [wandb_init.py:init():847] run started, returning control to user process
+ 2024-08-12 06:36:09,914 INFO MainThread:13030 [wandb_run.py:_config_callback():1343] config_cb None None {'model_architecture': 'Gemma2ForCausalLM', 'activation_function': 'gelu_pytorch_tanh', 'hidden_size': 2304, 'model_type': 'gemma2', 'max_position_embeddings': 1021, 'num_attention_heads': 8, 'num_hidden_layers': 26}
+ 2024-08-12 06:36:09,915 INFO MainThread:13030 [wandb_run.py:_config_callback():1343] config_cb None None {'world_size': 1}
+ 2024-08-12 06:40:50,612 WARNING MsgRouterThr:13030 [router.py:message_loop():77] message_loop has been closed
wandb/run-20240812_063447-whqmtxyq/run-whqmtxyq.wandb ADDED
Binary file (42.3 kB). View file
wandb/run-20240815_031216-0szn78ph/files/config.yaml ADDED
@@ -0,0 +1,335 @@
1
+ wandb_version: 1
2
+
3
+ sharding_strategy:
4
+ desc: null
5
+ value: FULL_SHARD
6
+ checkpoint_type:
7
+ desc: null
8
+ value: LOCAL_STATE_DICT
9
+ fsdp_activation_checkpointing:
10
+ desc: null
11
+ value: true
12
+ fsdp_cpu_offload:
13
+ desc: null
14
+ value: false
15
+ low_cpu_fsdp:
16
+ desc: null
17
+ value: false
18
+ no_meta_device:
19
+ desc: null
20
+ value: false
21
+ data_path:
22
+ desc: null
23
+ value: null
24
+ split:
25
+ desc: null
26
+ value: 969, 30, 1
27
+ train_data_path:
28
+ desc: null
29
+ value:
30
+ - '304771887'
31
+ - /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document
32
+ valid_data_path:
33
+ desc: null
34
+ value:
35
+ - '304771887'
36
+ - /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document
37
+ test_data_path:
38
+ desc: null
39
+ value:
40
+ - '304771887'
41
+ - /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document
42
+ data_cache_path:
43
+ desc: null
44
+ value: null
45
+ vocab_size:
46
+ desc: null
47
+ value: null
48
+ vocab_file:
49
+ desc: null
50
+ value: null
51
+ merge_file:
52
+ desc: null
53
+ value: null
54
+ seq_length:
55
+ desc: null
56
+ value: 4096
57
+ num_workers:
58
+ desc: null
59
+ value: 2
60
+ tokenizer_type:
61
+ desc: null
62
+ value: HFPreTrainedTokenizer
63
+ tokenizer_model:
64
+ desc: null
65
+ value: /share/pretrained_lm/Qwen/Qwen2-0.5B
66
+ reset_position_ids:
67
+ desc: null
68
+ value: false
69
+ reset_attention_mask:
70
+ desc: null
71
+ value: false
72
+ eod_mask_loss:
73
+ desc: null
74
+ value: false
75
+ retro_return_doc_ids:
76
+ desc: null
77
+ value: false
78
+ short_seq_prob:
79
+ desc: null
80
+ value: 0.1
81
+ vocab_extra_ids:
82
+ desc: null
83
+ value: 0
84
+ seed:
85
+ desc: null
86
+ value: 1234
87
+ use_mpi:
88
+ desc: null
89
+ value: false
90
+ wandb_entity:
91
+ desc: null
92
+ value: iwakawa-koichi-q5-tohoku-nlp6723
93
+ wandb_name:
94
+ desc: null
95
+ value: yans-qwen2-0.5B_train_2024-08-15-03:11:59
96
+ wandb_project:
97
+ desc: null
98
+ value: llm_tutorial
99
+ quantization:
100
+ desc: null
101
+ value: false
102
+ use_freeze_layers:
103
+ desc: null
104
+ value: false
105
+ freeze_layers:
106
+ desc: null
107
+ value: null
108
+ bf16:
109
+ desc: null
110
+ value: true
111
+ fp16:
112
+ desc: null
113
+ value: false
114
+ mixed_precision:
115
+ desc: null
116
+ value: true
117
+ param_dtype:
118
+ desc: null
119
+ value: null
120
+ load:
121
+ desc: null
122
+ value: /work/llm_recipes/models/yans-qwen2-0.5B
123
+ save:
124
+ desc: null
125
+ value: /work/llm_recipes/models/yans-qwen2-0.5B
126
+ base_model:
127
+ desc: null
128
+ value: /share/pretrained_lm/Qwen/Qwen2-0.5B
129
+ use_better_transformer:
130
+ desc: null
131
+ value: false
132
+ grad_clip_norm:
133
+ desc: null
134
+ value: 1.0
135
+ eval_interval:
136
+ desc: null
137
+ value: 10
138
+ save_interval:
139
+ desc: null
140
+ value: 10
141
+ eval_iters:
142
+ desc: null
143
+ value: 10
144
+ optimizer:
145
+ desc: null
146
+ value: adam
147
+ lr:
148
+ desc: null
149
+ value: 2.0e-05
150
+ lr_decay_style:
151
+ desc: null
152
+ value: cosine
153
+ lr_decay_iters:
154
+ desc: null
155
+ value: 20000
156
+ lr_warmup_iters:
157
+ desc: null
158
+ value: 500
159
+ min_lr:
160
+ desc: null
161
+ value: 1.0e-06
162
+ train_iters:
163
+ desc: null
164
+ value: 20000
165
+ train_samples:
166
+ desc: null
167
+ value: null
168
+ global_batch_size:
169
+ desc: null
170
+ value: 320
171
+ micro_batch_size:
172
+ desc: null
173
+ value: 1
174
+ make_vocab_size_divisible_by:
175
+ desc: null
176
+ value: 128
177
+ sliding_window_size:
178
+ desc: null
179
+ value: 4096
180
+ skip_batch:
181
+ desc: null
182
+ value: null
183
+ no_save_optimizer_state:
184
+ desc: null
185
+ value: false
186
+ continual_pretraining:
187
+ desc: null
188
+ value: false
189
+ instruction_tuning:
190
+ desc: null
191
+ value: false
192
+ direct_preference_optimization:
193
+ desc: null
194
+ value: false
195
+ attention_dropout:
196
+ desc: null
197
+ value: 0.1
198
+ hidden_dropout:
199
+ desc: null
200
+ value: 0.1
201
+ weight_decay:
202
+ desc: null
203
+ value: 0.1
204
+ adam_beta1:
205
+ desc: null
206
+ value: 0.9
207
+ adam_beta2:
208
+ desc: null
209
+ value: 0.95
210
+ adam_eps:
211
+ desc: null
212
+ value: 1.0e-06
213
+ hf_transformer_model_dir:
214
+ desc: null
215
+ value: null
216
+ instruction_train_data_path:
217
+ desc: null
218
+ value: null
219
+ instruction_valid_data_path:
220
+ desc: null
221
+ value: null
222
+ epoch:
223
+ desc: null
224
+ value: null
225
+ instruction_dataset_size:
226
+ desc: null
227
+ value: null
228
+ save_sampler_state:
229
+ desc: null
230
+ value: false
231
+ label_smoothing:
232
+ desc: null
233
+ value: 0.0
234
+ save_n_checkpoints:
235
+ desc: null
236
+ value: 10
237
+ hf_repo_id:
238
+ desc: null
239
+ value: koichi12/yans-qwen2-0.5B
240
+ create_public_hf_repo:
241
+ desc: null
242
+ value: false
243
+ upload_all_checkpoints_to_hf:
244
+ desc: null
245
+ value: false
246
+ hf_upload_retry_limit:
247
+ desc: null
248
+ value: 2
249
+ exit_duration_in_mins:
250
+ desc: null
251
+ value: null
252
+ source_key:
253
+ desc: null
254
+ value: null
255
+ target_key:
256
+ desc: null
257
+ value: null
258
+ attn_implementation:
259
+ desc: null
260
+ value: flash_attention_2
261
+ efficient_instruction_tuning:
262
+ desc: null
263
+ value: false
264
+ remove_padding_masking:
265
+ desc: null
266
+ value: false
267
+ save_start_iter:
268
+ desc: null
269
+ value: null
270
+ rank:
271
+ desc: null
272
+ value: 0
273
+ world_size:
274
+ desc: null
275
+ value: 1
276
+ padded_vocab_size:
277
+ desc: null
278
+ value: 151680
279
+ gradient_accumulation_steps:
280
+ desc: null
281
+ value: 320
282
+ _wandb:
283
+ desc: null
284
+ value:
285
+ python_version: 3.10.12
286
+ cli_version: 0.16.3
287
+ framework: huggingface
288
+ huggingface_version: 4.43.3
289
+ is_jupyter_run: false
290
+ is_kaggle_kernel: false
291
+ start_time: 1723659136.24386
292
+ t:
293
+ 1:
294
+ - 1
295
+ - 11
296
+ - 49
297
+ - 55
298
+ - 71
299
+ 2:
300
+ - 1
301
+ - 11
302
+ - 49
303
+ - 55
304
+ - 71
305
+ 3:
306
+ - 13
307
+ - 16
308
+ - 23
309
+ 4: 3.10.12
310
+ 5: 0.16.3
311
+ 6: 4.43.3
312
+ 8:
313
+ - 5
314
+ 13: linux-x86_64
315
+ model_architecture:
316
+ desc: null
317
+ value: Qwen2ForCausalLM
318
+ activation_function:
319
+ desc: null
320
+ value: silu
321
+ hidden_size:
322
+ desc: null
323
+ value: 896
324
+ model_type:
325
+ desc: null
326
+ value: qwen2
327
+ max_position_embeddings:
328
+ desc: null
329
+ value: 4096
330
+ num_attention_heads:
331
+ desc: null
332
+ value: 14
333
+ num_hidden_layers:
334
+ desc: null
335
+ value: 24
wandb/run-20240815_031216-0szn78ph/files/output.log ADDED
@@ -0,0 +1,92 @@
1
+ Created Hugging Face repository with ID koichi12/yans-qwen2-0.5B.
2
+ Clearing GPU cache for all ranks
3
+ --> Running with torch torch_distributed debug set to detail
4
+ Loading model state dict from /work/llm_recipes/models/yans-qwen2-0.5B/iter_0001160/model.pt
5
+ You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
6
+ Loaded model state dict from /work/llm_recipes/models/yans-qwen2-0.5B/iter_0001160/model.pt
7
+ --> Model /share/pretrained_lm/Qwen/Qwen2-0.5B
8
+ --> /share/pretrained_lm/Qwen/Qwen2-0.5B has 494.032768 Million params
9
+ BFloat16 enabled for mixed precision - using bfSixteen policy
10
+ /usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_init_utils.py:441: UserWarning: FSDP is switching to use `NO_SHARD` instead of ShardingStrategy.FULL_SHARD since the world size is 1.
11
+ warnings.warn(
12
+ --> applying fsdp activation checkpointing...
13
+ > datasets target sizes (minimum size):
14
+ train: 6400000
15
+ validation: 6403200
16
+ test: 3200
17
+ > building train, validation, and test datasets for GPT ...
18
+ > finished creating GPT datasets ...
19
+ Loading optimizer state dict from /work/llm_recipes/models/yans-qwen2-0.5B/iter_0001160/optimizer.pt
20
+ Let split = None
21
+ Building a BlendedDataset for a single MegatronDataset
22
+ Unable to save the indexes because path_to_cache is None
23
+ Building a BlendedDataset for a single MegatronDataset
24
+ Unable to save the indexes because path_to_cache is None
25
+ Building a BlendedDataset for a single MegatronDataset
26
+ Unable to save the indexes because path_to_cache is None
27
+ Loaded optimizer state dict from /work/llm_recipes/models/yans-qwen2-0.5B/iter_0001160/optimizer.pt
28
+ model info: FullyShardedDataParallel(
29
+ (_fsdp_wrapped_module): Qwen2ForCausalLM(
30
+ (model): Qwen2Model(
31
+ (embed_tokens): Embedding(151936, 896)
32
+ (layers): ModuleList(
33
+ (0-23): 24 x FullyShardedDataParallel(
34
+ (_fsdp_wrapped_module): CheckpointWrapper(
35
+ (_checkpoint_wrapped_module): Qwen2DecoderLayer(
36
+ (self_attn): Qwen2FlashAttention2(
37
+ (q_proj): Linear(in_features=896, out_features=896, bias=True)
38
+ (k_proj): Linear(in_features=896, out_features=128, bias=True)
39
+ (v_proj): Linear(in_features=896, out_features=128, bias=True)
40
+ (o_proj): Linear(in_features=896, out_features=896, bias=False)
41
+ (rotary_emb): Qwen2RotaryEmbedding()
42
+ )
43
+ (mlp): Qwen2MLP(
44
+ (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
45
+ (up_proj): Linear(in_features=896, out_features=4864, bias=False)
46
+ (down_proj): Linear(in_features=4864, out_features=896, bias=False)
47
+ (act_fn): SiLU()
48
+ )
49
+ (input_layernorm): Qwen2RMSNorm()
50
+ (post_attention_layernorm): Qwen2RMSNorm()
51
+ )
52
+ )
53
+ )
54
+ )
55
+ (norm): Qwen2RMSNorm()
56
+ )
57
+ (lm_head): Linear(in_features=896, out_features=151936, bias=False)
58
+ )
59
+ )
60
+ model config: Qwen2Config {
61
+ "_name_or_path": "/share/pretrained_lm/Qwen/Qwen2-0.5B",
62
+ "architectures": [
63
+ "Qwen2ForCausalLM"
64
+ ],
65
+ "attention_dropout": 0.0,
66
+ "bos_token_id": 151643,
67
+ "eos_token_id": 151643,
68
+ "hidden_act": "silu",
69
+ "hidden_size": 896,
70
+ "initializer_range": 0.02,
71
+ "intermediate_size": 4864,
72
+ "label_smoothing": 0.0,
73
+ "max_position_embeddings": 4096,
74
+ "max_window_layers": 24,
75
+ "model_type": "qwen2",
76
+ "num_attention_heads": 14,
77
+ "num_hidden_layers": 24,
78
+ "num_key_value_heads": 2,
79
+ "rms_norm_eps": 1e-06,
80
+ "rope_theta": 1000000.0,
81
+ "sliding_window": null,
82
+ "tie_word_embeddings": true,
83
+ "torch_dtype": "bfloat16",
84
+ "transformers_version": "4.43.3",
85
+ "use_cache": false,
86
+ "use_sliding_window": false,
87
+ "vocab_size": 151936
88
+ }
89
+ [rank0]:[2024-08-15 03:12:42,940] torch.distributed.fsdp._debug_utils: [WARNING] FSDP _flatten_optim_state_dict() profiling: defaultdict(<class 'float'>, {})
90
+ ------------------------------------------------------------------
91
+ iteration: 1161 , TFLOPS: 67.46644597716896, Tokens per sec: 16778.56616965974, Loss: 2.442603349685669
92
+ ------------------------------------------------------------------
wandb/run-20240815_031216-0szn78ph/files/requirements.txt ADDED
@@ -0,0 +1,293 @@
1
+ absl-py==2.1.0
2
+ accelerate==0.33.0
3
+ aiohttp==3.9.1
4
+ aiosignal==1.3.1
5
+ annotated-types==0.6.0
6
+ apex==0.1
7
+ appdirs==1.4.4
8
+ argon2-cffi-bindings==21.2.0
9
+ argon2-cffi==23.1.0
10
+ asttokens==2.4.1
11
+ astunparse==1.6.3
12
+ async-timeout==4.0.3
13
+ attrs==23.2.0
14
+ audioread==3.0.1
15
+ beautifulsoup4==4.12.3
16
+ bleach==6.1.0
17
+ blis==0.7.11
18
+ cachetools==5.3.2
19
+ catalogue==2.0.10
20
+ certifi==2024.2.2
21
+ cffi==1.16.0
22
+ chardet==5.2.0
23
+ charset-normalizer==3.3.2
24
+ click==8.1.7
25
+ cloudpathlib==0.16.0
26
+ cloudpickle==3.0.0
27
+ cmake==3.28.1
28
+ colorama==0.4.6
29
+ comm==0.2.1
30
+ confection==0.1.4
31
+ contourpy==1.2.0
32
+ cubinlinker==0.3.0+2.g405ac64
33
+ cuda-python==12.3.0rc4+9.gdb8c48a.dirty
34
+ cudf==23.12.0
35
+ cugraph-dgl==23.12.0
36
+ cugraph-service-client==23.12.0
37
+ cugraph-service-server==23.12.0
38
+ cugraph==23.12.0
39
+ cuml==23.12.0
40
+ cupy-cuda12x==12.3.0
41
+ cycler==0.12.1
42
+ cymem==2.0.8
43
+ cython==3.0.8
44
+ dask-cuda==23.12.0
45
+ dask-cudf==23.12.0
46
+ dask==2023.11.0
47
+ dataproperty==1.0.1
48
+ datasets==2.20.0
49
+ debugpy==1.8.1
50
+ decorator==5.1.1
51
+ defusedxml==0.7.1
52
+ dill==0.3.8
53
+ distributed==2023.11.0
54
+ dm-tree==0.1.8
55
+ docker-pycreds==0.4.0
56
+ einops==0.7.0
57
+ evaluate==0.4.2
58
+ exceptiongroup==1.2.0
59
+ execnet==2.0.2
60
+ executing==2.0.1
61
+ expecttest==0.1.3
62
+ fastjsonschema==2.19.1
63
+ fastrlock==0.8.2
64
+ filelock==3.13.1
65
+ flash-attn==2.4.2
66
+ fonttools==4.48.1
67
+ frozenlist==1.4.1
68
+ fsspec==2023.12.2
69
+ gast==0.5.4
70
+ gitdb==4.0.11
71
+ gitpython==3.1.43
72
+ google-auth-oauthlib==0.4.6
73
+ google-auth==2.27.0
74
+ graphsurgeon==0.4.6
75
+ grpcio==1.60.1
76
+ huggingface-hub==0.24.5
77
+ hypothesis==5.35.1
78
+ idna==3.6
79
+ importlib-metadata==7.0.1
80
+ iniconfig==2.0.0
81
+ intel-openmp==2021.4.0
82
+ ipadic==1.0.0
83
+ ipykernel==6.29.2
84
+ ipython-genutils==0.2.0
85
+ ipython==8.21.0
86
+ jedi==0.19.1
87
+ jinja2==3.1.3
88
+ joblib==1.3.2
89
+ json5==0.9.14
90
+ jsonlines==4.0.0
91
+ jsonnet==0.19.1
92
+ jsonschema-specifications==2023.12.1
93
+ jsonschema==4.21.1
94
+ jupyter-client==8.6.0
95
+ jupyter-core==5.7.1
96
+ jupyter-tensorboard==0.2.0
97
+ jupyterlab-pygments==0.3.0
98
+ jupyterlab-server==1.2.0
99
+ jupyterlab==2.3.2
100
+ jupytext==1.16.1
101
+ kiwisolver==1.4.5
102
+ langcodes==3.3.0
103
+ lazy-loader==0.3
104
+ librosa==0.10.1
105
+ llvmlite==0.40.1
106
+ lm-eval==0.4.3
107
+ locket==1.0.0
108
+ logzero==1.7.0
109
+ lxml==5.2.2
110
+ markdown-it-py==3.0.0
111
+ markdown==3.5.2
112
+ markupsafe==2.1.4
113
+ matplotlib-inline==0.1.6
114
+ matplotlib==3.8.2
115
+ mbstrdecoder==1.1.3
116
+ mdit-py-plugins==0.4.0
117
+ mdurl==0.1.2
118
+ mecab-python3==1.0.6
119
+ mistune==3.0.2
120
+ mkl-devel==2021.1.1
121
+ mkl-include==2021.1.1
122
+ mkl==2021.1.1
123
+ mock==5.1.0
124
+ more-itertools==9.1.0
125
+ mpmath==1.3.0
126
+ msgpack==1.0.7
127
+ multidict==6.0.4
128
+ multiprocess==0.70.16
129
+ murmurhash==1.0.10
130
+ nbclient==0.9.0
131
+ nbconvert==7.16.0
132
+ nbformat==5.9.2
133
+ nest-asyncio==1.6.0
134
+ networkx==2.6.3
135
+ ninja==1.11.1.1
136
+ nltk==3.8.1
137
+ notebook==6.4.10
138
+ numba==0.57.1+1.g1ff679645
139
+ numexpr==2.10.1
140
+ numpy==1.24.4
141
+ nvfuser==0.1.4a0+d0bb811
142
+ nvidia-dali-cuda120==1.34.0
143
+ nvidia-pyindex==1.0.9
144
+ nvtx==0.2.5
145
+ oauthlib==3.2.2
146
+ onnx==1.15.0rc2
147
+ opencv==4.7.0
148
+ optree==0.10.0
149
+ packaging==23.2
150
+ pandas==1.5.3
151
+ pandocfilters==1.5.1
152
+ parso==0.8.3
153
+ partd==1.4.1
154
+ pathvalidate==3.2.0
155
+ peft==0.11.1
156
+ pexpect==4.9.0
157
+ pillow==10.2.0
158
+ pip==24.0
159
+ platformdirs==4.2.0
160
+ pluggy==1.4.0
161
+ ply==3.11
162
+ polygraphy==0.49.4
163
+ pooch==1.8.0
164
+ portalocker==2.10.1
165
+ preshed==3.0.9
166
+ prettytable==3.9.0
167
+ prometheus-client==0.19.0
168
+ prompt-toolkit==3.0.43
169
+ protobuf==4.24.4
170
+ psutil==5.9.4
171
+ ptxcompiler==0.8.1+2.g0d406d6
172
+ ptyprocess==0.7.0
173
+ pure-eval==0.2.2
174
+ pyarrow-hotfix==0.6
175
+ pyarrow==17.0.0
176
+ pyasn1-modules==0.3.0
177
+ pyasn1==0.5.1
178
+ pybind11-global==2.11.1
179
+ pybind11==2.11.1
180
+ pycocotools==2.0+nv0.8.0
181
+ pycparser==2.21
182
+ pydantic-core==2.16.2
183
+ pydantic==2.6.1
184
+ pygments==2.17.2
185
+ pylibcugraph==23.12.0
186
+ pylibcugraphops==23.12.0
187
+ pylibraft==23.12.0
188
+ pynvml==11.4.1
189
+ pyparsing==3.1.1
190
+ pytablewriter==1.2.0
191
+ pytest-flakefinder==1.1.0
192
+ pytest-rerunfailures==13.0
193
+ pytest-shard==0.1.2
194
+ pytest-xdist==3.5.0
195
+ pytest==8.0.0
196
+ python-dateutil==2.8.2
197
+ python-dotenv==1.0.0
198
+ python-hostlist==1.23.0
199
+ pytorch-quantization==2.1.2
200
+ pytz==2023.3.post1
201
+ pyyaml==6.0.1
202
+ pyzmq==25.1.2
203
+ raft-dask==23.12.0
204
+ rapids-dask-dependency==23.12.1
205
+ referencing==0.33.0
206
+ regex==2023.12.25
207
+ requests-oauthlib==1.3.1
208
+ requests==2.32.3
209
+ rich==13.7.0
210
+ rmm==23.12.0
211
+ rouge-score==0.1.2
212
+ rpds-py==0.17.1
213
+ rsa==4.9
214
+ sacrebleu==2.4.0
215
+ safetensors==0.4.3
216
+ scikit-learn==1.2.0
217
+ scipy==1.12.0
218
+ send2trash==1.8.2
219
+ sentencepiece==0.1.99
220
+ sentry-sdk==2.12.0
221
+ setproctitle==1.3.3
222
+ setuptools==68.2.2
223
+ six==1.16.0
224
+ smart-open==6.4.0
225
+ smmap==5.0.1
226
+ sortedcontainers==2.4.0
227
+ soundfile==0.12.1
228
+ soupsieve==2.5
229
+ soxr==0.3.7
230
+ spacy-legacy==3.0.12
231
+ spacy-loggers==1.0.5
232
+ spacy==3.7.2
233
+ sphinx-glpi-theme==0.6
234
+ sqlitedict==2.1.0
235
+ srsly==2.4.8
236
+ stack-data==0.6.3
237
+ sympy==1.12
238
+ tabledata==1.3.3
239
+ tabulate==0.9.0
240
+ tbb==2021.11.0
241
+ tblib==3.0.0
242
+ tcolorpy==0.1.6
243
+ tensorboard-data-server==0.6.1
244
+ tensorboard-plugin-wit==1.8.1
245
+ tensorboard==2.9.0
246
+ tensorrt==8.6.3
247
+ terminado==0.18.0
248
+ termplotlib==0.3.9
249
+ thinc==8.2.3
250
+ threadpoolctl==3.2.0
251
+ thriftpy2==0.4.17
252
+ tinycss2==1.2.1
253
+ tokenizers==0.19.1
254
+ toml==0.10.2
255
+ tomli==2.0.1
256
+ toolz==0.12.1
257
+ torch-tensorrt==2.3.0a0
258
+ torch==2.3.0a0+ebedce2
259
+ torchdata==0.7.1a0
260
+ torchtext==0.17.0a0
261
+ torchvision==0.18.0a0
262
+ tornado==6.4
263
+ tqdm-multiprocess==0.0.11
264
+ tqdm==4.66.5
265
+ traitlets==5.9.0
266
+ transformer-engine==1.3.0+5b90b7f
267
+ transformers==4.43.3
268
+ treelite-runtime==3.9.1
269
+ treelite==3.9.1
270
+ triton==2.2.0+e28a256
271
+ typepy==1.3.2
272
+ typer==0.9.0
273
+ types-dataclasses==0.6.6
274
+ typing-extensions==4.9.0
275
+ ucx-py==0.35.0
276
+ uff==0.6.9
277
+ ujson==5.8.0
278
+ urllib3==1.26.18
279
+ wandb==0.16.3
280
+ wasabi==1.1.2
281
+ wcwidth==0.2.13
282
+ weasel==0.3.4
283
+ webencodings==0.5.1
284
+ werkzeug==3.0.1
285
+ wheel==0.42.0
286
+ word2number==1.1
287
+ xdoctest==1.0.2
288
+ xgboost==1.7.6
289
+ xxhash==3.4.1
290
+ yarl==1.9.4
291
+ zict==3.0.0
292
+ zipp==3.17.0
293
+ zstandard==0.23.0
wandb/run-20240815_031216-0szn78ph/files/wandb-metadata.json ADDED
@@ -0,0 +1,215 @@
1
+ {
2
+ "os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
3
+ "python": "3.10.12",
4
+ "heartbeatAt": "2024-08-14T18:12:16.980997",
5
+ "startedAt": "2024-08-14T18:12:16.230100",
6
+ "docker": null,
7
+ "cuda": null,
8
+ "args": [
9
+ "--seq-length",
10
+ "4096",
11
+ "--sliding-window-size",
12
+ "4096",
13
+ "--micro-batch-size",
14
+ "1",
15
+ "--global-batch-size",
16
+ "320",
17
+ "--train-iters",
18
+ "20000",
19
+ "--tokenizer-type",
20
+ "HFPreTrainedTokenizer",
21
+ "--tokenizer-model",
22
+ "/share/pretrained_lm/Qwen/Qwen2-0.5B",
23
+ "--train-data-path",
24
+ "304771887",
25
+ "/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document",
26
+ "--valid-data-path",
27
+ "304771887",
28
+ "/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document",
29
+ "--test-data-path",
30
+ "304771887",
31
+ "/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document",
32
+ "--lr",
33
+ "2e-5",
34
+ "--min-lr",
35
+ "1e-6",
36
+ "--lr-decay-style",
37
+ "cosine",
38
+ "--lr-warmup-iters",
39
+ "500",
40
+ "--lr-decay-iters",
41
+ "20000",
42
+ "--weight-decay",
43
+ "0.1",
44
+ "--grad-clip-norm",
45
+ "1.0",
46
+ "--optimizer",
47
+ "adam",
48
+ "--adam-beta1",
49
+ "0.9",
50
+ "--adam-beta2",
51
+ "0.95",
52
+ "--adam-eps",
53
+ "1e-6",
54
+ "--save-interval",
55
+ "10",
56
+ "--eval-interval",
57
+ "10",
58
+ "--eval-iters",
59
+ "10",
60
+ "--bf16",
61
+ "--mixed-precision",
62
+ "--base-model",
63
+ "/share/pretrained_lm/Qwen/Qwen2-0.5B",
64
+ "--save",
65
+ "/work/llm_recipes/models/yans-qwen2-0.5B",
66
+ "--load",
67
+ "/work/llm_recipes/models/yans-qwen2-0.5B",
68
+ "--fsdp-activation-checkpointing",
69
+ "--sharding-strategy",
70
+ "FULL_SHARD",
71
+ "--checkpoint-type",
72
+ "LOCAL_STATE_DICT",
73
+ "--save-n-checkpoints",
74
+ "10",
75
+ "--hf-upload-retry-limit",
76
+ "2",
77
+ "--hf-repo-id",
78
+ "koichi12/yans-qwen2-0.5B",
79
+ "--wandb-entity",
80
+ "iwakawa-koichi-q5-tohoku-nlp6723",
81
+ "--wandb-project",
82
+ "llm_tutorial",
83
+ "--wandb-name",
84
+ "yans-qwen2-0.5B_train_2024-08-15-03:11:59"
85
+ ],
86
+ "state": "running",
87
+ "program": "/project/examples/finetuning.py",
88
+ "codePathLocal": "examples/finetuning.py",
89
+ "codePath": "examples/finetuning.py",
90
+ "git": {
91
+ "remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
92
+ "commit": "6da01327e78c302bc0cfdb335f3ca297e2a19c8c"
93
+ },
94
+ "email": null,
95
+ "root": "/project",
96
+ "host": "gpu-koiwa-00",
97
+ "username": "koiwa",
98
+ "executable": "/usr/bin/python",
99
+ "cpu_count": 18,
100
+ "cpu_count_logical": 18,
101
+ "cpu_freq": {
102
+ "current": 2400.0389999999993,
103
+ "min": 0.0,
104
+ "max": 0.0
105
+ },
106
+ "cpu_freq_per_core": [
107
+ {
108
+ "current": 2400.039,
109
+ "min": 0.0,
110
+ "max": 0.0
111
+ },
112
+ {
113
+ "current": 2400.039,
114
+ "min": 0.0,
115
+ "max": 0.0
116
+ },
117
+ {
118
+ "current": 2400.039,
119
+ "min": 0.0,
120
+ "max": 0.0
121
+ },
122
+ {
123
+ "current": 2400.039,
124
+ "min": 0.0,
125
+ "max": 0.0
126
+ },
127
+ {
128
+ "current": 2400.039,
129
+ "min": 0.0,
130
+ "max": 0.0
131
+ },
132
+ {
133
+ "current": 2400.039,
134
+ "min": 0.0,
135
+ "max": 0.0
136
+ },
137
+ {
138
+ "current": 2400.039,
139
+ "min": 0.0,
140
+ "max": 0.0
141
+ },
142
+ {
143
+ "current": 2400.039,
144
+ "min": 0.0,
145
+ "max": 0.0
146
+ },
147
+ {
148
+ "current": 2400.039,
149
+ "min": 0.0,
150
+ "max": 0.0
151
+ },
152
+ {
153
+ "current": 2400.039,
154
+ "min": 0.0,
155
+ "max": 0.0
156
+ },
157
+ {
158
+ "current": 2400.039,
159
+ "min": 0.0,
160
+ "max": 0.0
161
+ },
162
+ {
163
+ "current": 2400.039,
164
+ "min": 0.0,
165
+ "max": 0.0
166
+ },
167
+ {
168
+ "current": 2400.039,
169
+ "min": 0.0,
170
+ "max": 0.0
171
+ },
172
+ {
173
+ "current": 2400.039,
174
+ "min": 0.0,
175
+ "max": 0.0
176
+ },
177
+ {
178
+ "current": 2400.039,
179
+ "min": 0.0,
180
+ "max": 0.0
181
+ },
182
+ {
183
+ "current": 2400.039,
184
+ "min": 0.0,
185
+ "max": 0.0
186
+ },
187
+ {
188
+ "current": 2400.039,
189
+ "min": 0.0,
190
+ "max": 0.0
191
+ },
192
+ {
193
+ "current": 2400.039,
194
+ "min": 0.0,
195
+ "max": 0.0
196
+ }
197
+ ],
198
+ "disk": {
199
+ "/": {
200
+ "total": 0.0625,
201
+ "used": 1.1444091796875e-05
202
+ }
203
+ },
204
+ "gpu": "NVIDIA A100-SXM4-40GB",
205
+ "gpu_count": 1,
206
+ "gpu_devices": [
207
+ {
208
+ "name": "NVIDIA A100-SXM4-40GB",
209
+ "memory_total": 42949672960
210
+ }
211
+ ],
212
+ "memory": {
213
+ "total": 56.48782730102539
214
+ }
215
+ }
wandb/run-20240815_031216-0szn78ph/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
1
+ {"_wandb": {"runtime": 168}, "training/loss": 2.442603349685669, "training/perplexity": 11.502947992429535, "utils/batch_size": 1, "utils/global_batch_size": 320, "utils/seq_len": 4097, "utils/gradient_accumulation_steps": 320, "utils/iteration": 1161, "optimizer/lr": 1.9946184158325198e-05, "optimizer/variance_l2": 0.004682497095771901, "optimizer/variance_sqrt_l2": 0.5343142380105511, "optimizer/momentum_l2": 0.12459250428605805, "optimizer/weight_l2": 825.0639369164065, "optimizer/variance_l1": 0.2849578857421875, "optimizer/variance_sqrt_l1": 4625.0, "optimizer/momentum_l1": 977.875, "optimizer/weight_l1": 6918144.0, "optimizer/variance_abs_max": 0.0030059814453125, "optimizer/variance_sqrt_abs_max": 0.054931640625, "optimizer/momentum_abs_max": 0.0108642578125, "optimizer/weight_abs_max": 175.0, "stats/1_iteration_time": 78.13778523999986, "stats/tokens_per_sec": 16778.56616965974, "stats/tokens_per_sec_per_gpu": 16778.56616965974, "stats/tflops": 67.46644597716896, "_timestamp": 1723659241.8232834, "_runtime": 105.57942342758179, "_step": 1161}
wandb/run-20240815_031216-0szn78ph/logs/debug-internal.log ADDED
@@ -0,0 +1,260 @@
1
+ 2024-08-15 03:12:16,244 INFO StreamThr :10026 [internal.py:wandb_internal():86] W&B internal server running at pid: 10026, started at: 2024-08-15 03:12:16.243481
2
+ 2024-08-15 03:12:16,245 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status
3
+ 2024-08-15 03:12:16,248 INFO WriterThread:10026 [datastore.py:open_for_write():87] open: /project/wandb/run-20240815_031216-0szn78ph/run-0szn78ph.wandb
4
+ 2024-08-15 03:12:16,249 DEBUG SenderThread:10026 [sender.py:send():382] send: header
5
+ 2024-08-15 03:12:16,409 DEBUG SenderThread:10026 [sender.py:send():382] send: run
6
+ 2024-08-15 03:12:16,887 INFO SenderThread:10026 [dir_watcher.py:__init__():211] watching files in: /project/wandb/run-20240815_031216-0szn78ph/files
7
+ 2024-08-15 03:12:16,887 INFO SenderThread:10026 [sender.py:_start_run_threads():1136] run started: 0szn78ph with start time 1723659136.24386
8
+ 2024-08-15 03:12:16,892 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: check_version
9
+ 2024-08-15 03:12:16,892 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: check_version
10
+ 2024-08-15 03:12:16,962 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: run_start
11
+ 2024-08-15 03:12:16,969 DEBUG HandlerThread:10026 [system_info.py:__init__():27] System info init
12
+ 2024-08-15 03:12:16,969 DEBUG HandlerThread:10026 [system_info.py:__init__():42] System info init done
13
+ 2024-08-15 03:12:16,969 INFO HandlerThread:10026 [system_monitor.py:start():194] Starting system monitor
14
+ 2024-08-15 03:12:16,969 INFO SystemMonitor:10026 [system_monitor.py:_start():158] Starting system asset monitoring threads
15
+ 2024-08-15 03:12:16,969 INFO HandlerThread:10026 [system_monitor.py:probe():214] Collecting system info
16
+ 2024-08-15 03:12:16,969 INFO SystemMonitor:10026 [interfaces.py:start():190] Started cpu monitoring
17
+ 2024-08-15 03:12:16,970 INFO SystemMonitor:10026 [interfaces.py:start():190] Started disk monitoring
18
+ 2024-08-15 03:12:16,971 INFO SystemMonitor:10026 [interfaces.py:start():190] Started gpu monitoring
19
+ 2024-08-15 03:12:16,972 INFO SystemMonitor:10026 [interfaces.py:start():190] Started memory monitoring
20
+ 2024-08-15 03:12:16,972 INFO SystemMonitor:10026 [interfaces.py:start():190] Started network monitoring
21
+ 2024-08-15 03:12:16,980 DEBUG HandlerThread:10026 [system_info.py:probe():151] Probing system
22
+ 2024-08-15 03:12:16,983 DEBUG HandlerThread:10026 [system_info.py:_probe_git():136] Probing git
23
+ 2024-08-15 03:12:16,995 DEBUG HandlerThread:10026 [system_info.py:_probe_git():144] Probing git done
24
+ 2024-08-15 03:12:16,995 DEBUG HandlerThread:10026 [system_info.py:probe():199] Probing system done
25
+ 2024-08-15 03:12:16,995 DEBUG HandlerThread:10026 [system_monitor.py:probe():223] {'os': 'Linux-5.15.0-91-generic-x86_64-with-glibc2.35', 'python': '3.10.12', 'heartbeatAt': '2024-08-14T18:12:16.980997', 'startedAt': '2024-08-14T18:12:16.230100', 'docker': None, 'cuda': None, 'args': ('--seq-length', '4096', '--sliding-window-size', '4096', '--micro-batch-size', '1', '--global-batch-size', '320', '--train-iters', '20000', '--tokenizer-type', 'HFPreTrainedTokenizer', '--tokenizer-model', '/share/pretrained_lm/Qwen/Qwen2-0.5B', '--train-data-path', '304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document', '--valid-data-path', '304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document', '--test-data-path', '304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document', '--lr', '2e-5', '--min-lr', '1e-6', '--lr-decay-style', 'cosine', '--lr-warmup-iters', '500', '--lr-decay-iters', '20000', '--weight-decay', '0.1', '--grad-clip-norm', '1.0', '--optimizer', 'adam', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-6', '--save-interval', '10', '--eval-interval', '10', '--eval-iters', '10', '--bf16', '--mixed-precision', '--base-model', '/share/pretrained_lm/Qwen/Qwen2-0.5B', '--save', '/work/llm_recipes/models/yans-qwen2-0.5B', '--load', '/work/llm_recipes/models/yans-qwen2-0.5B', '--fsdp-activation-checkpointing', '--sharding-strategy', 'FULL_SHARD', '--checkpoint-type', 'LOCAL_STATE_DICT', '--save-n-checkpoints', '10', '--hf-upload-retry-limit', '2', '--hf-repo-id', 'koichi12/yans-qwen2-0.5B', '--wandb-entity', 'iwakawa-koichi-q5-tohoku-nlp6723', '--wandb-project', 'llm_tutorial', '--wandb-name', 'yans-qwen2-0.5B_train_2024-08-15-03:11:59'), 'state': 'running', 'program': '/project/examples/finetuning.py', 'codePathLocal': 'examples/finetuning.py', 'codePath': 'examples/finetuning.py', 'git': {'remote': 'https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git', 'commit': '6da01327e78c302bc0cfdb335f3ca297e2a19c8c'}, 'email': None, 'root': '/project', 'host': 'gpu-koiwa-00', 'username': 'koiwa', 'executable': '/usr/bin/python', 'cpu_count': 18, 'cpu_count_logical': 18, 'cpu_freq': {'current': 2400.0389999999993, 'min': 0.0, 'max': 0.0}, 'cpu_freq_per_core': [{'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}], 'disk': {'/': {'total': 0.0625, 'used': 1.1444091796875e-05}}, 'gpu': 'NVIDIA A100-SXM4-40GB', 'gpu_count': 1, 'gpu_devices': [{'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}], 'memory': {'total': 56.48782730102539}}
26
+ 2024-08-15 03:12:16,995 INFO HandlerThread:10026 [system_monitor.py:probe():224] Finished collecting system info
27
+ 2024-08-15 03:12:16,995 INFO HandlerThread:10026 [system_monitor.py:probe():227] Publishing system info
28
+ 2024-08-15 03:12:16,997 INFO HandlerThread:10026 [system_monitor.py:probe():229] Finished publishing system info
29
+ 2024-08-15 03:12:17,023 DEBUG SenderThread:10026 [sender.py:send():382] send: files
30
+ 2024-08-15 03:12:17,024 INFO SenderThread:10026 [sender.py:_save_file():1403] saving file wandb-metadata.json with policy now
31
+ 2024-08-15 03:12:17,033 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: python_packages
32
+ 2024-08-15 03:12:17,034 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: internal_messages
33
+ 2024-08-15 03:12:17,034 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: python_packages
34
+ 2024-08-15 03:12:17,034 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: stop_status
35
+ 2024-08-15 03:12:17,036 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: stop_status
36
+ 2024-08-15 03:12:17,320 DEBUG SenderThread:10026 [sender.py:send():382] send: telemetry
37
+ 2024-08-15 03:12:17,786 INFO wandb-upload_0:10026 [upload_job.py:push():131] Uploaded file /tmp/tmp2lpzau9swandb/2fbn8bzg-wandb-metadata.json
38
+ 2024-08-15 03:12:17,889 INFO Thread-12 :10026 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240815_031216-0szn78ph/files/output.log
39
+ 2024-08-15 03:12:17,889 INFO Thread-12 :10026 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240815_031216-0szn78ph/files/wandb-metadata.json
40
+ 2024-08-15 03:12:17,889 INFO Thread-12 :10026 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240815_031216-0szn78ph/files/requirements.txt
41
+ 2024-08-15 03:12:19,889 INFO Thread-12 :10026 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240815_031216-0szn78ph/files/output.log
42
+ 2024-08-15 03:12:21,867 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
43
+ 2024-08-15 03:12:21,890 INFO Thread-12 :10026 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240815_031216-0szn78ph/files/output.log
44
+ 2024-08-15 03:12:22,891 INFO Thread-12 :10026 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240815_031216-0szn78ph/files/output.log
45
+ 2024-08-15 03:12:26,867 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
46
+ 2024-08-15 03:12:31,868 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
47
+ 2024-08-15 03:12:32,032 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: stop_status
48
+ 2024-08-15 03:12:32,033 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: stop_status
49
+ 2024-08-15 03:12:32,033 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: internal_messages
50
+ 2024-08-15 03:12:37,282 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
51
+ 2024-08-15 03:12:37,900 INFO Thread-12 :10026 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240815_031216-0szn78ph/files/output.log
52
+ 2024-08-15 03:12:38,901 INFO Thread-12 :10026 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240815_031216-0szn78ph/files/output.log
53
+ 2024-08-15 03:12:39,901 INFO Thread-12 :10026 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240815_031216-0szn78ph/files/output.log
54
+ 2024-08-15 03:12:40,902 INFO Thread-12 :10026 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240815_031216-0szn78ph/files/output.log
55
+ 2024-08-15 03:12:42,647 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
56
+ 2024-08-15 03:12:43,260 DEBUG SenderThread:10026 [sender.py:send():382] send: config
57
+ 2024-08-15 03:12:43,261 DEBUG SenderThread:10026 [sender.py:send():382] send: config
58
+ 2024-08-15 03:12:43,904 INFO Thread-12 :10026 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240815_031216-0szn78ph/files/output.log
59
+ 2024-08-15 03:12:44,904 INFO Thread-12 :10026 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240815_031216-0szn78ph/files/output.log
60
+ 2024-08-15 03:12:47,034 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: internal_messages
61
+ 2024-08-15 03:12:47,034 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: stop_status
62
+ 2024-08-15 03:12:47,035 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: stop_status
63
+ 2024-08-15 03:12:48,218 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
64
+ 2024-08-15 03:12:48,907 INFO Thread-12 :10026 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240815_031216-0szn78ph/files/config.yaml
65
+ 2024-08-15 03:12:53,411 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
66
+ 2024-08-15 03:12:58,411 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
67
+ 2024-08-15 03:13:02,035 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: internal_messages
68
+ 2024-08-15 03:13:02,035 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: stop_status
69
+ 2024-08-15 03:13:02,035 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: stop_status
70
+ 2024-08-15 03:13:04,284 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
71
+ 2024-08-15 03:13:09,285 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
72
+ 2024-08-15 03:13:14,285 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
73
+ 2024-08-15 03:13:16,973 DEBUG SystemMonitor:10026 [system_monitor.py:_start():172] Starting system metrics aggregation loop
74
+ 2024-08-15 03:13:16,974 DEBUG SenderThread:10026 [sender.py:send():382] send: stats
75
+ 2024-08-15 03:13:17,034 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: stop_status
76
+ 2024-08-15 03:13:17,034 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: stop_status
77
+ 2024-08-15 03:13:17,078 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: internal_messages
78
+ 2024-08-15 03:13:19,286 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
79
+ 2024-08-15 03:13:24,287 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
80
+ 2024-08-15 03:13:29,288 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
81
+ 2024-08-15 03:13:32,034 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: stop_status
82
+ 2024-08-15 03:13:32,034 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: stop_status
83
+ 2024-08-15 03:13:32,078 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: internal_messages
84
+ 2024-08-15 03:13:35,214 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
85
+ 2024-08-15 03:13:40,215 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
86
+ 2024-08-15 03:13:45,216 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
87
+ 2024-08-15 03:13:46,975 DEBUG SenderThread:10026 [sender.py:send():382] send: stats
88
+ 2024-08-15 03:13:47,034 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: internal_messages
89
+ 2024-08-15 03:13:47,034 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: stop_status
90
+ 2024-08-15 03:13:47,035 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: stop_status
91
+ 2024-08-15 03:13:50,291 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
92
+ 2024-08-15 03:13:55,292 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
93
+ 2024-08-15 03:14:00,292 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
94
+ 2024-08-15 03:14:01,824 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: partial_history
95
+ 2024-08-15 03:14:01,949 INFO Thread-12 :10026 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240815_031216-0szn78ph/files/output.log
96
+ 2024-08-15 03:14:02,034 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: internal_messages
97
+ 2024-08-15 03:14:02,034 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: stop_status
98
+ 2024-08-15 03:14:02,035 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: stop_status
99
+ 2024-08-15 03:14:06,235 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
100
+ 2024-08-15 03:14:11,236 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
101
+ 2024-08-15 03:14:16,236 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
102
+ 2024-08-15 03:14:16,976 DEBUG SenderThread:10026 [sender.py:send():382] send: stats
103
+ 2024-08-15 03:14:17,034 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: internal_messages
104
+ 2024-08-15 03:14:17,034 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: stop_status
105
+ 2024-08-15 03:14:17,035 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: stop_status
106
+ 2024-08-15 03:14:21,267 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
107
+ 2024-08-15 03:14:26,267 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
108
+ 2024-08-15 03:14:31,268 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
109
+ 2024-08-15 03:14:32,034 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: internal_messages
110
+ 2024-08-15 03:14:32,035 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: stop_status
111
+ 2024-08-15 03:14:32,035 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: stop_status
112
+ 2024-08-15 03:14:37,220 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
113
+ 2024-08-15 03:14:42,221 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
114
+ 2024-08-15 03:14:46,977 DEBUG SenderThread:10026 [sender.py:send():382] send: stats
115
+ 2024-08-15 03:14:47,034 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: internal_messages
116
+ 2024-08-15 03:14:47,035 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: stop_status
117
+ 2024-08-15 03:14:47,035 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: stop_status
118
+ 2024-08-15 03:14:48,220 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
119
+ 2024-08-15 03:14:53,220 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
120
+ 2024-08-15 03:14:58,221 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
121
+ 2024-08-15 03:15:02,034 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: internal_messages
122
+ 2024-08-15 03:15:02,035 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: stop_status
123
+ 2024-08-15 03:15:02,035 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: stop_status
124
+ 2024-08-15 03:15:03,261 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
125
+ 2024-08-15 03:15:05,634 DEBUG SenderThread:10026 [sender.py:send():382] send: exit
126
+ 2024-08-15 03:15:05,634 INFO SenderThread:10026 [sender.py:send_exit():589] handling exit code: 255
127
+ 2024-08-15 03:15:05,634 INFO SenderThread:10026 [sender.py:send_exit():591] handling runtime: 168
128
+ 2024-08-15 03:15:05,636 INFO SenderThread:10026 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
129
+ 2024-08-15 03:15:05,636 INFO SenderThread:10026 [sender.py:send_exit():597] send defer
130
+ 2024-08-15 03:15:05,636 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: defer
131
+ 2024-08-15 03:15:05,636 INFO HandlerThread:10026 [handler.py:handle_request_defer():172] handle defer: 0
132
+ 2024-08-15 03:15:05,636 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: defer
133
+ 2024-08-15 03:15:05,636 INFO SenderThread:10026 [sender.py:send_request_defer():613] handle sender defer: 0
134
+ 2024-08-15 03:15:05,636 INFO SenderThread:10026 [sender.py:transition_state():617] send defer: 1
135
+ 2024-08-15 03:15:05,636 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: defer
136
+ 2024-08-15 03:15:05,637 INFO HandlerThread:10026 [handler.py:handle_request_defer():172] handle defer: 1
137
+ 2024-08-15 03:15:05,637 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: defer
138
+ 2024-08-15 03:15:05,637 INFO SenderThread:10026 [sender.py:send_request_defer():613] handle sender defer: 1
139
+ 2024-08-15 03:15:05,637 INFO SenderThread:10026 [sender.py:transition_state():617] send defer: 2
140
+ 2024-08-15 03:15:05,637 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: defer
141
+ 2024-08-15 03:15:05,637 INFO HandlerThread:10026 [handler.py:handle_request_defer():172] handle defer: 2
142
+ 2024-08-15 03:15:05,637 INFO HandlerThread:10026 [system_monitor.py:finish():203] Stopping system monitor
143
+ 2024-08-15 03:15:05,637 DEBUG SystemMonitor:10026 [system_monitor.py:_start():179] Finished system metrics aggregation loop
144
+ 2024-08-15 03:15:05,637 DEBUG SystemMonitor:10026 [system_monitor.py:_start():183] Publishing last batch of metrics
145
+ 2024-08-15 03:15:05,637 INFO HandlerThread:10026 [interfaces.py:finish():202] Joined cpu monitor
146
+ 2024-08-15 03:15:05,639 INFO HandlerThread:10026 [interfaces.py:finish():202] Joined disk monitor
147
+ 2024-08-15 03:15:05,671 INFO HandlerThread:10026 [interfaces.py:finish():202] Joined gpu monitor
148
+ 2024-08-15 03:15:05,672 INFO HandlerThread:10026 [interfaces.py:finish():202] Joined memory monitor
149
+ 2024-08-15 03:15:05,672 INFO HandlerThread:10026 [interfaces.py:finish():202] Joined network monitor
150
+ 2024-08-15 03:15:05,672 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: defer
151
+ 2024-08-15 03:15:05,672 INFO SenderThread:10026 [sender.py:send_request_defer():613] handle sender defer: 2
152
+ 2024-08-15 03:15:05,672 INFO SenderThread:10026 [sender.py:transition_state():617] send defer: 3
153
+ 2024-08-15 03:15:05,672 DEBUG SenderThread:10026 [sender.py:send():382] send: stats
154
+ 2024-08-15 03:15:05,673 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: defer
155
+ 2024-08-15 03:15:05,673 INFO HandlerThread:10026 [handler.py:handle_request_defer():172] handle defer: 3
156
+ 2024-08-15 03:15:05,676 DEBUG SenderThread:10026 [sender.py:send():382] send: history
157
+ 2024-08-15 03:15:05,676 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: summary_record
158
+ 2024-08-15 03:15:05,677 INFO SenderThread:10026 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
159
+ 2024-08-15 03:15:05,677 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: defer
160
+ 2024-08-15 03:15:05,677 INFO SenderThread:10026 [sender.py:send_request_defer():613] handle sender defer: 3
161
+ 2024-08-15 03:15:05,677 INFO SenderThread:10026 [sender.py:transition_state():617] send defer: 4
162
+ 2024-08-15 03:15:05,677 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: defer
163
+ 2024-08-15 03:15:05,677 INFO HandlerThread:10026 [handler.py:handle_request_defer():172] handle defer: 4
164
+ 2024-08-15 03:15:05,677 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: defer
165
+ 2024-08-15 03:15:05,677 INFO SenderThread:10026 [sender.py:send_request_defer():613] handle sender defer: 4
166
+ 2024-08-15 03:15:05,677 INFO SenderThread:10026 [sender.py:transition_state():617] send defer: 5
167
+ 2024-08-15 03:15:05,677 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: defer
168
+ 2024-08-15 03:15:05,677 INFO HandlerThread:10026 [handler.py:handle_request_defer():172] handle defer: 5
169
+ 2024-08-15 03:15:05,678 DEBUG SenderThread:10026 [sender.py:send():382] send: summary
170
+ 2024-08-15 03:15:05,679 INFO SenderThread:10026 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
171
+ 2024-08-15 03:15:05,679 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: defer
172
+ 2024-08-15 03:15:05,679 INFO SenderThread:10026 [sender.py:send_request_defer():613] handle sender defer: 5
173
+ 2024-08-15 03:15:05,679 INFO SenderThread:10026 [sender.py:transition_state():617] send defer: 6
174
+ 2024-08-15 03:15:05,679 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: defer
175
+ 2024-08-15 03:15:05,679 INFO HandlerThread:10026 [handler.py:handle_request_defer():172] handle defer: 6
176
+ 2024-08-15 03:15:05,680 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: defer
177
+ 2024-08-15 03:15:05,680 INFO SenderThread:10026 [sender.py:send_request_defer():613] handle sender defer: 6
178
+ 2024-08-15 03:15:05,680 INFO SenderThread:10026 [sender.py:transition_state():617] send defer: 7
179
+ 2024-08-15 03:15:05,680 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
180
+ 2024-08-15 03:15:05,680 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: defer
181
+ 2024-08-15 03:15:05,680 INFO HandlerThread:10026 [handler.py:handle_request_defer():172] handle defer: 7
182
+ 2024-08-15 03:15:05,680 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: defer
183
+ 2024-08-15 03:15:05,680 INFO SenderThread:10026 [sender.py:send_request_defer():613] handle sender defer: 7
184
+ 2024-08-15 03:15:05,984 INFO Thread-12 :10026 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240815_031216-0szn78ph/files/wandb-summary.json
185
+ 2024-08-15 03:15:06,481 INFO SenderThread:10026 [sender.py:transition_state():617] send defer: 8
186
+ 2024-08-15 03:15:06,481 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: defer
187
+ 2024-08-15 03:15:06,481 INFO HandlerThread:10026 [handler.py:handle_request_defer():172] handle defer: 8
188
+ 2024-08-15 03:15:06,481 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: defer
189
+ 2024-08-15 03:15:06,481 INFO SenderThread:10026 [sender.py:send_request_defer():613] handle sender defer: 8
190
+ 2024-08-15 03:15:06,481 INFO SenderThread:10026 [job_builder.py:build():296] Attempting to build job artifact
191
+ 2024-08-15 03:15:06,482 INFO SenderThread:10026 [job_builder.py:_get_source_type():426] is repo sourced job
192
+ 2024-08-15 03:15:06,507 INFO SenderThread:10026 [job_builder.py:build():402] adding wandb-job metadata file
193
+ 2024-08-15 03:15:06,516 INFO SenderThread:10026 [sender.py:transition_state():617] send defer: 9
194
+ 2024-08-15 03:15:06,517 DEBUG SenderThread:10026 [sender.py:send():382] send: artifact
195
+ 2024-08-15 03:15:06,517 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: defer
196
+ 2024-08-15 03:15:06,518 INFO HandlerThread:10026 [handler.py:handle_request_defer():172] handle defer: 9
197
+ 2024-08-15 03:15:06,633 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: poll_exit
198
+ 2024-08-15 03:15:06,985 INFO Thread-12 :10026 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240815_031216-0szn78ph/files/output.log
199
+ 2024-08-15 03:15:08,040 INFO wandb-upload_0:10026 [upload_job.py:push():89] Uploaded file /singularity_home/.local/share/wandb/artifacts/staging/tmpb932s___
200
+ 2024-08-15 03:15:08,047 INFO wandb-upload_1:10026 [upload_job.py:push():89] Uploaded file /singularity_home/.local/share/wandb/artifacts/staging/tmpl85vnluw
201
+ 2024-08-15 03:15:09,160 INFO SenderThread:10026 [sender.py:send_artifact():1494] sent artifact job-https___github.com_cl-tohoku_llm-recipes-failab-m1-yans.git_examples_finetuning.py - {'id': 'QXJ0aWZhY3Q6MTE1MDEyMDEwMQ==', 'state': 'PENDING', 'artifactSequence': {'id': 'QXJ0aWZhY3RDb2xsZWN0aW9uOjM2MjY3MjMzNA==', 'latestArtifact': {'id': 'QXJ0aWZhY3Q6MTE0MDA5NDY1MQ==', 'versionIndex': 9}}}
202
+ 2024-08-15 03:15:09,160 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: defer
203
+ 2024-08-15 03:15:09,160 INFO SenderThread:10026 [sender.py:send_request_defer():613] handle sender defer: 9
204
+ 2024-08-15 03:15:09,160 INFO SenderThread:10026 [dir_watcher.py:finish():358] shutting down directory watcher
205
+ 2024-08-15 03:15:09,986 INFO SenderThread:10026 [dir_watcher.py:finish():388] scan: /project/wandb/run-20240815_031216-0szn78ph/files
206
+ 2024-08-15 03:15:09,987 INFO SenderThread:10026 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240815_031216-0szn78ph/files/requirements.txt requirements.txt
207
+ 2024-08-15 03:15:09,987 INFO SenderThread:10026 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240815_031216-0szn78ph/files/config.yaml config.yaml
208
+ 2024-08-15 03:15:09,988 INFO SenderThread:10026 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240815_031216-0szn78ph/files/wandb-metadata.json wandb-metadata.json
209
+ 2024-08-15 03:15:09,989 INFO SenderThread:10026 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240815_031216-0szn78ph/files/wandb-summary.json wandb-summary.json
210
+ 2024-08-15 03:15:09,990 INFO SenderThread:10026 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240815_031216-0szn78ph/files/output.log output.log
211
+ 2024-08-15 03:15:09,992 INFO SenderThread:10026 [sender.py:transition_state():617] send defer: 10
212
+ 2024-08-15 03:15:09,992 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: poll_exit
213
+ 2024-08-15 03:15:09,992 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: defer
214
+ 2024-08-15 03:15:09,993 INFO HandlerThread:10026 [handler.py:handle_request_defer():172] handle defer: 10
215
+ 2024-08-15 03:15:09,994 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: defer
216
+ 2024-08-15 03:15:09,994 INFO SenderThread:10026 [sender.py:send_request_defer():613] handle sender defer: 10
217
+ 2024-08-15 03:15:09,994 INFO SenderThread:10026 [file_pusher.py:finish():172] shutting down file pusher
218
+ 2024-08-15 03:15:10,399 INFO wandb-upload_1:10026 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240815_031216-0szn78ph/files/config.yaml
219
+ 2024-08-15 03:15:10,439 INFO wandb-upload_0:10026 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240815_031216-0szn78ph/files/requirements.txt
220
+ 2024-08-15 03:15:10,453 INFO wandb-upload_2:10026 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240815_031216-0szn78ph/files/wandb-summary.json
221
+ 2024-08-15 03:15:10,537 INFO wandb-upload_3:10026 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240815_031216-0szn78ph/files/output.log
222
+ 2024-08-15 03:15:10,635 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: poll_exit
223
+ 2024-08-15 03:15:10,635 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: poll_exit
224
+ 2024-08-15 03:15:10,737 INFO Thread-11 (_thread_body):10026 [sender.py:transition_state():617] send defer: 11
225
+ 2024-08-15 03:15:10,738 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: defer
226
+ 2024-08-15 03:15:10,738 INFO HandlerThread:10026 [handler.py:handle_request_defer():172] handle defer: 11
227
+ 2024-08-15 03:15:10,738 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: defer
228
+ 2024-08-15 03:15:10,738 INFO SenderThread:10026 [sender.py:send_request_defer():613] handle sender defer: 11
229
+ 2024-08-15 03:15:10,738 INFO SenderThread:10026 [file_pusher.py:join():178] waiting for file pusher
230
+ 2024-08-15 03:15:10,738 INFO SenderThread:10026 [sender.py:transition_state():617] send defer: 12
231
+ 2024-08-15 03:15:10,738 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: defer
232
+ 2024-08-15 03:15:10,738 INFO HandlerThread:10026 [handler.py:handle_request_defer():172] handle defer: 12
233
+ 2024-08-15 03:15:10,738 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: defer
234
+ 2024-08-15 03:15:10,738 INFO SenderThread:10026 [sender.py:send_request_defer():613] handle sender defer: 12
235
+ 2024-08-15 03:15:10,738 INFO SenderThread:10026 [file_stream.py:finish():595] file stream finish called
236
+ 2024-08-15 03:15:11,367 INFO SenderThread:10026 [file_stream.py:finish():599] file stream finish is done
237
+ 2024-08-15 03:15:11,368 INFO SenderThread:10026 [sender.py:transition_state():617] send defer: 13
238
+ 2024-08-15 03:15:11,368 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: defer
239
+ 2024-08-15 03:15:11,368 INFO HandlerThread:10026 [handler.py:handle_request_defer():172] handle defer: 13
240
+ 2024-08-15 03:15:11,368 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: defer
241
+ 2024-08-15 03:15:11,368 INFO SenderThread:10026 [sender.py:send_request_defer():613] handle sender defer: 13
242
+ 2024-08-15 03:15:11,368 INFO SenderThread:10026 [sender.py:transition_state():617] send defer: 14
243
+ 2024-08-15 03:15:11,369 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: defer
244
+ 2024-08-15 03:15:11,369 DEBUG SenderThread:10026 [sender.py:send():382] send: final
245
+ 2024-08-15 03:15:11,369 INFO HandlerThread:10026 [handler.py:handle_request_defer():172] handle defer: 14
246
+ 2024-08-15 03:15:11,369 DEBUG SenderThread:10026 [sender.py:send():382] send: footer
247
+ 2024-08-15 03:15:11,369 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: defer
248
+ 2024-08-15 03:15:11,369 INFO SenderThread:10026 [sender.py:send_request_defer():613] handle sender defer: 14
249
+ 2024-08-15 03:15:14,370 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
250
+ 2024-08-15 03:15:19,370 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
251
+ 2024-08-15 03:15:24,371 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
252
+ 2024-08-15 03:15:29,371 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
253
+ 2024-08-15 03:15:34,372 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
254
+ 2024-08-15 03:15:37,452 WARNING StreamThr :10026 [internal.py:is_dead():414] Internal process exiting, parent pid 9957 disappeared
255
+ 2024-08-15 03:15:37,452 ERROR StreamThr :10026 [internal.py:wandb_internal():152] Internal process shutdown.
256
+ 2024-08-15 03:15:38,372 INFO SenderThread:10026 [sender.py:finish():1572] shutting down sender
257
+ 2024-08-15 03:15:38,372 INFO SenderThread:10026 [file_pusher.py:finish():172] shutting down file pusher
258
+ 2024-08-15 03:15:38,372 INFO SenderThread:10026 [file_pusher.py:join():178] waiting for file pusher
259
+ 2024-08-15 03:15:38,372 INFO WriterThread:10026 [datastore.py:close():296] close: /project/wandb/run-20240815_031216-0szn78ph/run-0szn78ph.wandb
260
+ 2024-08-15 03:15:38,373 INFO HandlerThread:10026 [handler.py:finish():869] shutting down handler
wandb/run-20240815_031216-0szn78ph/logs/debug.log ADDED
@@ -0,0 +1,29 @@
1
+ 2024-08-15 03:12:16,236 INFO MainThread:9957 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
2
+ 2024-08-15 03:12:16,236 INFO MainThread:9957 [wandb_setup.py:_flush():76] Configure stats pid to 9957
3
+ 2024-08-15 03:12:16,236 INFO MainThread:9957 [wandb_setup.py:_flush():76] Loading settings from /singularity_home/.config/wandb/settings
4
+ 2024-08-15 03:12:16,236 INFO MainThread:9957 [wandb_setup.py:_flush():76] Loading settings from /project/wandb/settings
5
+ 2024-08-15 03:12:16,236 INFO MainThread:9957 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'api_key': '***REDACTED***', 'run_notes': 'Train Qwen2'}
6
+ 2024-08-15 03:12:16,236 INFO MainThread:9957 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
7
+ 2024-08-15 03:12:16,236 INFO MainThread:9957 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'examples/finetuning.py', 'program_abspath': '/project/examples/finetuning.py', 'program': '/project/examples/finetuning.py'}
8
+ 2024-08-15 03:12:16,236 INFO MainThread:9957 [wandb_init.py:_log_setup():526] Logging user logs to /project/wandb/run-20240815_031216-0szn78ph/logs/debug.log
9
+ 2024-08-15 03:12:16,236 INFO MainThread:9957 [wandb_init.py:_log_setup():527] Logging internal logs to /project/wandb/run-20240815_031216-0szn78ph/logs/debug-internal.log
10
+ 2024-08-15 03:12:16,236 INFO MainThread:9957 [wandb_init.py:init():566] calling init triggers
11
+ 2024-08-15 03:12:16,237 INFO MainThread:9957 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
12
+ config: {'sharding_strategy': 'FULL_SHARD', 'checkpoint_type': 'LOCAL_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'low_cpu_fsdp': False, 'no_meta_device': False, 'data_path': None, 'split': '969, 30, 1', 'train_data_path': ['304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document'], 'valid_data_path': ['304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document'], 'test_data_path': ['304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document'], 'data_cache_path': None, 'vocab_size': None, 'vocab_file': None, 'merge_file': None, 'seq_length': 4096, 'num_workers': 2, 'tokenizer_type': 'HFPreTrainedTokenizer', 'tokenizer_model': '/share/pretrained_lm/Qwen/Qwen2-0.5B', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'retro_return_doc_ids': False, 'short_seq_prob': 0.1, 'vocab_extra_ids': 0, 'seed': 1234, 'use_mpi': False, 'wandb_entity': 'iwakawa-koichi-q5-tohoku-nlp6723', 'wandb_name': 'yans-qwen2-0.5B_train_2024-08-15-03:11:59', 'wandb_project': 'llm_tutorial', 'quantization': False, 'use_freeze_layers': False, 'freeze_layers': None, 'bf16': True, 'fp16': False, 'mixed_precision': True, 'param_dtype': None, 'load': '/work/llm_recipes/models/yans-qwen2-0.5B', 'save': '/work/llm_recipes/models/yans-qwen2-0.5B', 'base_model': '/share/pretrained_lm/Qwen/Qwen2-0.5B', 'use_better_transformer': False, 'grad_clip_norm': 1.0, 'eval_interval': 10, 'save_interval': 10, 'eval_iters': 10, 'optimizer': 'adam', 'lr': 2e-05, 'lr_decay_style': 'cosine', 'lr_decay_iters': 20000, 'lr_warmup_iters': 500, 'min_lr': 1e-06, 'train_iters': 20000, 'train_samples': None, 'global_batch_size': 320, 'micro_batch_size': 1, 'make_vocab_size_divisible_by': 128, 'sliding_window_size': 4096, 'skip_batch': None, 'no_save_optimizer_state': False, 'continual_pretraining': False, 'instruction_tuning': False, 'direct_preference_optimization': False, 'attention_dropout': 0.1, 'hidden_dropout': 0.1, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-06, 'hf_transformer_model_dir': None, 'instruction_train_data_path': None, 'instruction_valid_data_path': None, 'epoch': None, 'instruction_dataset_size': None, 'save_sampler_state': False, 'label_smoothing': 0.0, 'save_n_checkpoints': 10, 'hf_repo_id': 'koichi12/yans-qwen2-0.5B', 'create_public_hf_repo': False, 'upload_all_checkpoints_to_hf': False, 'hf_upload_retry_limit': 2, 'exit_duration_in_mins': None, 'source_key': None, 'target_key': None, 'attn_implementation': 'flash_attention_2', 'efficient_instruction_tuning': False, 'remove_padding_masking': False, 'save_start_iter': None, 'rank': 0, 'world_size': 1, 'padded_vocab_size': 151680, 'gradient_accumulation_steps': 320}
13
+ 2024-08-15 03:12:16,237 INFO MainThread:9957 [wandb_init.py:init():616] starting backend
14
+ 2024-08-15 03:12:16,237 INFO MainThread:9957 [wandb_init.py:init():620] setting up manager
15
+ 2024-08-15 03:12:16,241 INFO MainThread:9957 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
16
+ 2024-08-15 03:12:16,243 INFO MainThread:9957 [wandb_init.py:init():628] backend started and connected
17
+ 2024-08-15 03:12:16,248 INFO MainThread:9957 [wandb_init.py:init():720] updated telemetry
18
+ 2024-08-15 03:12:16,405 INFO MainThread:9957 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
19
+ 2024-08-15 03:12:16,892 INFO MainThread:9957 [wandb_run.py:_on_init():2262] communicating current version
20
+ 2024-08-15 03:12:16,915 INFO MainThread:9957 [wandb_run.py:_on_init():2271] got version response upgrade_message: "wandb version 0.17.6 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
21
+
22
+ 2024-08-15 03:12:16,915 INFO MainThread:9957 [wandb_init.py:init():804] starting run threads in backend
23
+ 2024-08-15 03:12:17,032 INFO MainThread:9957 [wandb_run.py:_console_start():2241] atexit reg
24
+ 2024-08-15 03:12:17,033 INFO MainThread:9957 [wandb_run.py:_redirect():2096] redirect: wrap_raw
25
+ 2024-08-15 03:12:17,033 INFO MainThread:9957 [wandb_run.py:_redirect():2161] Wrapping output streams.
26
+ 2024-08-15 03:12:17,033 INFO MainThread:9957 [wandb_run.py:_redirect():2186] Redirects installed.
27
+ 2024-08-15 03:12:17,034 INFO MainThread:9957 [wandb_init.py:init():847] run started, returning control to user process
28
+ 2024-08-15 03:12:43,259 INFO MainThread:9957 [wandb_run.py:_config_callback():1343] config_cb None None {'model_architecture': 'Qwen2ForCausalLM', 'activation_function': 'silu', 'hidden_size': 896, 'model_type': 'qwen2', 'max_position_embeddings': 4096, 'num_attention_heads': 14, 'num_hidden_layers': 24}
29
+ 2024-08-15 03:12:43,260 INFO MainThread:9957 [wandb_run.py:_config_callback():1343] config_cb None None {'world_size': 1}
wandb/run-20240815_031216-0szn78ph/run-0szn78ph.wandb ADDED
Binary file (21.9 kB). View file
 
wandb/run-20240823_162543-eroprw00/files/config.yaml ADDED
@@ -0,0 +1,342 @@
1
+ wandb_version: 1
2
+
3
+ sharding_strategy:
4
+ desc: null
5
+ value: FULL_SHARD
6
+ checkpoint_type:
7
+ desc: null
8
+ value: LOCAL_STATE_DICT
9
+ fsdp_activation_checkpointing:
10
+ desc: null
11
+ value: true
12
+ fsdp_cpu_offload:
13
+ desc: null
14
+ value: false
15
+ low_cpu_fsdp:
16
+ desc: null
17
+ value: false
18
+ no_meta_device:
19
+ desc: null
20
+ value: false
21
+ data_path:
22
+ desc: null
23
+ value: null
24
+ split:
25
+ desc: null
26
+ value: 969, 30, 1
27
+ train_data_path:
28
+ desc: null
29
+ value:
30
+ - '1754785366'
31
+ - /project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document
32
+ - '28623823675'
33
+ - /project/datas/llm-jp-corpus-v2/ja-cc/level0/data_text_document
34
+ valid_data_path:
35
+ desc: null
36
+ value:
37
+ - '1754785366'
38
+ - /project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document
39
+ test_data_path:
40
+ desc: null
41
+ value:
42
+ - '1754785366'
43
+ - /project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document
44
+ data_cache_path:
45
+ desc: null
46
+ value: null
47
+ vocab_size:
48
+ desc: null
49
+ value: null
50
+ vocab_file:
51
+ desc: null
52
+ value: null
53
+ merge_file:
54
+ desc: null
55
+ value: null
56
+ seq_length:
57
+ desc: null
58
+ value: 4096
59
+ num_workers:
60
+ desc: null
61
+ value: 2
62
+ tokenizer_type:
63
+ desc: null
64
+ value: HFPreTrainedTokenizer
65
+ tokenizer_model:
66
+ desc: null
67
+ value: /share/pretrained_lm/Qwen/Qwen2-0.5B
68
+ reset_position_ids:
69
+ desc: null
70
+ value: false
71
+ reset_attention_mask:
72
+ desc: null
73
+ value: false
74
+ eod_mask_loss:
75
+ desc: null
76
+ value: false
77
+ retro_return_doc_ids:
78
+ desc: null
79
+ value: false
80
+ short_seq_prob:
81
+ desc: null
82
+ value: 0.1
83
+ vocab_extra_ids:
84
+ desc: null
85
+ value: 0
86
+ seed:
87
+ desc: null
88
+ value: 1234
89
+ use_mpi:
90
+ desc: null
91
+ value: false
92
+ wandb_entity:
93
+ desc: null
94
+ value: iwakawa-koichi-q5-tohoku-nlp6723
95
+ wandb_name:
96
+ desc: null
97
+ value: Qwen2-0.5b-0.2_train_2024-08-23-16:25:30
98
+ wandb_project:
99
+ desc: null
100
+ value: llm_tutorial-0.2
101
+ quantization:
102
+ desc: null
103
+ value: false
104
+ use_freeze_layers:
105
+ desc: null
106
+ value: false
107
+ freeze_layers:
108
+ desc: null
109
+ value: null
110
+ bf16:
111
+ desc: null
112
+ value: true
113
+ fp16:
114
+ desc: null
115
+ value: false
116
+ mixed_precision:
117
+ desc: null
118
+ value: true
119
+ param_dtype:
120
+ desc: null
121
+ value: null
122
+ load:
123
+ desc: null
124
+ value: /work/llm_recipes/models/Qwen2-0.5b-0.2
125
+ save:
126
+ desc: null
127
+ value: /work/llm_recipes/models/Qwen2-0.5b-0.2
128
+ base_model:
129
+ desc: null
130
+ value: /share/pretrained_lm/Qwen/Qwen2-0.5B
131
+ use_better_transformer:
132
+ desc: null
133
+ value: false
134
+ grad_clip_norm:
135
+ desc: null
136
+ value: 1.0
137
+ eval_interval:
138
+ desc: null
139
+ value: 10
140
+ save_interval:
141
+ desc: null
142
+ value: 10
143
+ eval_iters:
144
+ desc: null
145
+ value: 10
146
+ optimizer:
147
+ desc: null
148
+ value: anyprecision
149
+ lr:
150
+ desc: null
151
+ value: 2.0e-05
152
+ lr_decay_style:
153
+ desc: null
154
+ value: cosine
155
+ lr_decay_iters:
156
+ desc: null
157
+ value: 7500
158
+ lr_warmup_iters:
159
+ desc: null
160
+ value: 500
161
+ min_lr:
162
+ desc: null
163
+ value: 1.0e-06
164
+ train_iters:
165
+ desc: null
166
+ value: 7500
167
+ train_samples:
168
+ desc: null
169
+ value: null
170
+ global_batch_size:
171
+ desc: null
172
+ value: 640
173
+ micro_batch_size:
174
+ desc: null
175
+ value: 5
176
+ make_vocab_size_divisible_by:
177
+ desc: null
178
+ value: 128
179
+ sliding_window_size:
180
+ desc: null
181
+ value: 131072
182
+ skip_batch:
183
+ desc: null
184
+ value: null
185
+ no_save_optimizer_state:
186
+ desc: null
187
+ value: false
188
+ continual_pretraining:
189
+ desc: null
190
+ value: false
191
+ instruction_tuning:
192
+ desc: null
193
+ value: false
194
+ direct_preference_optimization:
195
+ desc: null
196
+ value: false
197
+ attention_dropout:
198
+ desc: null
199
+ value: 0.1
200
+ hidden_dropout:
201
+ desc: null
202
+ value: 0.1
203
+ weight_decay:
204
+ desc: null
205
+ value: 0.1
206
+ adam_beta1:
207
+ desc: null
208
+ value: 0.9
209
+ adam_beta2:
210
+ desc: null
211
+ value: 0.95
212
+ adam_eps:
213
+ desc: null
214
+ value: 1.0e-06
215
+ hf_transformer_model_dir:
216
+ desc: null
217
+ value: null
218
+ instruction_train_data_path:
219
+ desc: null
220
+ value: null
221
+ instruction_valid_data_path:
222
+ desc: null
223
+ value: null
224
+ epoch:
225
+ desc: null
226
+ value: null
227
+ instruction_dataset_size:
228
+ desc: null
229
+ value: null
230
+ save_sampler_state:
231
+ desc: null
232
+ value: false
233
+ label_smoothing:
234
+ desc: null
235
+ value: 0.0
236
+ save_n_checkpoints:
237
+ desc: null
238
+ value: 10
239
+ hf_repo_id:
240
+ desc: null
241
+ value: koichi12/Qwen2-0.5b-0.2
242
+ create_public_hf_repo:
243
+ desc: null
244
+ value: false
245
+ upload_all_checkpoints_to_hf:
246
+ desc: null
247
+ value: true
248
+ hf_upload_retry_limit:
249
+ desc: null
250
+ value: 2
251
+ exit_duration_in_mins:
252
+ desc: null
253
+ value: null
254
+ source_key:
255
+ desc: null
256
+ value: null
257
+ target_key:
258
+ desc: null
259
+ value: null
260
+ attn_implementation:
261
+ desc: null
262
+ value: flash_attention_2
263
+ efficient_instruction_tuning:
264
+ desc: null
265
+ value: false
266
+ remove_padding_masking:
267
+ desc: null
268
+ value: false
269
+ save_start_iter:
270
+ desc: null
271
+ value: null
272
+ valid_micro_batch_size:
273
+ desc: null
274
+ value: 1
275
+ rank:
276
+ desc: null
277
+ value: 0
278
+ world_size:
279
+ desc: null
280
+ value: 1
281
+ padded_vocab_size:
282
+ desc: null
283
+ value: 151680
284
+ gradient_accumulation_steps:
285
+ desc: null
286
+ value: 128
287
+ _wandb:
288
+ desc: null
289
+ value:
290
+ python_version: 3.10.12
291
+ cli_version: 0.16.3
292
+ framework: huggingface
293
+ huggingface_version: 4.43.3
294
+ is_jupyter_run: false
295
+ is_kaggle_kernel: false
296
+ start_time: 1724397943.202675
297
+ t:
298
+ 1:
299
+ - 1
300
+ - 11
301
+ - 49
302
+ - 55
303
+ - 71
304
+ - 105
305
+ 2:
306
+ - 1
307
+ - 11
308
+ - 49
309
+ - 55
310
+ - 71
311
+ - 105
312
+ 3:
313
+ - 13
314
+ - 16
315
+ - 23
316
+ 4: 3.10.12
317
+ 5: 0.16.3
318
+ 6: 4.43.3
319
+ 8:
320
+ - 5
321
+ 13: linux-x86_64
322
+ model_architecture:
323
+ desc: null
324
+ value: Qwen2ForCausalLM
325
+ activation_function:
326
+ desc: null
327
+ value: silu
328
+ hidden_size:
329
+ desc: null
330
+ value: 896
331
+ model_type:
332
+ desc: null
333
+ value: qwen2
334
+ max_position_embeddings:
335
+ desc: null
336
+ value: 4096
337
+ num_attention_heads:
338
+ desc: null
339
+ value: 14
340
+ num_hidden_layers:
341
+ desc: null
342
+ value: 24
wandb/run-20240823_162543-eroprw00/files/output.log ADDED
@@ -0,0 +1,116 @@
1
+ Created Hugging Face repository with ID koichi12/Qwen2-0.5b-0.2.
2
+ Clearing GPU cache for all ranks
3
+ --> Running with torch torch_distributed debug set to detail
4
+ Loading model state dict from /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000040/model.pt
5
+ Loaded model state dict from /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000040/model.pt
6
+ --> Model /share/pretrained_lm/Qwen/Qwen2-0.5B
7
+ --> /share/pretrained_lm/Qwen/Qwen2-0.5B has 494.032768 Million params
8
+ BFloat16 enabled for mixed precision - using bfSixteen policy
9
+ You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
10
+ /usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_init_utils.py:441: UserWarning: FSDP is switching to use `NO_SHARD` instead of ShardingStrategy.FULL_SHARD since the world size is 1.
11
+ warnings.warn(
12
+ Let split = None
13
+ --> applying fsdp activation checkpointing...
14
+ > datasets target sizes (minimum size):
15
+ train: 4800000
16
+ validation: 4806400
17
+ test: 6400
18
+ > building train, validation, and test datasets for GPT ...
19
+ Unable to save the indexes because path_to_cache is None
20
+ > finished creating GPT datasets ...
21
+ Loading optimizer state dict from /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000040/optimizer.pt
22
+ Building a BlendedDataset for a single MegatronDataset
23
+ Unable to save the indexes because path_to_cache is None
24
+ Building a BlendedDataset for a single MegatronDataset
25
+ Unable to save the indexes because path_to_cache is None
26
+ Loaded optimizer state dict from /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000040/optimizer.pt
27
+ model info: FullyShardedDataParallel(
28
+ (_fsdp_wrapped_module): Qwen2ForCausalLM(
29
+ (model): Qwen2Model(
30
+ (embed_tokens): Embedding(151936, 896)
31
+ (layers): ModuleList(
32
+ (0-23): 24 x FullyShardedDataParallel(
33
+ (_fsdp_wrapped_module): CheckpointWrapper(
34
+ (_checkpoint_wrapped_module): Qwen2DecoderLayer(
35
+ (self_attn): Qwen2FlashAttention2(
36
+ (q_proj): Linear(in_features=896, out_features=896, bias=True)
37
+ (k_proj): Linear(in_features=896, out_features=128, bias=True)
38
+ (v_proj): Linear(in_features=896, out_features=128, bias=True)
39
+ (o_proj): Linear(in_features=896, out_features=896, bias=False)
40
+ (rotary_emb): Qwen2RotaryEmbedding()
41
+ )
42
+ (mlp): Qwen2MLP(
43
+ (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
44
+ (up_proj): Linear(in_features=896, out_features=4864, bias=False)
45
+ (down_proj): Linear(in_features=4864, out_features=896, bias=False)
46
+ (act_fn): SiLU()
47
+ )
48
+ (input_layernorm): Qwen2RMSNorm()
49
+ (post_attention_layernorm): Qwen2RMSNorm()
50
+ )
51
+ )
52
+ )
53
+ )
54
+ (norm): Qwen2RMSNorm()
55
+ )
56
+ (lm_head): Linear(in_features=896, out_features=151936, bias=False)
57
+ )
58
+ )
59
+ model config: Qwen2Config {
60
+ "_name_or_path": "/share/pretrained_lm/Qwen/Qwen2-0.5B",
61
+ "architectures": [
62
+ "Qwen2ForCausalLM"
63
+ ],
64
+ "attention_dropout": 0.0,
65
+ "bos_token_id": 151643,
66
+ "eos_token_id": 151643,
67
+ "hidden_act": "silu",
68
+ "hidden_size": 896,
69
+ "initializer_range": 0.02,
70
+ "intermediate_size": 4864,
71
+ "label_smoothing": 0.0,
72
+ "max_position_embeddings": 4096,
73
+ "max_window_layers": 24,
74
+ "model_type": "qwen2",
75
+ "num_attention_heads": 14,
76
+ "num_hidden_layers": 24,
77
+ "num_key_value_heads": 2,
78
+ "rms_norm_eps": 1e-06,
79
+ "rope_theta": 1000000.0,
80
+ "sliding_window": 131072,
81
+ "tie_word_embeddings": true,
82
+ "torch_dtype": "bfloat16",
83
+ "transformers_version": "4.43.3",
84
+ "use_cache": false,
85
+ "use_sliding_window": false,
86
+ "vocab_size": 151936
87
+ }
88
+ [rank0]:[2024-08-23 16:25:50,866] torch.distributed.fsdp._debug_utils: [WARNING] FSDP _flatten_optim_state_dict() profiling: defaultdict(<class 'float'>, {})
89
+ Traceback (most recent call last):
90
+ File "/project/examples/finetuning.py", line 13, in <module>
91
+ main()
92
+ File "/project/src/llama_recipes/finetuning.py", line 282, in main
93
+ train(
94
+ File "/project/src/llama_recipes/utils/train_utils.py", line 110, in train
95
+ loss: torch.Tensor = model(**batch).loss
96
+ File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
97
+ return self._call_impl(*args, **kwargs)
98
+ File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1520, in _call_impl
99
+ return forward_call(*args, **kwargs)
100
+ File "/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py", line 849, in forward
101
+ output = self._fsdp_wrapped_module(*args, **kwargs)
102
+ File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
103
+ return self._call_impl(*args, **kwargs)
104
+ File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1520, in _call_impl
105
+ return forward_call(*args, **kwargs)
106
+ File "/project/lib/transformers/src/transformers/models/qwen2/modeling_qwen2.py", line 1082, in forward
107
+ loss = loss_fct(shift_logits, shift_labels)
108
+ File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
109
+ return self._call_impl(*args, **kwargs)
110
+ File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1520, in _call_impl
111
+ return forward_call(*args, **kwargs)
112
+ File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/loss.py", line 1179, in forward
113
+ return F.cross_entropy(input, target, weight=self.weight,
114
+ File "/usr/local/lib/python3.10/dist-packages/torch/nn/functional.py", line 3086, in cross_entropy
115
+ return torch._C._nn.cross_entropy_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index, label_smoothing)
116
+ torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 11.59 GiB. GPU 0 has a total capacity of 39.39 GiB of which 11.28 GiB is free. Including non-PyTorch memory, this process has 28.11 GiB memory in use. Of the allocated memory 26.94 GiB is allocated by PyTorch, and 363.22 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
wandb/run-20240823_162543-eroprw00/files/requirements.txt ADDED
@@ -0,0 +1,375 @@
1
+ absl-py==2.1.0
2
+ accelerate==0.23.0
3
+ aiohttp==3.9.1
4
+ aiosignal==1.3.1
5
+ annotated-types==0.6.0
6
+ antlr4-python3-runtime==4.9.3
7
+ anyio==4.4.0
8
+ apex==0.1
9
+ appdirs==1.4.4
10
+ argon2-cffi-bindings==21.2.0
11
+ argon2-cffi==23.1.0
12
+ astroid==3.2.4
13
+ asttokens==2.4.1
14
+ astunparse==1.6.3
15
+ async-timeout==4.0.3
16
+ attrs==23.2.0
17
+ audioread==3.0.1
18
+ beautifulsoup4==4.12.3
19
+ bert-score==0.3.13
20
+ bleach==6.1.0
21
+ blis==0.7.11
22
+ build==1.2.1
23
+ cachecontrol==0.14.0
24
+ cachetools==5.3.2
25
+ catalogue==2.0.10
26
+ certifi==2024.2.2
27
+ cffi==1.16.0
28
+ chardet==5.2.0
29
+ charset-normalizer==3.3.2
30
+ cleo==2.1.0
31
+ click==8.1.7
32
+ cloudpathlib==0.16.0
33
+ cloudpickle==3.0.0
34
+ cmake==3.28.1
35
+ colorama==0.4.6
36
+ comm==0.2.1
37
+ confection==0.1.4
38
+ contourpy==1.2.0
39
+ cramjam==2.8.3
40
+ crashtest==0.4.1
41
+ cryptography==43.0.0
42
+ cubinlinker==0.3.0+2.g405ac64
43
+ cuda-python==12.3.0rc4+9.gdb8c48a.dirty
44
+ cudf==23.12.0
45
+ cugraph-dgl==23.12.0
46
+ cugraph-service-client==23.12.0
47
+ cugraph-service-server==23.12.0
48
+ cugraph==23.12.0
49
+ cuml==23.12.0
50
+ cupy-cuda12x==12.3.0
51
+ cycler==0.12.1
52
+ cymem==2.0.8
53
+ cython==3.0.8
54
+ dask-cuda==23.12.0
55
+ dask-cudf==23.12.0
56
+ dask==2023.11.0
57
+ dataclasses-json==0.6.7
58
+ dataproperty==1.0.1
59
+ datasets==2.20.0
60
+ debugpy==1.8.1
61
+ decorator==5.1.1
62
+ defusedxml==0.7.1
63
+ dill==0.3.8
64
+ distlib==0.3.8
65
+ distributed==2023.11.0
66
+ distro==1.9.0
67
+ dm-tree==0.1.8
68
+ docker-pycreds==0.4.0
69
+ dulwich==0.21.7
70
+ einops==0.7.0
71
+ emoji==2.12.1
72
+ entmax==1.3
73
+ evaluate==0.4.2
74
+ exceptiongroup==1.2.0
75
+ execnet==2.0.2
76
+ executing==2.0.1
77
+ expecttest==0.1.3
78
+ fastjsonschema==2.19.1
79
+ fastparquet==2023.10.1
80
+ fastrlock==0.8.2
81
+ filelock==3.13.1
82
+ flash-attn==2.4.2
83
+ fonttools==4.48.1
84
+ frozenlist==1.4.1
85
+ fsspec==2023.12.2
86
+ fugashi==1.3.2
87
+ fuzzywuzzy==0.18.0
88
+ gast==0.5.4
89
+ gitdb==4.0.11
90
+ gitpython==3.1.43
91
+ google-auth-oauthlib==0.4.6
92
+ google-auth==2.27.0
93
+ graphsurgeon==0.4.6
94
+ greenlet==3.0.3
95
+ grpcio==1.60.1
96
+ h11==0.14.0
97
+ httpcore==1.0.5
98
+ httpx==0.27.0
99
+ huggingface-hub==0.24.5
100
+ hydra-core==1.3.2
101
+ hypothesis==5.35.1
102
+ idna==3.6
103
+ importlib-metadata==7.0.1
104
+ iniconfig==2.0.0
105
+ installer==0.7.0
106
+ intel-openmp==2021.4.0
107
+ ipadic==1.0.0
108
+ ipykernel==6.29.2
109
+ ipython-genutils==0.2.0
110
+ ipython==8.21.0
111
+ isort==5.13.2
112
+ jaraco.classes==3.4.0
113
+ jedi==0.19.1
114
+ jeepney==0.8.0
115
+ jinja2==3.1.3
116
+ jiter==0.5.0
117
+ joblib==1.3.2
118
+ json5==0.9.14
119
+ jsonargparse==3.13.1
120
+ jsonlines==4.0.0
121
+ jsonnet==0.19.1
122
+ jsonpatch==1.33
123
+ jsonpointer==3.0.0
124
+ jsonschema-specifications==2023.12.1
125
+ jsonschema==4.21.1
126
+ jupyter-client==8.6.0
127
+ jupyter-core==5.7.1
128
+ jupyter-tensorboard==0.2.0
129
+ jupyterlab-pygments==0.3.0
130
+ jupyterlab-server==1.2.0
131
+ jupyterlab==2.3.2
132
+ jupytext==1.16.1
133
+ keyring==24.3.1
134
+ kiwisolver==1.4.5
135
+ langchain-community==0.2.12
136
+ langchain-core==0.2.31
137
+ langchain-huggingface==0.0.2
138
+ langchain-openai==0.1.21
139
+ langchain-text-splitters==0.2.2
140
+ langchain==0.2.13
141
+ langcodes==3.3.0
142
+ langsmith==0.1.99
143
+ lazy-loader==0.3
144
+ levenshtein==0.25.1
145
+ librosa==0.10.1
146
+ lightning-utilities==0.11.6
147
+ llm-jp-eval==1.4.0
148
+ llvmlite==0.40.1
149
+ lm-eval==0.3.0
150
+ locket==1.0.0
151
+ logzero==1.7.0
152
+ lxml==5.2.2
153
+ markdown-it-py==3.0.0
154
+ markdown==3.5.2
155
+ markupsafe==2.1.4
156
+ marshmallow==3.21.3
157
+ matplotlib-inline==0.1.6
158
+ matplotlib==3.8.2
159
+ mbstrdecoder==1.1.3
160
+ mccabe==0.7.0
161
+ mdit-py-plugins==0.4.0
162
+ mdurl==0.1.2
163
+ mecab-python3==1.0.6
164
+ mistune==3.0.2
165
+ mkl-devel==2021.1.1
166
+ mkl-include==2021.1.1
167
+ mkl==2021.1.1
168
+ mock==5.1.0
169
+ mojimoji==0.0.13
170
+ more-itertools==9.1.0
171
+ mpmath==1.3.0
172
+ msgpack==1.0.7
173
+ multidict==6.0.4
174
+ multiprocess==0.70.16
175
+ murmurhash==1.0.10
176
+ mypy-extensions==1.0.0
177
+ nbclient==0.9.0
178
+ nbconvert==7.16.0
179
+ nbformat==5.9.2
180
+ neologdn==0.5.3
181
+ nest-asyncio==1.6.0
182
+ networkx==2.6.3
183
+ ninja==1.11.1.1
184
+ nltk==3.8.1
185
+ notebook==6.4.10
186
+ numba==0.57.1+1.g1ff679645
187
+ numexpr==2.10.1
188
+ numpy==1.24.4
189
+ nvfuser==0.1.4a0+d0bb811
190
+ nvidia-dali-cuda120==1.34.0
191
+ nvidia-pyindex==1.0.9
192
+ nvtx==0.2.5
193
+ oauthlib==3.2.2
194
+ omegaconf==2.3.0
195
+ onnx==1.15.0rc2
196
+ openai==1.40.6
197
+ opencv==4.7.0
198
+ optree==0.10.0
199
+ orjson==3.10.7
200
+ packaging==23.2
201
+ pandas==2.2.2
202
+ pandocfilters==1.5.1
203
+ parso==0.8.3
204
+ partd==1.4.1
205
+ pathvalidate==3.2.0
206
+ peft==0.5.0
207
+ pexpect==4.9.0
208
+ pillow==10.2.0
209
+ pip==24.0
210
+ pkginfo==1.11.1
211
+ plac==1.4.3
212
+ platformdirs==4.2.0
213
+ pluggy==1.4.0
214
+ ply==3.11
215
+ poetry-core==1.9.0
216
+ poetry-plugin-export==1.8.0
217
+ poetry==1.8.3
218
+ polygraphy==0.49.4
219
+ pooch==1.8.0
220
+ portalocker==2.10.1
221
+ preshed==3.0.9
222
+ prettytable==3.9.0
223
+ prometheus-client==0.19.0
224
+ prompt-toolkit==3.0.43
225
+ protobuf==4.24.4
226
+ psutil==5.9.4
227
+ ptxcompiler==0.8.1+2.g0d406d6
228
+ ptyprocess==0.7.0
229
+ pure-eval==0.2.2
230
+ pyarrow-hotfix==0.6
231
+ pyarrow==15.0.2
232
+ pyasn1-modules==0.3.0
233
+ pyasn1==0.5.1
234
+ pybind11-global==2.11.1
235
+ pybind11==2.11.1
236
+ pycocotools==2.0+nv0.8.0
237
+ pycountry==24.6.1
238
+ pycparser==2.21
239
+ pydantic-core==2.16.2
240
+ pydantic==2.6.1
241
+ pygments==2.17.2
242
+ pylibcugraph==23.12.0
243
+ pylibcugraphops==23.12.0
244
+ pylibraft==23.12.0
245
+ pylint==3.2.6
246
+ pynvml==11.4.1
247
+ pyparsing==3.1.1
248
+ pyproject-hooks==1.1.0
249
+ pytablewriter==1.2.0
250
+ pytest-flakefinder==1.1.0
251
+ pytest-rerunfailures==13.0
252
+ pytest-shard==0.1.2
253
+ pytest-xdist==3.5.0
254
+ pytest==8.0.0
255
+ python-dateutil==2.8.2
256
+ python-dotenv==1.0.0
257
+ python-hostlist==1.23.0
258
+ python-levenshtein==0.25.1
259
+ pytorch-lightning==2.4.0
260
+ pytorch-quantization==2.1.2
261
+ pytz==2023.3.post1
262
+ pyyaml==6.0.1
263
+ pyzmq==25.1.2
264
+ raft-dask==23.12.0
265
+ rapidfuzz==3.9.6
266
+ rapids-dask-dependency==23.12.1
267
+ referencing==0.33.0
268
+ regex==2023.12.25
269
+ requests-oauthlib==1.3.1
270
+ requests-toolbelt==1.0.0
271
+ requests==2.32.3
272
+ rhoknp==1.7.0
273
+ rich==13.7.0
274
+ rmm==23.12.0
275
+ rouge-score==0.1.2
276
+ rpds-py==0.17.1
277
+ rsa==4.9
278
+ sacrebleu==2.4.2
279
+ safetensors==0.4.3
280
+ scikit-learn==1.5.1
281
+ scipy==1.12.0
282
+ secretstorage==3.3.3
283
+ send2trash==1.8.2
284
+ sentence-transformers==3.0.1
285
+ sentencepiece==0.1.99
286
+ sentry-sdk==2.12.0
287
+ setproctitle==1.3.3
288
+ setuptools==68.2.2
289
+ shellingham==1.5.4
290
+ six==1.16.0
291
+ smart-open==6.4.0
292
+ smmap==5.0.1
293
+ sniffio==1.3.1
294
+ sortedcontainers==2.4.0
295
+ soundfile==0.12.1
296
+ soupsieve==2.5
297
+ soxr==0.3.7
298
+ spacy-legacy==3.0.12
299
+ spacy-loggers==1.0.5
300
+ spacy==3.7.2
301
+ sphinx-glpi-theme==0.6
302
+ sqlalchemy==2.0.32
303
+ sqlitedict==2.1.0
304
+ srsly==2.4.8
305
+ stack-data==0.6.3
306
+ sumeval==0.2.2
307
+ sympy==1.12
308
+ tabledata==1.3.3
309
+ tabulate==0.9.0
310
+ tbb==2021.11.0
311
+ tblib==3.0.0
312
+ tcolorpy==0.1.6
313
+ tenacity==8.5.0
314
+ tensorboard-data-server==0.6.1
315
+ tensorboard-plugin-wit==1.8.1
316
+ tensorboard==2.9.0
317
+ tensorrt==8.6.3
318
+ terminado==0.18.0
319
+ termplotlib==0.3.9
320
+ text-generation==0.7.0
321
+ thinc==8.2.3
322
+ threadpoolctl==3.2.0
323
+ thriftpy2==0.4.17
324
+ tiktoken==0.7.0
325
+ tinycss2==1.2.1
326
+ tokenizers==0.19.1
327
+ toml==0.10.2
328
+ tomli==2.0.1
329
+ tomlkit==0.13.2
330
+ toolz==0.12.1
331
+ torch-tensorrt==2.3.0a0
332
+ torch==2.3.0a0+ebedce2
333
+ torchdata==0.7.1a0
334
+ torchmetrics==0.10.3
335
+ torchtext==0.17.0a0
336
+ torchvision==0.18.0a0
337
+ tornado==6.4
338
+ tqdm-multiprocess==0.0.11
339
+ tqdm==4.66.5
340
+ traitlets==5.9.0
341
+ transformer-engine==1.3.0+5b90b7f
342
+ transformers==4.43.3
343
+ treelite-runtime==3.9.1
344
+ treelite==3.9.1
345
+ triton==2.2.0+e28a256
346
+ trove-classifiers==2024.7.2
347
+ typepy==1.3.2
348
+ typer==0.9.0
349
+ types-dataclasses==0.6.6
350
+ typing-extensions==4.12.2
351
+ typing-inspect==0.9.0
352
+ tzdata==2024.1
353
+ ucx-py==0.35.0
354
+ uff==0.6.9
355
+ ujson==5.8.0
356
+ unbabel-comet==2.2.2
357
+ unidic-lite==1.0.8
358
+ urllib3==1.26.18
359
+ virtualenv==20.26.3
360
+ wandb==0.16.3
361
+ wasabi==1.1.2
362
+ wcwidth==0.2.13
363
+ weasel==0.3.4
364
+ webencodings==0.5.1
365
+ werkzeug==3.0.1
366
+ wheel==0.42.0
367
+ word2number==1.1
368
+ xdoctest==1.0.2
369
+ xgboost==1.7.6
370
+ xmltodict==0.13.0
371
+ xxhash==3.4.1
372
+ yarl==1.9.4
373
+ zict==3.0.0
374
+ zipp==3.17.0
375
+ zstandard==0.23.0
wandb/run-20240823_162543-eroprw00/files/wandb-metadata.json ADDED
@@ -0,0 +1,220 @@
1
+ {
2
+ "os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
3
+ "python": "3.10.12",
4
+ "heartbeatAt": "2024-08-23T07:25:43.758914",
5
+ "startedAt": "2024-08-23T07:25:43.187250",
6
+ "docker": null,
7
+ "cuda": null,
8
+ "args": [
9
+ "--seq-length",
10
+ "4096",
11
+ "--sliding-window-size",
12
+ "131072",
13
+ "--micro-batch-size",
14
+ "5",
15
+ "--valid_micro_batch_size",
16
+ "1",
17
+ "--global-batch-size",
18
+ "640",
19
+ "--train-iters",
20
+ "7500",
21
+ "--tokenizer-type",
22
+ "HFPreTrainedTokenizer",
23
+ "--tokenizer-model",
24
+ "/share/pretrained_lm/Qwen/Qwen2-0.5B",
25
+ "--train-data-path",
26
+ "1754785366",
27
+ "/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document",
28
+ "28623823675",
29
+ "/project/datas/llm-jp-corpus-v2/ja-cc/level0/data_text_document",
30
+ "--valid-data-path",
31
+ "1754785366",
32
+ "/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document",
33
+ "--test-data-path",
34
+ "1754785366",
35
+ "/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document",
36
+ "--lr",
37
+ "2e-5",
38
+ "--min-lr",
39
+ "1e-6",
40
+ "--lr-decay-style",
41
+ "cosine",
42
+ "--lr-warmup-iters",
43
+ "500",
44
+ "--lr-decay-iters",
45
+ "7500",
46
+ "--weight-decay",
47
+ "0.1",
48
+ "--grad-clip-norm",
49
+ "1.0",
50
+ "--optimizer",
51
+ "anyprecision",
52
+ "--adam-beta1",
53
+ "0.9",
54
+ "--adam-beta2",
55
+ "0.95",
56
+ "--adam-eps",
57
+ "1e-6",
58
+ "--save-interval",
59
+ "10",
60
+ "--eval-interval",
61
+ "10",
62
+ "--eval-iters",
63
+ "10",
64
+ "--bf16",
65
+ "--mixed-precision",
66
+ "--base-model",
67
+ "/share/pretrained_lm/Qwen/Qwen2-0.5B",
68
+ "--save",
69
+ "/work/llm_recipes/models/Qwen2-0.5b-0.2",
70
+ "--load",
71
+ "/work/llm_recipes/models/Qwen2-0.5b-0.2",
72
+ "--fsdp-activation-checkpointing",
73
+ "--sharding-strategy",
74
+ "FULL_SHARD",
75
+ "--checkpoint-type",
76
+ "LOCAL_STATE_DICT",
77
+ "--save-n-checkpoints",
78
+ "10",
79
+ "--upload-all-checkpoints-to-hf",
80
+ "--hf-upload-retry-limit",
81
+ "2",
82
+ "--hf-repo-id",
83
+ "koichi12/Qwen2-0.5b-0.2",
84
+ "--wandb-entity",
85
+ "iwakawa-koichi-q5-tohoku-nlp6723",
86
+ "--wandb-project",
87
+ "llm_tutorial-0.2",
88
+ "--wandb-name",
89
+ "Qwen2-0.5b-0.2_train_2024-08-23-16:25:30"
90
+ ],
91
+ "state": "running",
92
+ "program": "/project/examples/finetuning.py",
93
+ "codePathLocal": "examples/finetuning.py",
94
+ "codePath": "examples/finetuning.py",
95
+ "git": {
96
+ "remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
97
+ "commit": "887a2cc5d104c10264701f95cbbb0a6a116768d6"
98
+ },
99
+ "email": null,
100
+ "root": "/project",
101
+ "host": "gpu-koiwa-00",
102
+ "username": "koiwa",
103
+ "executable": "/usr/bin/python",
104
+ "cpu_count": 18,
105
+ "cpu_count_logical": 18,
106
+ "cpu_freq": {
107
+ "current": 2400.0389999999993,
108
+ "min": 0.0,
109
+ "max": 0.0
110
+ },
111
+ "cpu_freq_per_core": [
112
+ {
113
+ "current": 2400.039,
114
+ "min": 0.0,
115
+ "max": 0.0
116
+ },
117
+ {
118
+ "current": 2400.039,
119
+ "min": 0.0,
120
+ "max": 0.0
121
+ },
122
+ {
123
+ "current": 2400.039,
124
+ "min": 0.0,
125
+ "max": 0.0
126
+ },
127
+ {
128
+ "current": 2400.039,
129
+ "min": 0.0,
130
+ "max": 0.0
131
+ },
132
+ {
133
+ "current": 2400.039,
134
+ "min": 0.0,
135
+ "max": 0.0
136
+ },
137
+ {
138
+ "current": 2400.039,
139
+ "min": 0.0,
140
+ "max": 0.0
141
+ },
142
+ {
143
+ "current": 2400.039,
144
+ "min": 0.0,
145
+ "max": 0.0
146
+ },
147
+ {
148
+ "current": 2400.039,
149
+ "min": 0.0,
150
+ "max": 0.0
151
+ },
152
+ {
153
+ "current": 2400.039,
154
+ "min": 0.0,
155
+ "max": 0.0
156
+ },
157
+ {
158
+ "current": 2400.039,
159
+ "min": 0.0,
160
+ "max": 0.0
161
+ },
162
+ {
163
+ "current": 2400.039,
164
+ "min": 0.0,
165
+ "max": 0.0
166
+ },
167
+ {
168
+ "current": 2400.039,
169
+ "min": 0.0,
170
+ "max": 0.0
171
+ },
172
+ {
173
+ "current": 2400.039,
174
+ "min": 0.0,
175
+ "max": 0.0
176
+ },
177
+ {
178
+ "current": 2400.039,
179
+ "min": 0.0,
180
+ "max": 0.0
181
+ },
182
+ {
183
+ "current": 2400.039,
184
+ "min": 0.0,
185
+ "max": 0.0
186
+ },
187
+ {
188
+ "current": 2400.039,
189
+ "min": 0.0,
190
+ "max": 0.0
191
+ },
192
+ {
193
+ "current": 2400.039,
194
+ "min": 0.0,
195
+ "max": 0.0
196
+ },
197
+ {
198
+ "current": 2400.039,
199
+ "min": 0.0,
200
+ "max": 0.0
201
+ }
202
+ ],
203
+ "disk": {
204
+ "/": {
205
+ "total": 0.0625,
206
+ "used": 1.1444091796875e-05
207
+ }
208
+ },
209
+ "gpu": "NVIDIA A100-SXM4-40GB",
210
+ "gpu_count": 1,
211
+ "gpu_devices": [
212
+ {
213
+ "name": "NVIDIA A100-SXM4-40GB",
214
+ "memory_total": 42949672960
215
+ }
216
+ ],
217
+ "memory": {
218
+ "total": 56.487831115722656
219
+ }
220
+ }
wandb/run-20240823_162543-eroprw00/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
1
+ {"_wandb": {"runtime": 8}}
wandb/run-20240823_162543-eroprw00/logs/debug-internal.log ADDED
@@ -0,0 +1,188 @@
1
+ 2024-08-23 16:25:43,204 INFO StreamThr :11284 [internal.py:wandb_internal():86] W&B internal server running at pid: 11284, started at: 2024-08-23 16:25:43.204013
2
+ 2024-08-23 16:25:43,206 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: status
3
+ 2024-08-23 16:25:43,207 INFO WriterThread:11284 [datastore.py:open_for_write():87] open: /project/wandb/run-20240823_162543-eroprw00/run-eroprw00.wandb
4
+ 2024-08-23 16:25:43,208 DEBUG SenderThread:11284 [sender.py:send():382] send: header
5
+ 2024-08-23 16:25:43,222 DEBUG SenderThread:11284 [sender.py:send():382] send: run
6
+ 2024-08-23 16:25:43,662 INFO SenderThread:11284 [dir_watcher.py:__init__():211] watching files in: /project/wandb/run-20240823_162543-eroprw00/files
7
+ 2024-08-23 16:25:43,662 INFO SenderThread:11284 [sender.py:_start_run_threads():1136] run started: eroprw00 with start time 1724397943.202675
8
+ 2024-08-23 16:25:43,667 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: check_version
9
+ 2024-08-23 16:25:43,668 DEBUG SenderThread:11284 [sender.py:send_request():409] send_request: check_version
10
+ 2024-08-23 16:25:43,739 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: run_start
11
+ 2024-08-23 16:25:43,746 DEBUG HandlerThread:11284 [system_info.py:__init__():27] System info init
12
+ 2024-08-23 16:25:43,746 DEBUG HandlerThread:11284 [system_info.py:__init__():42] System info init done
13
+ 2024-08-23 16:25:43,746 INFO HandlerThread:11284 [system_monitor.py:start():194] Starting system monitor
14
+ 2024-08-23 16:25:43,746 INFO SystemMonitor:11284 [system_monitor.py:_start():158] Starting system asset monitoring threads
15
+ 2024-08-23 16:25:43,746 INFO HandlerThread:11284 [system_monitor.py:probe():214] Collecting system info
16
+ 2024-08-23 16:25:43,746 INFO SystemMonitor:11284 [interfaces.py:start():190] Started cpu monitoring
17
+ 2024-08-23 16:25:43,747 INFO SystemMonitor:11284 [interfaces.py:start():190] Started disk monitoring
18
+ 2024-08-23 16:25:43,747 INFO SystemMonitor:11284 [interfaces.py:start():190] Started gpu monitoring
19
+ 2024-08-23 16:25:43,748 INFO SystemMonitor:11284 [interfaces.py:start():190] Started memory monitoring
20
+ 2024-08-23 16:25:43,749 INFO SystemMonitor:11284 [interfaces.py:start():190] Started network monitoring
21
+ 2024-08-23 16:25:43,758 DEBUG HandlerThread:11284 [system_info.py:probe():151] Probing system
22
+ 2024-08-23 16:25:43,760 DEBUG HandlerThread:11284 [system_info.py:_probe_git():136] Probing git
23
+ 2024-08-23 16:25:43,773 DEBUG HandlerThread:11284 [system_info.py:_probe_git():144] Probing git done
24
+ 2024-08-23 16:25:43,773 DEBUG HandlerThread:11284 [system_info.py:probe():199] Probing system done
25
+ 2024-08-23 16:25:43,773 DEBUG HandlerThread:11284 [system_monitor.py:probe():223] {'os': 'Linux-5.15.0-91-generic-x86_64-with-glibc2.35', 'python': '3.10.12', 'heartbeatAt': '2024-08-23T07:25:43.758914', 'startedAt': '2024-08-23T07:25:43.187250', 'docker': None, 'cuda': None, 'args': ('--seq-length', '4096', '--sliding-window-size', '131072', '--micro-batch-size', '5', '--valid_micro_batch_size', '1', '--global-batch-size', '640', '--train-iters', '7500', '--tokenizer-type', 'HFPreTrainedTokenizer', '--tokenizer-model', '/share/pretrained_lm/Qwen/Qwen2-0.5B', '--train-data-path', '1754785366', '/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document', '28623823675', '/project/datas/llm-jp-corpus-v2/ja-cc/level0/data_text_document', '--valid-data-path', '1754785366', '/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document', '--test-data-path', '1754785366', '/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document', '--lr', '2e-5', '--min-lr', '1e-6', '--lr-decay-style', 'cosine', '--lr-warmup-iters', '500', '--lr-decay-iters', '7500', '--weight-decay', '0.1', '--grad-clip-norm', '1.0', '--optimizer', 'anyprecision', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-6', '--save-interval', '10', '--eval-interval', '10', '--eval-iters', '10', '--bf16', '--mixed-precision', '--base-model', '/share/pretrained_lm/Qwen/Qwen2-0.5B', '--save', '/work/llm_recipes/models/Qwen2-0.5b-0.2', '--load', '/work/llm_recipes/models/Qwen2-0.5b-0.2', '--fsdp-activation-checkpointing', '--sharding-strategy', 'FULL_SHARD', '--checkpoint-type', 'LOCAL_STATE_DICT', '--save-n-checkpoints', '10', '--upload-all-checkpoints-to-hf', '--hf-upload-retry-limit', '2', '--hf-repo-id', 'koichi12/Qwen2-0.5b-0.2', '--wandb-entity', 'iwakawa-koichi-q5-tohoku-nlp6723', '--wandb-project', 'llm_tutorial-0.2', '--wandb-name', 'Qwen2-0.5b-0.2_train_2024-08-23-16:25:30'), 'state': 'running', 'program': '/project/examples/finetuning.py', 'codePathLocal': 'examples/finetuning.py', 'codePath': 'examples/finetuning.py', 'git': {'remote': 'https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git', 'commit': '887a2cc5d104c10264701f95cbbb0a6a116768d6'}, 'email': None, 'root': '/project', 'host': 'gpu-koiwa-00', 'username': 'koiwa', 'executable': '/usr/bin/python', 'cpu_count': 18, 'cpu_count_logical': 18, 'cpu_freq': {'current': 2400.0389999999993, 'min': 0.0, 'max': 0.0}, 'cpu_freq_per_core': [{'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}], 'disk': {'/': {'total': 0.0625, 'used': 1.1444091796875e-05}}, 'gpu': 'NVIDIA A100-SXM4-40GB', 'gpu_count': 1, 'gpu_devices': [{'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}], 'memory': {'total': 56.487831115722656}}
26
+ 2024-08-23 16:25:43,773 INFO HandlerThread:11284 [system_monitor.py:probe():224] Finished collecting system info
27
+ 2024-08-23 16:25:43,773 INFO HandlerThread:11284 [system_monitor.py:probe():227] Publishing system info
28
+ 2024-08-23 16:25:43,774 INFO HandlerThread:11284 [system_monitor.py:probe():229] Finished publishing system info
29
+ 2024-08-23 16:25:43,780 DEBUG SenderThread:11284 [sender.py:send():382] send: files
30
+ 2024-08-23 16:25:43,780 INFO SenderThread:11284 [sender.py:_save_file():1403] saving file wandb-metadata.json with policy now
31
+ 2024-08-23 16:25:43,791 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: python_packages
32
+ 2024-08-23 16:25:43,791 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: internal_messages
33
+ 2024-08-23 16:25:43,792 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: stop_status
34
+ 2024-08-23 16:25:43,792 DEBUG SenderThread:11284 [sender.py:send_request():409] send_request: python_packages
35
+ 2024-08-23 16:25:43,794 DEBUG SenderThread:11284 [sender.py:send_request():409] send_request: stop_status
36
+ 2024-08-23 16:25:44,074 DEBUG SenderThread:11284 [sender.py:send():382] send: telemetry
37
+ 2024-08-23 16:25:44,478 INFO wandb-upload_0:11284 [upload_job.py:push():131] Uploaded file /tmp/tmpn8dztdufwandb/9bfyl56b-wandb-metadata.json
38
+ 2024-08-23 16:25:44,664 INFO Thread-12 :11284 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240823_162543-eroprw00/files/wandb-metadata.json
39
+ 2024-08-23 16:25:44,664 INFO Thread-12 :11284 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240823_162543-eroprw00/files/requirements.txt
40
+ 2024-08-23 16:25:44,664 INFO Thread-12 :11284 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240823_162543-eroprw00/files/output.log
41
+ 2024-08-23 16:25:46,664 INFO Thread-12 :11284 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_162543-eroprw00/files/output.log
42
+ 2024-08-23 16:25:48,665 INFO Thread-12 :11284 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_162543-eroprw00/files/output.log
43
+ 2024-08-23 16:25:49,201 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: status_report
44
+ 2024-08-23 16:25:50,667 INFO Thread-12 :11284 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_162543-eroprw00/files/output.log
45
+ 2024-08-23 16:25:51,139 DEBUG SenderThread:11284 [sender.py:send():382] send: config
46
+ 2024-08-23 16:25:51,140 DEBUG SenderThread:11284 [sender.py:send():382] send: config
47
+ 2024-08-23 16:25:52,592 DEBUG SenderThread:11284 [sender.py:send():382] send: exit
48
+ 2024-08-23 16:25:52,592 INFO SenderThread:11284 [sender.py:send_exit():589] handling exit code: 1
49
+ 2024-08-23 16:25:52,592 INFO SenderThread:11284 [sender.py:send_exit():591] handling runtime: 8
50
+ 2024-08-23 16:25:52,593 INFO SenderThread:11284 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
51
+ 2024-08-23 16:25:52,594 INFO SenderThread:11284 [sender.py:send_exit():597] send defer
52
+ 2024-08-23 16:25:52,594 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: defer
53
+ 2024-08-23 16:25:52,594 INFO HandlerThread:11284 [handler.py:handle_request_defer():172] handle defer: 0
54
+ 2024-08-23 16:25:52,594 DEBUG SenderThread:11284 [sender.py:send_request():409] send_request: defer
55
+ 2024-08-23 16:25:52,594 INFO SenderThread:11284 [sender.py:send_request_defer():613] handle sender defer: 0
56
+ 2024-08-23 16:25:52,594 INFO SenderThread:11284 [sender.py:transition_state():617] send defer: 1
57
+ 2024-08-23 16:25:52,594 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: defer
58
+ 2024-08-23 16:25:52,594 INFO HandlerThread:11284 [handler.py:handle_request_defer():172] handle defer: 1
59
+ 2024-08-23 16:25:52,595 DEBUG SenderThread:11284 [sender.py:send_request():409] send_request: defer
60
+ 2024-08-23 16:25:52,595 INFO SenderThread:11284 [sender.py:send_request_defer():613] handle sender defer: 1
61
+ 2024-08-23 16:25:52,595 INFO SenderThread:11284 [sender.py:transition_state():617] send defer: 2
62
+ 2024-08-23 16:25:52,595 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: defer
63
+ 2024-08-23 16:25:52,595 INFO HandlerThread:11284 [handler.py:handle_request_defer():172] handle defer: 2
64
+ 2024-08-23 16:25:52,595 INFO HandlerThread:11284 [system_monitor.py:finish():203] Stopping system monitor
65
+ 2024-08-23 16:25:52,595 DEBUG SystemMonitor:11284 [system_monitor.py:_start():172] Starting system metrics aggregation loop
66
+ 2024-08-23 16:25:52,595 INFO HandlerThread:11284 [interfaces.py:finish():202] Joined cpu monitor
67
+ 2024-08-23 16:25:52,595 DEBUG SystemMonitor:11284 [system_monitor.py:_start():179] Finished system metrics aggregation loop
68
+ 2024-08-23 16:25:52,595 INFO HandlerThread:11284 [interfaces.py:finish():202] Joined disk monitor
69
+ 2024-08-23 16:25:52,596 DEBUG SystemMonitor:11284 [system_monitor.py:_start():183] Publishing last batch of metrics
70
+ 2024-08-23 16:25:52,629 INFO HandlerThread:11284 [interfaces.py:finish():202] Joined gpu monitor
71
+ 2024-08-23 16:25:52,629 INFO HandlerThread:11284 [interfaces.py:finish():202] Joined memory monitor
72
+ 2024-08-23 16:25:52,629 INFO HandlerThread:11284 [interfaces.py:finish():202] Joined network monitor
73
+ 2024-08-23 16:25:52,629 DEBUG SenderThread:11284 [sender.py:send_request():409] send_request: defer
74
+ 2024-08-23 16:25:52,629 INFO SenderThread:11284 [sender.py:send_request_defer():613] handle sender defer: 2
75
+ 2024-08-23 16:25:52,629 INFO SenderThread:11284 [sender.py:transition_state():617] send defer: 3
76
+ 2024-08-23 16:25:52,629 DEBUG SenderThread:11284 [sender.py:send():382] send: stats
77
+ 2024-08-23 16:25:52,629 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: defer
78
+ 2024-08-23 16:25:52,630 INFO HandlerThread:11284 [handler.py:handle_request_defer():172] handle defer: 3
79
+ 2024-08-23 16:25:52,630 DEBUG SenderThread:11284 [sender.py:send_request():409] send_request: defer
80
+ 2024-08-23 16:25:52,630 INFO SenderThread:11284 [sender.py:send_request_defer():613] handle sender defer: 3
81
+ 2024-08-23 16:25:52,630 INFO SenderThread:11284 [sender.py:transition_state():617] send defer: 4
82
+ 2024-08-23 16:25:52,630 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: defer
83
+ 2024-08-23 16:25:52,630 INFO HandlerThread:11284 [handler.py:handle_request_defer():172] handle defer: 4
84
+ 2024-08-23 16:25:52,630 DEBUG SenderThread:11284 [sender.py:send_request():409] send_request: defer
85
+ 2024-08-23 16:25:52,630 INFO SenderThread:11284 [sender.py:send_request_defer():613] handle sender defer: 4
86
+ 2024-08-23 16:25:52,630 INFO SenderThread:11284 [sender.py:transition_state():617] send defer: 5
87
+ 2024-08-23 16:25:52,630 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: defer
88
+ 2024-08-23 16:25:52,631 INFO HandlerThread:11284 [handler.py:handle_request_defer():172] handle defer: 5
89
+ 2024-08-23 16:25:52,631 DEBUG SenderThread:11284 [sender.py:send():382] send: summary
90
+ 2024-08-23 16:25:52,632 INFO SenderThread:11284 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
91
+ 2024-08-23 16:25:52,632 DEBUG SenderThread:11284 [sender.py:send_request():409] send_request: defer
92
+ 2024-08-23 16:25:52,632 INFO SenderThread:11284 [sender.py:send_request_defer():613] handle sender defer: 5
93
+ 2024-08-23 16:25:52,632 INFO SenderThread:11284 [sender.py:transition_state():617] send defer: 6
94
+ 2024-08-23 16:25:52,632 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: defer
95
+ 2024-08-23 16:25:52,632 INFO HandlerThread:11284 [handler.py:handle_request_defer():172] handle defer: 6
96
+ 2024-08-23 16:25:52,632 DEBUG SenderThread:11284 [sender.py:send_request():409] send_request: defer
97
+ 2024-08-23 16:25:52,632 INFO SenderThread:11284 [sender.py:send_request_defer():613] handle sender defer: 6
98
+ 2024-08-23 16:25:52,635 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: status_report
99
+ 2024-08-23 16:25:52,668 INFO Thread-12 :11284 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_162543-eroprw00/files/output.log
100
+ 2024-08-23 16:25:52,668 INFO Thread-12 :11284 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240823_162543-eroprw00/files/wandb-summary.json
101
+ 2024-08-23 16:25:52,831 INFO SenderThread:11284 [sender.py:transition_state():617] send defer: 7
102
+ 2024-08-23 16:25:52,831 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: defer
103
+ 2024-08-23 16:25:52,831 INFO HandlerThread:11284 [handler.py:handle_request_defer():172] handle defer: 7
104
+ 2024-08-23 16:25:52,831 DEBUG SenderThread:11284 [sender.py:send_request():409] send_request: defer
105
+ 2024-08-23 16:25:52,831 INFO SenderThread:11284 [sender.py:send_request_defer():613] handle sender defer: 7
106
+ 2024-08-23 16:25:53,592 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: poll_exit
107
+ 2024-08-23 16:25:53,669 INFO Thread-12 :11284 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_162543-eroprw00/files/config.yaml
108
+ 2024-08-23 16:25:54,373 INFO SenderThread:11284 [sender.py:transition_state():617] send defer: 8
109
+ 2024-08-23 16:25:54,374 DEBUG SenderThread:11284 [sender.py:send_request():409] send_request: poll_exit
110
+ 2024-08-23 16:25:54,374 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: defer
111
+ 2024-08-23 16:25:54,374 INFO HandlerThread:11284 [handler.py:handle_request_defer():172] handle defer: 8
112
+ 2024-08-23 16:25:54,374 DEBUG SenderThread:11284 [sender.py:send_request():409] send_request: defer
113
+ 2024-08-23 16:25:54,374 INFO SenderThread:11284 [sender.py:send_request_defer():613] handle sender defer: 8
114
+ 2024-08-23 16:25:54,374 INFO SenderThread:11284 [job_builder.py:build():296] Attempting to build job artifact
115
+ 2024-08-23 16:25:54,375 INFO SenderThread:11284 [job_builder.py:_get_source_type():426] is repo sourced job
116
+ 2024-08-23 16:25:54,389 INFO SenderThread:11284 [job_builder.py:build():402] adding wandb-job metadata file
117
+ 2024-08-23 16:25:54,398 INFO SenderThread:11284 [sender.py:transition_state():617] send defer: 9
118
+ 2024-08-23 16:25:54,398 DEBUG SenderThread:11284 [sender.py:send():382] send: artifact
119
+ 2024-08-23 16:25:54,398 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: defer
120
+ 2024-08-23 16:25:54,399 INFO HandlerThread:11284 [handler.py:handle_request_defer():172] handle defer: 9
121
+ 2024-08-23 16:25:54,593 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: poll_exit
122
+ 2024-08-23 16:25:54,670 INFO Thread-12 :11284 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_162543-eroprw00/files/output.log
123
+ 2024-08-23 16:25:55,372 INFO SenderThread:11284 [sender.py:send_artifact():1494] sent artifact job-https___github.com_cl-tohoku_llm-recipes-failab-m1-yans.git_examples_finetuning.py - {'id': 'QXJ0aWZhY3Q6MTE2MTk5MDU4OQ==', 'state': 'COMMITTED', 'artifactSequence': {'id': 'QXJ0aWZhY3RDb2xsZWN0aW9uOjQxNjQ1ODQ1MA==', 'latestArtifact': {'id': 'QXJ0aWZhY3Q6MTE2MTk5MDU4OQ==', 'versionIndex': 2}}}
124
+ 2024-08-23 16:25:55,372 DEBUG SenderThread:11284 [sender.py:send_request():409] send_request: defer
125
+ 2024-08-23 16:25:55,372 INFO SenderThread:11284 [sender.py:send_request_defer():613] handle sender defer: 9
126
+ 2024-08-23 16:25:55,372 INFO SenderThread:11284 [dir_watcher.py:finish():358] shutting down directory watcher
127
+ 2024-08-23 16:25:55,671 INFO SenderThread:11284 [dir_watcher.py:finish():388] scan: /project/wandb/run-20240823_162543-eroprw00/files
128
+ 2024-08-23 16:25:55,671 INFO SenderThread:11284 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240823_162543-eroprw00/files/requirements.txt requirements.txt
129
+ 2024-08-23 16:25:55,671 INFO SenderThread:11284 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240823_162543-eroprw00/files/config.yaml config.yaml
130
+ 2024-08-23 16:25:55,673 INFO SenderThread:11284 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240823_162543-eroprw00/files/wandb-metadata.json wandb-metadata.json
131
+ 2024-08-23 16:25:55,673 INFO SenderThread:11284 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240823_162543-eroprw00/files/wandb-summary.json wandb-summary.json
132
+ 2024-08-23 16:25:55,674 INFO SenderThread:11284 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240823_162543-eroprw00/files/output.log output.log
133
+ 2024-08-23 16:25:55,676 INFO SenderThread:11284 [sender.py:transition_state():617] send defer: 10
134
+ 2024-08-23 16:25:55,676 DEBUG SenderThread:11284 [sender.py:send_request():409] send_request: poll_exit
135
+ 2024-08-23 16:25:55,676 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: defer
136
+ 2024-08-23 16:25:55,677 INFO HandlerThread:11284 [handler.py:handle_request_defer():172] handle defer: 10
137
+ 2024-08-23 16:25:55,678 DEBUG SenderThread:11284 [sender.py:send_request():409] send_request: defer
138
+ 2024-08-23 16:25:55,678 INFO SenderThread:11284 [sender.py:send_request_defer():613] handle sender defer: 10
139
+ 2024-08-23 16:25:55,678 INFO SenderThread:11284 [file_pusher.py:finish():172] shutting down file pusher
140
+ 2024-08-23 16:25:56,071 INFO wandb-upload_0:11284 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240823_162543-eroprw00/files/requirements.txt
141
+ 2024-08-23 16:25:56,117 INFO wandb-upload_1:11284 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240823_162543-eroprw00/files/config.yaml
142
+ 2024-08-23 16:25:56,151 INFO wandb-upload_3:11284 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240823_162543-eroprw00/files/output.log
+ 2024-08-23 16:25:56,152 INFO wandb-upload_2:11284 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240823_162543-eroprw00/files/wandb-summary.json
+ 2024-08-23 16:25:56,353 INFO Thread-11 (_thread_body):11284 [sender.py:transition_state():617] send defer: 11
+ 2024-08-23 16:25:56,353 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: defer
+ 2024-08-23 16:25:56,353 INFO HandlerThread:11284 [handler.py:handle_request_defer():172] handle defer: 11
+ 2024-08-23 16:25:56,353 DEBUG SenderThread:11284 [sender.py:send_request():409] send_request: defer
+ 2024-08-23 16:25:56,353 INFO SenderThread:11284 [sender.py:send_request_defer():613] handle sender defer: 11
+ 2024-08-23 16:25:56,353 INFO SenderThread:11284 [file_pusher.py:join():178] waiting for file pusher
+ 2024-08-23 16:25:56,353 INFO SenderThread:11284 [sender.py:transition_state():617] send defer: 12
+ 2024-08-23 16:25:56,354 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: defer
+ 2024-08-23 16:25:56,354 INFO HandlerThread:11284 [handler.py:handle_request_defer():172] handle defer: 12
+ 2024-08-23 16:25:56,354 DEBUG SenderThread:11284 [sender.py:send_request():409] send_request: defer
+ 2024-08-23 16:25:56,354 INFO SenderThread:11284 [sender.py:send_request_defer():613] handle sender defer: 12
+ 2024-08-23 16:25:56,354 INFO SenderThread:11284 [file_stream.py:finish():595] file stream finish called
+ 2024-08-23 16:25:56,522 INFO SenderThread:11284 [file_stream.py:finish():599] file stream finish is done
+ 2024-08-23 16:25:56,522 INFO SenderThread:11284 [sender.py:transition_state():617] send defer: 13
+ 2024-08-23 16:25:56,523 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: defer
+ 2024-08-23 16:25:56,523 INFO HandlerThread:11284 [handler.py:handle_request_defer():172] handle defer: 13
+ 2024-08-23 16:25:56,523 DEBUG SenderThread:11284 [sender.py:send_request():409] send_request: defer
+ 2024-08-23 16:25:56,523 INFO SenderThread:11284 [sender.py:send_request_defer():613] handle sender defer: 13
+ 2024-08-23 16:25:56,523 INFO SenderThread:11284 [sender.py:transition_state():617] send defer: 14
+ 2024-08-23 16:25:56,523 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: defer
+ 2024-08-23 16:25:56,523 DEBUG SenderThread:11284 [sender.py:send():382] send: final
+ 2024-08-23 16:25:56,523 INFO HandlerThread:11284 [handler.py:handle_request_defer():172] handle defer: 14
+ 2024-08-23 16:25:56,523 DEBUG SenderThread:11284 [sender.py:send():382] send: footer
+ 2024-08-23 16:25:56,524 DEBUG SenderThread:11284 [sender.py:send_request():409] send_request: defer
+ 2024-08-23 16:25:56,524 INFO SenderThread:11284 [sender.py:send_request_defer():613] handle sender defer: 14
+ 2024-08-23 16:25:56,524 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: poll_exit
+ 2024-08-23 16:25:56,524 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: poll_exit
+ 2024-08-23 16:25:56,524 DEBUG SenderThread:11284 [sender.py:send_request():409] send_request: poll_exit
+ 2024-08-23 16:25:56,525 DEBUG SenderThread:11284 [sender.py:send_request():409] send_request: poll_exit
+ 2024-08-23 16:25:56,525 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: server_info
+ 2024-08-23 16:25:56,525 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: get_summary
+ 2024-08-23 16:25:56,525 DEBUG SenderThread:11284 [sender.py:send_request():409] send_request: server_info
+ 2024-08-23 16:25:56,527 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: sampled_history
+ 2024-08-23 16:25:56,527 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: internal_messages
+ 2024-08-23 16:25:56,527 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: job_info
+ 2024-08-23 16:25:56,684 DEBUG SenderThread:11284 [sender.py:send_request():409] send_request: job_info
+ 2024-08-23 16:25:56,684 INFO MainThread:11284 [wandb_run.py:_footer_history_summary_info():3866] rendering history
+ 2024-08-23 16:25:56,685 INFO MainThread:11284 [wandb_run.py:_footer_history_summary_info():3898] rendering summary
+ 2024-08-23 16:25:56,685 INFO MainThread:11284 [wandb_run.py:_footer_sync_info():3825] logging synced files
+ 2024-08-23 16:25:56,685 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: shutdown
+ 2024-08-23 16:25:56,685 INFO HandlerThread:11284 [handler.py:finish():869] shutting down handler
+ 2024-08-23 16:25:57,528 INFO WriterThread:11284 [datastore.py:close():296] close: /project/wandb/run-20240823_162543-eroprw00/run-eroprw00.wandb
+ 2024-08-23 16:25:57,685 INFO SenderThread:11284 [sender.py:finish():1572] shutting down sender
+ 2024-08-23 16:25:57,685 INFO SenderThread:11284 [file_pusher.py:finish():172] shutting down file pusher
+ 2024-08-23 16:25:57,685 INFO SenderThread:11284 [file_pusher.py:join():178] waiting for file pusher
wandb/run-20240823_162543-eroprw00/logs/debug.log ADDED
@@ -0,0 +1,30 @@
+ 2024-08-23 16:25:43,196 INFO MainThread:11213 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
+ 2024-08-23 16:25:43,196 INFO MainThread:11213 [wandb_setup.py:_flush():76] Configure stats pid to 11213
+ 2024-08-23 16:25:43,196 INFO MainThread:11213 [wandb_setup.py:_flush():76] Loading settings from /singularity_home/.config/wandb/settings
+ 2024-08-23 16:25:43,196 INFO MainThread:11213 [wandb_setup.py:_flush():76] Loading settings from /project/wandb/settings
+ 2024-08-23 16:25:43,196 INFO MainThread:11213 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'api_key': '***REDACTED***', 'run_notes': 'Train sample'}
+ 2024-08-23 16:25:43,196 INFO MainThread:11213 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
+ 2024-08-23 16:25:43,196 INFO MainThread:11213 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'examples/finetuning.py', 'program_abspath': '/project/examples/finetuning.py', 'program': '/project/examples/finetuning.py'}
+ 2024-08-23 16:25:43,196 INFO MainThread:11213 [wandb_init.py:_log_setup():526] Logging user logs to /project/wandb/run-20240823_162543-eroprw00/logs/debug.log
+ 2024-08-23 16:25:43,196 INFO MainThread:11213 [wandb_init.py:_log_setup():527] Logging internal logs to /project/wandb/run-20240823_162543-eroprw00/logs/debug-internal.log
+ 2024-08-23 16:25:43,196 INFO MainThread:11213 [wandb_init.py:init():566] calling init triggers
+ 2024-08-23 16:25:43,197 INFO MainThread:11213 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
+ config: {'sharding_strategy': 'FULL_SHARD', 'checkpoint_type': 'LOCAL_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'low_cpu_fsdp': False, 'no_meta_device': False, 'data_path': None, 'split': '969, 30, 1', 'train_data_path': ['1754785366', '/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document', '28623823675', '/project/datas/llm-jp-corpus-v2/ja-cc/level0/data_text_document'], 'valid_data_path': ['1754785366', '/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document'], 'test_data_path': ['1754785366', '/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document'], 'data_cache_path': None, 'vocab_size': None, 'vocab_file': None, 'merge_file': None, 'seq_length': 4096, 'num_workers': 2, 'tokenizer_type': 'HFPreTrainedTokenizer', 'tokenizer_model': '/share/pretrained_lm/Qwen/Qwen2-0.5B', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'retro_return_doc_ids': False, 'short_seq_prob': 0.1, 'vocab_extra_ids': 0, 'seed': 1234, 'use_mpi': False, 'wandb_entity': 'iwakawa-koichi-q5-tohoku-nlp6723', 'wandb_name': 'Qwen2-0.5b-0.2_train_2024-08-23-16:25:30', 'wandb_project': 'llm_tutorial-0.2', 'quantization': False, 'use_freeze_layers': False, 'freeze_layers': None, 'bf16': True, 'fp16': False, 'mixed_precision': True, 'param_dtype': None, 'load': '/work/llm_recipes/models/Qwen2-0.5b-0.2', 'save': '/work/llm_recipes/models/Qwen2-0.5b-0.2', 'base_model': '/share/pretrained_lm/Qwen/Qwen2-0.5B', 'use_better_transformer': False, 'grad_clip_norm': 1.0, 'eval_interval': 10, 'save_interval': 10, 'eval_iters': 10, 'optimizer': 'anyprecision', 'lr': 2e-05, 'lr_decay_style': 'cosine', 'lr_decay_iters': 7500, 'lr_warmup_iters': 500, 'min_lr': 1e-06, 'train_iters': 7500, 'train_samples': None, 'global_batch_size': 640, 'micro_batch_size': 5, 'make_vocab_size_divisible_by': 128, 'sliding_window_size': 131072, 'skip_batch': None, 'no_save_optimizer_state': False, 'continual_pretraining': False, 'instruction_tuning': False, 'direct_preference_optimization': False, 'attention_dropout': 0.1, 'hidden_dropout': 0.1, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-06, 'hf_transformer_model_dir': None, 'instruction_train_data_path': None, 'instruction_valid_data_path': None, 'epoch': None, 'instruction_dataset_size': None, 'save_sampler_state': False, 'label_smoothing': 0.0, 'save_n_checkpoints': 10, 'hf_repo_id': 'koichi12/Qwen2-0.5b-0.2', 'create_public_hf_repo': False, 'upload_all_checkpoints_to_hf': True, 'hf_upload_retry_limit': 2, 'exit_duration_in_mins': None, 'source_key': None, 'target_key': None, 'attn_implementation': 'flash_attention_2', 'efficient_instruction_tuning': False, 'remove_padding_masking': False, 'save_start_iter': None, 'valid_micro_batch_size': 1, 'rank': 0, 'world_size': 1, 'padded_vocab_size': 151680, 'gradient_accumulation_steps': 128}
+ 2024-08-23 16:25:43,197 INFO MainThread:11213 [wandb_init.py:init():616] starting backend
+ 2024-08-23 16:25:43,197 INFO MainThread:11213 [wandb_init.py:init():620] setting up manager
+ 2024-08-23 16:25:43,201 INFO MainThread:11213 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+ 2024-08-23 16:25:43,202 INFO MainThread:11213 [wandb_init.py:init():628] backend started and connected
+ 2024-08-23 16:25:43,207 INFO MainThread:11213 [wandb_init.py:init():720] updated telemetry
+ 2024-08-23 16:25:43,218 INFO MainThread:11213 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
+ 2024-08-23 16:25:43,667 INFO MainThread:11213 [wandb_run.py:_on_init():2262] communicating current version
+ 2024-08-23 16:25:43,692 INFO MainThread:11213 [wandb_run.py:_on_init():2271] got version response upgrade_message: "wandb version 0.17.7 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
+
+ 2024-08-23 16:25:43,692 INFO MainThread:11213 [wandb_init.py:init():804] starting run threads in backend
+ 2024-08-23 16:25:43,790 INFO MainThread:11213 [wandb_run.py:_console_start():2241] atexit reg
+ 2024-08-23 16:25:43,790 INFO MainThread:11213 [wandb_run.py:_redirect():2096] redirect: wrap_raw
+ 2024-08-23 16:25:43,790 INFO MainThread:11213 [wandb_run.py:_redirect():2161] Wrapping output streams.
+ 2024-08-23 16:25:43,790 INFO MainThread:11213 [wandb_run.py:_redirect():2186] Redirects installed.
+ 2024-08-23 16:25:43,791 INFO MainThread:11213 [wandb_init.py:init():847] run started, returning control to user process
+ 2024-08-23 16:25:51,139 INFO MainThread:11213 [wandb_run.py:_config_callback():1343] config_cb None None {'model_architecture': 'Qwen2ForCausalLM', 'activation_function': 'silu', 'hidden_size': 896, 'model_type': 'qwen2', 'max_position_embeddings': 4096, 'num_attention_heads': 14, 'num_hidden_layers': 24}
+ 2024-08-23 16:25:51,139 INFO MainThread:11213 [wandb_run.py:_config_callback():1343] config_cb None None {'world_size': 1}
+ 2024-08-23 16:25:57,685 WARNING MsgRouterThr:11213 [router.py:message_loop():77] message_loop has been closed
wandb/run-20240823_162543-eroprw00/run-eroprw00.wandb ADDED
Binary file (18.1 kB).