Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- wandb/run-20240804_021032-cd2cg2ui/files/output.log +11 -0
- wandb/run-20240804_021032-cd2cg2ui/files/wandb-metadata.json +215 -0
- wandb/run-20240804_021032-cd2cg2ui/files/wandb-summary.json +1 -0
- wandb/run-20240804_021444-pk5j08lr/files/config.yaml +335 -0
- wandb/run-20240804_021444-pk5j08lr/files/output.log +103 -0
- wandb/run-20240804_021444-pk5j08lr/files/requirements.txt +271 -0
- wandb/run-20240804_021444-pk5j08lr/files/wandb-metadata.json +215 -0
- wandb/run-20240804_021444-pk5j08lr/files/wandb-summary.json +1 -0
- wandb/run-20240804_021444-pk5j08lr/logs/debug-internal.log +191 -0
- wandb/run-20240804_021444-pk5j08lr/logs/debug.log +30 -0
- wandb/run-20240804_021444-pk5j08lr/run-pk5j08lr.wandb +0 -0
- wandb/run-20240804_144007-dds6qqbt/files/config.yaml +335 -0
- wandb/run-20240804_144007-dds6qqbt/files/output.log +135 -0
- wandb/run-20240804_144007-dds6qqbt/files/requirements.txt +271 -0
- wandb/run-20240804_144007-dds6qqbt/files/wandb-metadata.json +215 -0
- wandb/run-20240804_144007-dds6qqbt/files/wandb-summary.json +1 -0
- wandb/run-20240804_144007-dds6qqbt/logs/debug-internal.log +186 -0
- wandb/run-20240804_144007-dds6qqbt/logs/debug.log +30 -0
- wandb/run-20240804_144007-dds6qqbt/run-dds6qqbt.wandb +0 -0
- wandb/run-20240804_222226-kh5katc1/files/config.yaml +335 -0
- wandb/run-20240804_222226-kh5katc1/files/output.log +468 -0
- wandb/run-20240804_222226-kh5katc1/files/requirements.txt +271 -0
- wandb/run-20240804_222226-kh5katc1/files/wandb-metadata.json +215 -0
- wandb/run-20240804_222226-kh5katc1/files/wandb-summary.json +1 -0
- wandb/run-20240804_222226-kh5katc1/logs/debug-internal.log +0 -0
- wandb/run-20240804_222226-kh5katc1/logs/debug.log +30 -0
- wandb/run-20240812_063447-whqmtxyq/files/config.yaml +335 -0
- wandb/run-20240812_063447-whqmtxyq/files/output.log +144 -0
- wandb/run-20240812_063447-whqmtxyq/files/requirements.txt +271 -0
- wandb/run-20240812_063447-whqmtxyq/files/wandb-metadata.json +215 -0
- wandb/run-20240812_063447-whqmtxyq/files/wandb-summary.json +1 -0
- wandb/run-20240812_063447-whqmtxyq/logs/debug-internal.log +359 -0
- wandb/run-20240812_063447-whqmtxyq/logs/debug.log +30 -0
- wandb/run-20240812_063447-whqmtxyq/run-whqmtxyq.wandb +0 -0
- wandb/run-20240815_031216-0szn78ph/files/config.yaml +335 -0
- wandb/run-20240815_031216-0szn78ph/files/output.log +92 -0
- wandb/run-20240815_031216-0szn78ph/files/requirements.txt +293 -0
- wandb/run-20240815_031216-0szn78ph/files/wandb-metadata.json +215 -0
- wandb/run-20240815_031216-0szn78ph/files/wandb-summary.json +1 -0
- wandb/run-20240815_031216-0szn78ph/logs/debug-internal.log +260 -0
- wandb/run-20240815_031216-0szn78ph/logs/debug.log +29 -0
- wandb/run-20240815_031216-0szn78ph/run-0szn78ph.wandb +0 -0
- wandb/run-20240823_162543-eroprw00/files/config.yaml +342 -0
- wandb/run-20240823_162543-eroprw00/files/output.log +116 -0
- wandb/run-20240823_162543-eroprw00/files/requirements.txt +375 -0
- wandb/run-20240823_162543-eroprw00/files/wandb-metadata.json +220 -0
- wandb/run-20240823_162543-eroprw00/files/wandb-summary.json +1 -0
- wandb/run-20240823_162543-eroprw00/logs/debug-internal.log +188 -0
- wandb/run-20240823_162543-eroprw00/logs/debug.log +30 -0
- wandb/run-20240823_162543-eroprw00/run-eroprw00.wandb +0 -0
wandb/run-20240804_021032-cd2cg2ui/files/output.log
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Created Hugging Face repository with ID koichi12/tiny-mistral-sample.
|
2 |
+
Clearing GPU cache for all ranks
|
3 |
+
--> Running with torch torch_distributed debug set to detail
|
4 |
+
Traceback (most recent call last):
|
5 |
+
File "/project/examples/finetuning.py", line 13, in <module>
|
6 |
+
main()
|
7 |
+
File "/project/src/llama_recipes/finetuning.py", line 103, in main
|
8 |
+
model = get_model(
|
9 |
+
File "/project/src/llama_recipes/get_models.py", line 71, in get_model
|
10 |
+
assert sliding_window == 4096
|
11 |
+
AssertionError
|
wandb/run-20240804_021032-cd2cg2ui/files/wandb-metadata.json
ADDED
@@ -0,0 +1,215 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
|
3 |
+
"python": "3.10.12",
|
4 |
+
"heartbeatAt": "2024-08-03T17:10:33.458421",
|
5 |
+
"startedAt": "2024-08-03T17:10:32.395506",
|
6 |
+
"docker": null,
|
7 |
+
"cuda": null,
|
8 |
+
"args": [
|
9 |
+
"--seq-length",
|
10 |
+
"1024",
|
11 |
+
"--sliding-window-size",
|
12 |
+
"8192",
|
13 |
+
"--micro-batch-size",
|
14 |
+
"8",
|
15 |
+
"--global-batch-size",
|
16 |
+
"320",
|
17 |
+
"--train-iters",
|
18 |
+
"20000",
|
19 |
+
"--tokenizer-type",
|
20 |
+
"Llama2Tokenizer",
|
21 |
+
"--tokenizer-model",
|
22 |
+
"/share/pretrained_lm/custom/tiny-mistral/tokenizer.model.v3",
|
23 |
+
"--train-data-path",
|
24 |
+
"4013541",
|
25 |
+
"/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
|
26 |
+
"--valid-data-path",
|
27 |
+
"4013541",
|
28 |
+
"/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
|
29 |
+
"--test-data-path",
|
30 |
+
"4013541",
|
31 |
+
"/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
|
32 |
+
"--lr",
|
33 |
+
"2e-5",
|
34 |
+
"--min-lr",
|
35 |
+
"1e-6",
|
36 |
+
"--lr-decay-style",
|
37 |
+
"cosine",
|
38 |
+
"--lr-warmup-iters",
|
39 |
+
"500",
|
40 |
+
"--lr-decay-iters",
|
41 |
+
"20000",
|
42 |
+
"--weight-decay",
|
43 |
+
"0.1",
|
44 |
+
"--grad-clip-norm",
|
45 |
+
"1.0",
|
46 |
+
"--optimizer",
|
47 |
+
"adam",
|
48 |
+
"--adam-beta1",
|
49 |
+
"0.9",
|
50 |
+
"--adam-beta2",
|
51 |
+
"0.95",
|
52 |
+
"--adam-eps",
|
53 |
+
"1e-6",
|
54 |
+
"--save-interval",
|
55 |
+
"200",
|
56 |
+
"--eval-interval",
|
57 |
+
"200",
|
58 |
+
"--eval-iters",
|
59 |
+
"10",
|
60 |
+
"--bf16",
|
61 |
+
"--mixed-precision",
|
62 |
+
"--base-model",
|
63 |
+
"/share/pretrained_lm/custom/tiny-mistral",
|
64 |
+
"--save",
|
65 |
+
"/work/llm_recipes/models/tiny-mistral-sample",
|
66 |
+
"--load",
|
67 |
+
"/work/llm_recipes/models/tiny-mistral-sample",
|
68 |
+
"--fsdp-activation-checkpointing",
|
69 |
+
"--sharding-strategy",
|
70 |
+
"FULL_SHARD",
|
71 |
+
"--checkpoint-type",
|
72 |
+
"LOCAL_STATE_DICT",
|
73 |
+
"--save-n-checkpoints",
|
74 |
+
"10",
|
75 |
+
"--hf-upload-retry-limit",
|
76 |
+
"2",
|
77 |
+
"--hf-repo-id",
|
78 |
+
"koichi12/tiny-mistral-sample",
|
79 |
+
"--wandb-entity",
|
80 |
+
"iwakawa-koichi-q5-tohoku-nlp6723",
|
81 |
+
"--wandb-project",
|
82 |
+
"llm_tutorial",
|
83 |
+
"--wandb-name",
|
84 |
+
"tiny-mistral-sample_train_2024-08-04-02:10:14"
|
85 |
+
],
|
86 |
+
"state": "running",
|
87 |
+
"program": "/project/examples/finetuning.py",
|
88 |
+
"codePathLocal": "examples/finetuning.py",
|
89 |
+
"codePath": "examples/finetuning.py",
|
90 |
+
"git": {
|
91 |
+
"remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
|
92 |
+
"commit": "3be5353210a678dc7008f237fa16b99f2bdf36ea"
|
93 |
+
},
|
94 |
+
"email": null,
|
95 |
+
"root": "/project",
|
96 |
+
"host": "gpu-koiwa-00",
|
97 |
+
"username": "koiwa",
|
98 |
+
"executable": "/usr/bin/python",
|
99 |
+
"cpu_count": 18,
|
100 |
+
"cpu_count_logical": 18,
|
101 |
+
"cpu_freq": {
|
102 |
+
"current": 2400.034,
|
103 |
+
"min": 0.0,
|
104 |
+
"max": 0.0
|
105 |
+
},
|
106 |
+
"cpu_freq_per_core": [
|
107 |
+
{
|
108 |
+
"current": 2400.034,
|
109 |
+
"min": 0.0,
|
110 |
+
"max": 0.0
|
111 |
+
},
|
112 |
+
{
|
113 |
+
"current": 2400.034,
|
114 |
+
"min": 0.0,
|
115 |
+
"max": 0.0
|
116 |
+
},
|
117 |
+
{
|
118 |
+
"current": 2400.034,
|
119 |
+
"min": 0.0,
|
120 |
+
"max": 0.0
|
121 |
+
},
|
122 |
+
{
|
123 |
+
"current": 2400.034,
|
124 |
+
"min": 0.0,
|
125 |
+
"max": 0.0
|
126 |
+
},
|
127 |
+
{
|
128 |
+
"current": 2400.034,
|
129 |
+
"min": 0.0,
|
130 |
+
"max": 0.0
|
131 |
+
},
|
132 |
+
{
|
133 |
+
"current": 2400.034,
|
134 |
+
"min": 0.0,
|
135 |
+
"max": 0.0
|
136 |
+
},
|
137 |
+
{
|
138 |
+
"current": 2400.034,
|
139 |
+
"min": 0.0,
|
140 |
+
"max": 0.0
|
141 |
+
},
|
142 |
+
{
|
143 |
+
"current": 2400.034,
|
144 |
+
"min": 0.0,
|
145 |
+
"max": 0.0
|
146 |
+
},
|
147 |
+
{
|
148 |
+
"current": 2400.034,
|
149 |
+
"min": 0.0,
|
150 |
+
"max": 0.0
|
151 |
+
},
|
152 |
+
{
|
153 |
+
"current": 2400.034,
|
154 |
+
"min": 0.0,
|
155 |
+
"max": 0.0
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"current": 2400.034,
|
159 |
+
"min": 0.0,
|
160 |
+
"max": 0.0
|
161 |
+
},
|
162 |
+
{
|
163 |
+
"current": 2400.034,
|
164 |
+
"min": 0.0,
|
165 |
+
"max": 0.0
|
166 |
+
},
|
167 |
+
{
|
168 |
+
"current": 2400.034,
|
169 |
+
"min": 0.0,
|
170 |
+
"max": 0.0
|
171 |
+
},
|
172 |
+
{
|
173 |
+
"current": 2400.034,
|
174 |
+
"min": 0.0,
|
175 |
+
"max": 0.0
|
176 |
+
},
|
177 |
+
{
|
178 |
+
"current": 2400.034,
|
179 |
+
"min": 0.0,
|
180 |
+
"max": 0.0
|
181 |
+
},
|
182 |
+
{
|
183 |
+
"current": 2400.034,
|
184 |
+
"min": 0.0,
|
185 |
+
"max": 0.0
|
186 |
+
},
|
187 |
+
{
|
188 |
+
"current": 2400.034,
|
189 |
+
"min": 0.0,
|
190 |
+
"max": 0.0
|
191 |
+
},
|
192 |
+
{
|
193 |
+
"current": 2400.034,
|
194 |
+
"min": 0.0,
|
195 |
+
"max": 0.0
|
196 |
+
}
|
197 |
+
],
|
198 |
+
"disk": {
|
199 |
+
"/": {
|
200 |
+
"total": 0.0625,
|
201 |
+
"used": 1.1444091796875e-05
|
202 |
+
}
|
203 |
+
},
|
204 |
+
"gpu": "NVIDIA A100-SXM4-40GB",
|
205 |
+
"gpu_count": 1,
|
206 |
+
"gpu_devices": [
|
207 |
+
{
|
208 |
+
"name": "NVIDIA A100-SXM4-40GB",
|
209 |
+
"memory_total": 42949672960
|
210 |
+
}
|
211 |
+
],
|
212 |
+
"memory": {
|
213 |
+
"total": 56.48782730102539
|
214 |
+
}
|
215 |
+
}
|
wandb/run-20240804_021032-cd2cg2ui/files/wandb-summary.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"_wandb": {"runtime": 0}}
|
wandb/run-20240804_021444-pk5j08lr/files/config.yaml
ADDED
@@ -0,0 +1,335 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
wandb_version: 1
|
2 |
+
|
3 |
+
sharding_strategy:
|
4 |
+
desc: null
|
5 |
+
value: FULL_SHARD
|
6 |
+
checkpoint_type:
|
7 |
+
desc: null
|
8 |
+
value: LOCAL_STATE_DICT
|
9 |
+
fsdp_activation_checkpointing:
|
10 |
+
desc: null
|
11 |
+
value: true
|
12 |
+
fsdp_cpu_offload:
|
13 |
+
desc: null
|
14 |
+
value: false
|
15 |
+
low_cpu_fsdp:
|
16 |
+
desc: null
|
17 |
+
value: false
|
18 |
+
no_meta_device:
|
19 |
+
desc: null
|
20 |
+
value: false
|
21 |
+
data_path:
|
22 |
+
desc: null
|
23 |
+
value: null
|
24 |
+
split:
|
25 |
+
desc: null
|
26 |
+
value: 969, 30, 1
|
27 |
+
train_data_path:
|
28 |
+
desc: null
|
29 |
+
value:
|
30 |
+
- '4013541'
|
31 |
+
- /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
|
32 |
+
valid_data_path:
|
33 |
+
desc: null
|
34 |
+
value:
|
35 |
+
- '4013541'
|
36 |
+
- /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
|
37 |
+
test_data_path:
|
38 |
+
desc: null
|
39 |
+
value:
|
40 |
+
- '4013541'
|
41 |
+
- /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
|
42 |
+
data_cache_path:
|
43 |
+
desc: null
|
44 |
+
value: null
|
45 |
+
vocab_size:
|
46 |
+
desc: null
|
47 |
+
value: null
|
48 |
+
vocab_file:
|
49 |
+
desc: null
|
50 |
+
value: null
|
51 |
+
merge_file:
|
52 |
+
desc: null
|
53 |
+
value: null
|
54 |
+
seq_length:
|
55 |
+
desc: null
|
56 |
+
value: 1024
|
57 |
+
num_workers:
|
58 |
+
desc: null
|
59 |
+
value: 2
|
60 |
+
tokenizer_type:
|
61 |
+
desc: null
|
62 |
+
value: Llama2Tokenizer
|
63 |
+
tokenizer_model:
|
64 |
+
desc: null
|
65 |
+
value: /share/pretrained_lm/custom/tiny-mistral/tokenizer.model.v3
|
66 |
+
reset_position_ids:
|
67 |
+
desc: null
|
68 |
+
value: false
|
69 |
+
reset_attention_mask:
|
70 |
+
desc: null
|
71 |
+
value: false
|
72 |
+
eod_mask_loss:
|
73 |
+
desc: null
|
74 |
+
value: false
|
75 |
+
retro_return_doc_ids:
|
76 |
+
desc: null
|
77 |
+
value: false
|
78 |
+
short_seq_prob:
|
79 |
+
desc: null
|
80 |
+
value: 0.1
|
81 |
+
vocab_extra_ids:
|
82 |
+
desc: null
|
83 |
+
value: 0
|
84 |
+
seed:
|
85 |
+
desc: null
|
86 |
+
value: 1234
|
87 |
+
use_mpi:
|
88 |
+
desc: null
|
89 |
+
value: false
|
90 |
+
wandb_entity:
|
91 |
+
desc: null
|
92 |
+
value: iwakawa-koichi-q5-tohoku-nlp6723
|
93 |
+
wandb_name:
|
94 |
+
desc: null
|
95 |
+
value: tiny-mistral-sample_train_2024-08-04-02:14:34
|
96 |
+
wandb_project:
|
97 |
+
desc: null
|
98 |
+
value: llm_tutorial
|
99 |
+
quantization:
|
100 |
+
desc: null
|
101 |
+
value: false
|
102 |
+
use_freeze_layers:
|
103 |
+
desc: null
|
104 |
+
value: false
|
105 |
+
freeze_layers:
|
106 |
+
desc: null
|
107 |
+
value: null
|
108 |
+
bf16:
|
109 |
+
desc: null
|
110 |
+
value: true
|
111 |
+
fp16:
|
112 |
+
desc: null
|
113 |
+
value: false
|
114 |
+
mixed_precision:
|
115 |
+
desc: null
|
116 |
+
value: true
|
117 |
+
param_dtype:
|
118 |
+
desc: null
|
119 |
+
value: null
|
120 |
+
load:
|
121 |
+
desc: null
|
122 |
+
value: /work/llm_recipes/models/tiny-mistral-sample
|
123 |
+
save:
|
124 |
+
desc: null
|
125 |
+
value: /work/llm_recipes/models/tiny-mistral-sample
|
126 |
+
base_model:
|
127 |
+
desc: null
|
128 |
+
value: /share/pretrained_lm/custom/tiny-mistral
|
129 |
+
use_better_transformer:
|
130 |
+
desc: null
|
131 |
+
value: false
|
132 |
+
grad_clip_norm:
|
133 |
+
desc: null
|
134 |
+
value: 1.0
|
135 |
+
eval_interval:
|
136 |
+
desc: null
|
137 |
+
value: 200
|
138 |
+
save_interval:
|
139 |
+
desc: null
|
140 |
+
value: 200
|
141 |
+
eval_iters:
|
142 |
+
desc: null
|
143 |
+
value: 10
|
144 |
+
optimizer:
|
145 |
+
desc: null
|
146 |
+
value: adam
|
147 |
+
lr:
|
148 |
+
desc: null
|
149 |
+
value: 2.0e-05
|
150 |
+
lr_decay_style:
|
151 |
+
desc: null
|
152 |
+
value: cosine
|
153 |
+
lr_decay_iters:
|
154 |
+
desc: null
|
155 |
+
value: 20000
|
156 |
+
lr_warmup_iters:
|
157 |
+
desc: null
|
158 |
+
value: 500
|
159 |
+
min_lr:
|
160 |
+
desc: null
|
161 |
+
value: 1.0e-06
|
162 |
+
train_iters:
|
163 |
+
desc: null
|
164 |
+
value: 20000
|
165 |
+
train_samples:
|
166 |
+
desc: null
|
167 |
+
value: null
|
168 |
+
global_batch_size:
|
169 |
+
desc: null
|
170 |
+
value: 320
|
171 |
+
micro_batch_size:
|
172 |
+
desc: null
|
173 |
+
value: 8
|
174 |
+
make_vocab_size_divisible_by:
|
175 |
+
desc: null
|
176 |
+
value: 128
|
177 |
+
sliding_window_size:
|
178 |
+
desc: null
|
179 |
+
value: 4096
|
180 |
+
skip_batch:
|
181 |
+
desc: null
|
182 |
+
value: null
|
183 |
+
no_save_optimizer_state:
|
184 |
+
desc: null
|
185 |
+
value: false
|
186 |
+
continual_pretraining:
|
187 |
+
desc: null
|
188 |
+
value: false
|
189 |
+
instruction_tuning:
|
190 |
+
desc: null
|
191 |
+
value: false
|
192 |
+
direct_preference_optimization:
|
193 |
+
desc: null
|
194 |
+
value: false
|
195 |
+
attention_dropout:
|
196 |
+
desc: null
|
197 |
+
value: 0.1
|
198 |
+
hidden_dropout:
|
199 |
+
desc: null
|
200 |
+
value: 0.1
|
201 |
+
weight_decay:
|
202 |
+
desc: null
|
203 |
+
value: 0.1
|
204 |
+
adam_beta1:
|
205 |
+
desc: null
|
206 |
+
value: 0.9
|
207 |
+
adam_beta2:
|
208 |
+
desc: null
|
209 |
+
value: 0.95
|
210 |
+
adam_eps:
|
211 |
+
desc: null
|
212 |
+
value: 1.0e-06
|
213 |
+
hf_transformer_model_dir:
|
214 |
+
desc: null
|
215 |
+
value: null
|
216 |
+
instruction_train_data_path:
|
217 |
+
desc: null
|
218 |
+
value: null
|
219 |
+
instruction_valid_data_path:
|
220 |
+
desc: null
|
221 |
+
value: null
|
222 |
+
epoch:
|
223 |
+
desc: null
|
224 |
+
value: null
|
225 |
+
instruction_dataset_size:
|
226 |
+
desc: null
|
227 |
+
value: null
|
228 |
+
save_sampler_state:
|
229 |
+
desc: null
|
230 |
+
value: false
|
231 |
+
label_smoothing:
|
232 |
+
desc: null
|
233 |
+
value: 0.0
|
234 |
+
save_n_checkpoints:
|
235 |
+
desc: null
|
236 |
+
value: 10
|
237 |
+
hf_repo_id:
|
238 |
+
desc: null
|
239 |
+
value: koichi12/tiny-mistral-sample
|
240 |
+
create_public_hf_repo:
|
241 |
+
desc: null
|
242 |
+
value: false
|
243 |
+
upload_all_checkpoints_to_hf:
|
244 |
+
desc: null
|
245 |
+
value: false
|
246 |
+
hf_upload_retry_limit:
|
247 |
+
desc: null
|
248 |
+
value: 2
|
249 |
+
exit_duration_in_mins:
|
250 |
+
desc: null
|
251 |
+
value: null
|
252 |
+
source_key:
|
253 |
+
desc: null
|
254 |
+
value: null
|
255 |
+
target_key:
|
256 |
+
desc: null
|
257 |
+
value: null
|
258 |
+
attn_implementation:
|
259 |
+
desc: null
|
260 |
+
value: flash_attention_2
|
261 |
+
efficient_instruction_tuning:
|
262 |
+
desc: null
|
263 |
+
value: false
|
264 |
+
remove_padding_masking:
|
265 |
+
desc: null
|
266 |
+
value: false
|
267 |
+
save_start_iter:
|
268 |
+
desc: null
|
269 |
+
value: null
|
270 |
+
rank:
|
271 |
+
desc: null
|
272 |
+
value: 0
|
273 |
+
world_size:
|
274 |
+
desc: null
|
275 |
+
value: 1
|
276 |
+
padded_vocab_size:
|
277 |
+
desc: null
|
278 |
+
value: 32768
|
279 |
+
gradient_accumulation_steps:
|
280 |
+
desc: null
|
281 |
+
value: 40
|
282 |
+
_wandb:
|
283 |
+
desc: null
|
284 |
+
value:
|
285 |
+
python_version: 3.10.12
|
286 |
+
cli_version: 0.16.3
|
287 |
+
framework: huggingface
|
288 |
+
huggingface_version: 4.43.3
|
289 |
+
is_jupyter_run: false
|
290 |
+
is_kaggle_kernel: false
|
291 |
+
start_time: 1722705284.714592
|
292 |
+
t:
|
293 |
+
1:
|
294 |
+
- 1
|
295 |
+
- 11
|
296 |
+
- 49
|
297 |
+
- 55
|
298 |
+
- 71
|
299 |
+
2:
|
300 |
+
- 1
|
301 |
+
- 11
|
302 |
+
- 49
|
303 |
+
- 55
|
304 |
+
- 71
|
305 |
+
3:
|
306 |
+
- 13
|
307 |
+
- 16
|
308 |
+
- 23
|
309 |
+
4: 3.10.12
|
310 |
+
5: 0.16.3
|
311 |
+
6: 4.43.3
|
312 |
+
8:
|
313 |
+
- 5
|
314 |
+
13: linux-x86_64
|
315 |
+
activation_function:
|
316 |
+
desc: null
|
317 |
+
value: silu
|
318 |
+
hidden_size:
|
319 |
+
desc: null
|
320 |
+
value: 256
|
321 |
+
model_type:
|
322 |
+
desc: null
|
323 |
+
value: mistral
|
324 |
+
max_position_embeddings:
|
325 |
+
desc: null
|
326 |
+
value: 1024
|
327 |
+
num_attention_heads:
|
328 |
+
desc: null
|
329 |
+
value: 4
|
330 |
+
num_hidden_layers:
|
331 |
+
desc: null
|
332 |
+
value: 4
|
333 |
+
model_architecture:
|
334 |
+
desc: null
|
335 |
+
value: MistralForCausalLM
|
wandb/run-20240804_021444-pk5j08lr/files/output.log
ADDED
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Created Hugging Face repository with ID koichi12/tiny-mistral-sample.
|
2 |
+
Clearing GPU cache for all ranks
|
3 |
+
--> Running with torch torch_distributed debug set to detail
|
4 |
+
You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
|
5 |
+
Loading model state dict from /work/llm_recipes/models/tiny-mistral-sample/iter_0020000/model.pt
|
6 |
+
Loaded model state dict from /work/llm_recipes/models/tiny-mistral-sample/iter_0020000/model.pt
|
7 |
+
--> Model /share/pretrained_lm/custom/tiny-mistral
|
8 |
+
--> /share/pretrained_lm/custom/tiny-mistral has 19.925248 Million params
|
9 |
+
BFloat16 enabled for mixed precision - using bfSixteen policy
|
10 |
+
--> applying fsdp activation checkpointing...
|
11 |
+
> datasets target sizes (minimum size):
|
12 |
+
train: 6400000
|
13 |
+
validation: 323200
|
14 |
+
test: 3200
|
15 |
+
> building train, validation, and test datasets for GPT ...
|
16 |
+
/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_init_utils.py:441: UserWarning: FSDP is switching to use `NO_SHARD` instead of ShardingStrategy.FULL_SHARD since the world size is 1.
|
17 |
+
warnings.warn(
|
18 |
+
Let split = None
|
19 |
+
> finished creating GPT datasets ...
|
20 |
+
Loading optimizer state dict from /work/llm_recipes/models/tiny-mistral-sample/iter_0020000/optimizer.pt
|
21 |
+
Loaded optimizer state dict from /work/llm_recipes/models/tiny-mistral-sample/iter_0020000/optimizer.pt
|
22 |
+
model info: FullyShardedDataParallel(
|
23 |
+
(_fsdp_wrapped_module): MistralForCausalLM(
|
24 |
+
(model): MistralModel(
|
25 |
+
(embed_tokens): Embedding(32768, 256)
|
26 |
+
(layers): ModuleList(
|
27 |
+
(0-3): 4 x FullyShardedDataParallel(
|
28 |
+
(_fsdp_wrapped_module): CheckpointWrapper(
|
29 |
+
(_checkpoint_wrapped_module): MistralDecoderLayer(
|
30 |
+
(self_attn): MistralFlashAttention2(
|
31 |
+
(q_proj): Linear(in_features=256, out_features=512, bias=False)
|
32 |
+
(k_proj): Linear(in_features=256, out_features=256, bias=False)
|
33 |
+
(v_proj): Linear(in_features=256, out_features=256, bias=False)
|
34 |
+
(o_proj): Linear(in_features=512, out_features=256, bias=False)
|
35 |
+
(rotary_emb): MistralRotaryEmbedding()
|
36 |
+
)
|
37 |
+
(mlp): MistralMLP(
|
38 |
+
(gate_proj): Linear(in_features=256, out_features=512, bias=False)
|
39 |
+
(up_proj): Linear(in_features=256, out_features=512, bias=False)
|
40 |
+
(down_proj): Linear(in_features=512, out_features=256, bias=False)
|
41 |
+
(act_fn): SiLU()
|
42 |
+
)
|
43 |
+
(input_layernorm): MistralRMSNorm()
|
44 |
+
(post_attention_layernorm): MistralRMSNorm()
|
45 |
+
)
|
46 |
+
)
|
47 |
+
)
|
48 |
+
)
|
49 |
+
(norm): MistralRMSNorm()
|
50 |
+
)
|
51 |
+
(lm_head): Linear(in_features=256, out_features=32768, bias=False)
|
52 |
+
)
|
53 |
+
)
|
54 |
+
model config: MistralConfig {
|
55 |
+
"_name_or_path": "/share/pretrained_lm/custom/tiny-mistral",
|
56 |
+
"architectures": [
|
57 |
+
"MistralForCausalLM"
|
58 |
+
],
|
59 |
+
"attention_dropout": 0.0,
|
60 |
+
"bos_token_id": 1,
|
61 |
+
"eos_token_id": 2,
|
62 |
+
"head_dim": 128,
|
63 |
+
"hidden_act": "silu",
|
64 |
+
"hidden_size": 256,
|
65 |
+
"initializer_range": 0.02,
|
66 |
+
"intermediate_size": 512,
|
67 |
+
"label_smoothing": 0.0,
|
68 |
+
"max_position_embeddings": 1024,
|
69 |
+
"model_type": "mistral",
|
70 |
+
"num_attention_heads": 4,
|
71 |
+
"num_hidden_layers": 4,
|
72 |
+
"num_key_value_heads": 2,
|
73 |
+
"rms_norm_eps": 1e-05,
|
74 |
+
"rope_theta": 1000000.0,
|
75 |
+
"sliding_window": 4096,
|
76 |
+
"tie_word_embeddings": false,
|
77 |
+
"torch_dtype": "float32",
|
78 |
+
"transformers_version": "4.43.3",
|
79 |
+
"use_cache": false,
|
80 |
+
"vocab_size": 32768
|
81 |
+
}
|
82 |
+
Saving checkpoint to /work/llm_recipes/models/tiny-mistral-sample/iter_0020000
|
83 |
+
Saving model state dict to /work/llm_recipes/models/tiny-mistral-sample/iter_0020000/model.pt
|
84 |
+
Saved model state dict to /work/llm_recipes/models/tiny-mistral-sample/iter_0020000/model.pt
|
85 |
+
Saving optimizer state dict to /work/llm_recipes/models/tiny-mistral-sample/iter_0020000/optimizer.pt
|
86 |
+
Saved optimizer state dict to /work/llm_recipes/models/tiny-mistral-sample/iter_0020000/optimizer.pt
|
87 |
+
Saving scheduler state dict to /work/llm_recipes/models/tiny-mistral-sample/iter_0020000/scheduler.pt
|
88 |
+
Saved scheduler state dict to /work/llm_recipes/models/tiny-mistral-sample/iter_0020000/scheduler.pt
|
89 |
+
Saving RNG states to /work/llm_recipes/models/tiny-mistral-sample/iter_0020000/rng.pt
|
90 |
+
Saved RNG states to /work/llm_recipes/models/tiny-mistral-sample/iter_0020000/rng.pt
|
91 |
+
Saved checkpoint to /work/llm_recipes/models/tiny-mistral-sample/iter_0020000, took 0.17s
|
92 |
+
Building a BlendedDataset for a single MegatronDataset
|
93 |
+
Unable to save the indexes because path_to_cache is None
|
94 |
+
Building a BlendedDataset for a single MegatronDataset
|
95 |
+
Unable to save the indexes because path_to_cache is None
|
96 |
+
Building a BlendedDataset for a single MegatronDataset
|
97 |
+
Unable to save the indexes because path_to_cache is None
|
98 |
+
[rank0]:[2024-08-04 02:14:50,842] torch.distributed.fsdp._debug_utils: [WARNING] FSDP _flatten_optim_state_dict() profiling: defaultdict(<class 'float'>, {})
|
99 |
+
/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_state_dict_utils.py:773: UserWarning: When using ``NO_SHARD`` for ``ShardingStrategy``, full_state_dict willbe returned.
|
100 |
+
warnings.warn(
|
101 |
+
/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_state_dict_utils.py:716: UserWarning: When using ``NO_SHARD`` for ``ShardingStrategy``, full_state_dict willbe returned.
|
102 |
+
warnings.warn(
|
103 |
+
[rank0]:[2024-08-04 02:14:50,959] torch.distributed.fsdp._debug_utils: [WARNING] FSDP _optim_state_dict() profiling: defaultdict(<class 'float'>, {'preprocessing': 0.0010300300018570852, 'preprocessing_with_comm': 0.0005270100009511225, 'state_converting': 0.021121047997439746, <Type.ALL: 'all'>: 0.022993901999143418})
|
wandb/run-20240804_021444-pk5j08lr/files/requirements.txt
ADDED
@@ -0,0 +1,271 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
absl-py==2.1.0
|
2 |
+
accelerate==0.33.0
|
3 |
+
aiohttp==3.9.1
|
4 |
+
aiosignal==1.3.1
|
5 |
+
annotated-types==0.6.0
|
6 |
+
apex==0.1
|
7 |
+
appdirs==1.4.4
|
8 |
+
argon2-cffi-bindings==21.2.0
|
9 |
+
argon2-cffi==23.1.0
|
10 |
+
asttokens==2.4.1
|
11 |
+
astunparse==1.6.3
|
12 |
+
async-timeout==4.0.3
|
13 |
+
attrs==23.2.0
|
14 |
+
audioread==3.0.1
|
15 |
+
beautifulsoup4==4.12.3
|
16 |
+
bleach==6.1.0
|
17 |
+
blis==0.7.11
|
18 |
+
cachetools==5.3.2
|
19 |
+
catalogue==2.0.10
|
20 |
+
certifi==2024.2.2
|
21 |
+
cffi==1.16.0
|
22 |
+
charset-normalizer==3.3.2
|
23 |
+
click==8.1.7
|
24 |
+
cloudpathlib==0.16.0
|
25 |
+
cloudpickle==3.0.0
|
26 |
+
cmake==3.28.1
|
27 |
+
colorama==0.4.6
|
28 |
+
comm==0.2.1
|
29 |
+
confection==0.1.4
|
30 |
+
contourpy==1.2.0
|
31 |
+
cubinlinker==0.3.0+2.g405ac64
|
32 |
+
cuda-python==12.3.0rc4+9.gdb8c48a.dirty
|
33 |
+
cudf==23.12.0
|
34 |
+
cugraph-dgl==23.12.0
|
35 |
+
cugraph-service-client==23.12.0
|
36 |
+
cugraph-service-server==23.12.0
|
37 |
+
cugraph==23.12.0
|
38 |
+
cuml==23.12.0
|
39 |
+
cupy-cuda12x==12.3.0
|
40 |
+
cycler==0.12.1
|
41 |
+
cymem==2.0.8
|
42 |
+
cython==3.0.8
|
43 |
+
dask-cuda==23.12.0
|
44 |
+
dask-cudf==23.12.0
|
45 |
+
dask==2023.11.0
|
46 |
+
debugpy==1.8.1
|
47 |
+
decorator==5.1.1
|
48 |
+
defusedxml==0.7.1
|
49 |
+
distributed==2023.11.0
|
50 |
+
dm-tree==0.1.8
|
51 |
+
docker-pycreds==0.4.0
|
52 |
+
einops==0.7.0
|
53 |
+
exceptiongroup==1.2.0
|
54 |
+
execnet==2.0.2
|
55 |
+
executing==2.0.1
|
56 |
+
expecttest==0.1.3
|
57 |
+
fastjsonschema==2.19.1
|
58 |
+
fastrlock==0.8.2
|
59 |
+
filelock==3.13.1
|
60 |
+
flash-attn==2.4.2
|
61 |
+
fonttools==4.48.1
|
62 |
+
frozenlist==1.4.1
|
63 |
+
fsspec==2023.12.2
|
64 |
+
gast==0.5.4
|
65 |
+
gitdb==4.0.11
|
66 |
+
gitpython==3.1.43
|
67 |
+
google-auth-oauthlib==0.4.6
|
68 |
+
google-auth==2.27.0
|
69 |
+
graphsurgeon==0.4.6
|
70 |
+
grpcio==1.60.1
|
71 |
+
huggingface-hub==0.24.5
|
72 |
+
hypothesis==5.35.1
|
73 |
+
idna==3.6
|
74 |
+
importlib-metadata==7.0.1
|
75 |
+
iniconfig==2.0.0
|
76 |
+
intel-openmp==2021.4.0
|
77 |
+
ipadic==1.0.0
|
78 |
+
ipykernel==6.29.2
|
79 |
+
ipython-genutils==0.2.0
|
80 |
+
ipython==8.21.0
|
81 |
+
jedi==0.19.1
|
82 |
+
jinja2==3.1.3
|
83 |
+
joblib==1.3.2
|
84 |
+
json5==0.9.14
|
85 |
+
jsonnet==0.19.1
|
86 |
+
jsonschema-specifications==2023.12.1
|
87 |
+
jsonschema==4.21.1
|
88 |
+
jupyter-client==8.6.0
|
89 |
+
jupyter-core==5.7.1
|
90 |
+
jupyter-tensorboard==0.2.0
|
91 |
+
jupyterlab-pygments==0.3.0
|
92 |
+
jupyterlab-server==1.2.0
|
93 |
+
jupyterlab==2.3.2
|
94 |
+
jupytext==1.16.1
|
95 |
+
kiwisolver==1.4.5
|
96 |
+
langcodes==3.3.0
|
97 |
+
lazy-loader==0.3
|
98 |
+
librosa==0.10.1
|
99 |
+
llvmlite==0.40.1
|
100 |
+
locket==1.0.0
|
101 |
+
logzero==1.7.0
|
102 |
+
lxml==5.2.2
|
103 |
+
markdown-it-py==3.0.0
|
104 |
+
markdown==3.5.2
|
105 |
+
markupsafe==2.1.4
|
106 |
+
matplotlib-inline==0.1.6
|
107 |
+
matplotlib==3.8.2
|
108 |
+
mdit-py-plugins==0.4.0
|
109 |
+
mdurl==0.1.2
|
110 |
+
mecab-python3==1.0.6
|
111 |
+
mistune==3.0.2
|
112 |
+
mkl-devel==2021.1.1
|
113 |
+
mkl-include==2021.1.1
|
114 |
+
mkl==2021.1.1
|
115 |
+
mock==5.1.0
|
116 |
+
more-itertools==9.1.0
|
117 |
+
mpmath==1.3.0
|
118 |
+
msgpack==1.0.7
|
119 |
+
multidict==6.0.4
|
120 |
+
murmurhash==1.0.10
|
121 |
+
nbclient==0.9.0
|
122 |
+
nbconvert==7.16.0
|
123 |
+
nbformat==5.9.2
|
124 |
+
nest-asyncio==1.6.0
|
125 |
+
networkx==2.6.3
|
126 |
+
ninja==1.11.1.1
|
127 |
+
nltk==3.8.1
|
128 |
+
notebook==6.4.10
|
129 |
+
numba==0.57.1+1.g1ff679645
|
130 |
+
numpy==1.24.4
|
131 |
+
nvfuser==0.1.4a0+d0bb811
|
132 |
+
nvidia-dali-cuda120==1.34.0
|
133 |
+
nvidia-pyindex==1.0.9
|
134 |
+
nvtx==0.2.5
|
135 |
+
oauthlib==3.2.2
|
136 |
+
onnx==1.15.0rc2
|
137 |
+
opencv==4.7.0
|
138 |
+
optree==0.10.0
|
139 |
+
packaging==23.2
|
140 |
+
pandas==1.5.3
|
141 |
+
pandocfilters==1.5.1
|
142 |
+
parso==0.8.3
|
143 |
+
partd==1.4.1
|
144 |
+
peft==0.11.1
|
145 |
+
pexpect==4.9.0
|
146 |
+
pillow==10.2.0
|
147 |
+
pip==24.0
|
148 |
+
platformdirs==4.2.0
|
149 |
+
pluggy==1.4.0
|
150 |
+
ply==3.11
|
151 |
+
polygraphy==0.49.4
|
152 |
+
pooch==1.8.0
|
153 |
+
portalocker==2.10.1
|
154 |
+
preshed==3.0.9
|
155 |
+
prettytable==3.9.0
|
156 |
+
prometheus-client==0.19.0
|
157 |
+
prompt-toolkit==3.0.43
|
158 |
+
protobuf==4.24.4
|
159 |
+
psutil==5.9.4
|
160 |
+
ptxcompiler==0.8.1+2.g0d406d6
|
161 |
+
ptyprocess==0.7.0
|
162 |
+
pure-eval==0.2.2
|
163 |
+
pyarrow==14.0.1.dev0+gba5374836.d20240125
|
164 |
+
pyasn1-modules==0.3.0
|
165 |
+
pyasn1==0.5.1
|
166 |
+
pybind11-global==2.11.1
|
167 |
+
pybind11==2.11.1
|
168 |
+
pycocotools==2.0+nv0.8.0
|
169 |
+
pycparser==2.21
|
170 |
+
pydantic-core==2.16.2
|
171 |
+
pydantic==2.6.1
|
172 |
+
pygments==2.17.2
|
173 |
+
pylibcugraph==23.12.0
|
174 |
+
pylibcugraphops==23.12.0
|
175 |
+
pylibraft==23.12.0
|
176 |
+
pynvml==11.4.1
|
177 |
+
pyparsing==3.1.1
|
178 |
+
pytest-flakefinder==1.1.0
|
179 |
+
pytest-rerunfailures==13.0
|
180 |
+
pytest-shard==0.1.2
|
181 |
+
pytest-xdist==3.5.0
|
182 |
+
pytest==8.0.0
|
183 |
+
python-dateutil==2.8.2
|
184 |
+
python-dotenv==1.0.0
|
185 |
+
python-hostlist==1.23.0
|
186 |
+
pytorch-quantization==2.1.2
|
187 |
+
pytz==2023.3.post1
|
188 |
+
pyyaml==6.0.1
|
189 |
+
pyzmq==25.1.2
|
190 |
+
raft-dask==23.12.0
|
191 |
+
rapids-dask-dependency==23.12.1
|
192 |
+
referencing==0.33.0
|
193 |
+
regex==2023.12.25
|
194 |
+
requests-oauthlib==1.3.1
|
195 |
+
requests==2.31.0
|
196 |
+
rich==13.7.0
|
197 |
+
rmm==23.12.0
|
198 |
+
rpds-py==0.17.1
|
199 |
+
rsa==4.9
|
200 |
+
sacrebleu==2.4.0
|
201 |
+
safetensors==0.4.3
|
202 |
+
scikit-learn==1.2.0
|
203 |
+
scipy==1.12.0
|
204 |
+
send2trash==1.8.2
|
205 |
+
sentencepiece==0.1.99
|
206 |
+
sentry-sdk==2.12.0
|
207 |
+
setproctitle==1.3.3
|
208 |
+
setuptools==68.2.2
|
209 |
+
six==1.16.0
|
210 |
+
smart-open==6.4.0
|
211 |
+
smmap==5.0.1
|
212 |
+
sortedcontainers==2.4.0
|
213 |
+
soundfile==0.12.1
|
214 |
+
soupsieve==2.5
|
215 |
+
soxr==0.3.7
|
216 |
+
spacy-legacy==3.0.12
|
217 |
+
spacy-loggers==1.0.5
|
218 |
+
spacy==3.7.2
|
219 |
+
sphinx-glpi-theme==0.6
|
220 |
+
srsly==2.4.8
|
221 |
+
stack-data==0.6.3
|
222 |
+
sympy==1.12
|
223 |
+
tabulate==0.9.0
|
224 |
+
tbb==2021.11.0
|
225 |
+
tblib==3.0.0
|
226 |
+
tensorboard-data-server==0.6.1
|
227 |
+
tensorboard-plugin-wit==1.8.1
|
228 |
+
tensorboard==2.9.0
|
229 |
+
tensorrt==8.6.3
|
230 |
+
terminado==0.18.0
|
231 |
+
termplotlib==0.3.9
|
232 |
+
thinc==8.2.3
|
233 |
+
threadpoolctl==3.2.0
|
234 |
+
thriftpy2==0.4.17
|
235 |
+
tinycss2==1.2.1
|
236 |
+
tokenizers==0.19.1
|
237 |
+
toml==0.10.2
|
238 |
+
tomli==2.0.1
|
239 |
+
toolz==0.12.1
|
240 |
+
torch-tensorrt==2.3.0a0
|
241 |
+
torch==2.3.0a0+ebedce2
|
242 |
+
torchdata==0.7.1a0
|
243 |
+
torchtext==0.17.0a0
|
244 |
+
torchvision==0.18.0a0
|
245 |
+
tornado==6.4
|
246 |
+
tqdm==4.66.1
|
247 |
+
traitlets==5.9.0
|
248 |
+
transformer-engine==1.3.0+5b90b7f
|
249 |
+
transformers==4.43.3
|
250 |
+
treelite-runtime==3.9.1
|
251 |
+
treelite==3.9.1
|
252 |
+
triton==2.2.0+e28a256
|
253 |
+
typer==0.9.0
|
254 |
+
types-dataclasses==0.6.6
|
255 |
+
typing-extensions==4.9.0
|
256 |
+
ucx-py==0.35.0
|
257 |
+
uff==0.6.9
|
258 |
+
ujson==5.8.0
|
259 |
+
urllib3==1.26.18
|
260 |
+
wandb==0.16.3
|
261 |
+
wasabi==1.1.2
|
262 |
+
wcwidth==0.2.13
|
263 |
+
weasel==0.3.4
|
264 |
+
webencodings==0.5.1
|
265 |
+
werkzeug==3.0.1
|
266 |
+
wheel==0.42.0
|
267 |
+
xdoctest==1.0.2
|
268 |
+
xgboost==1.7.6
|
269 |
+
yarl==1.9.4
|
270 |
+
zict==3.0.0
|
271 |
+
zipp==3.17.0
|
wandb/run-20240804_021444-pk5j08lr/files/wandb-metadata.json
ADDED
@@ -0,0 +1,215 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
|
3 |
+
"python": "3.10.12",
|
4 |
+
"heartbeatAt": "2024-08-03T17:14:45.302596",
|
5 |
+
"startedAt": "2024-08-03T17:14:44.702200",
|
6 |
+
"docker": null,
|
7 |
+
"cuda": null,
|
8 |
+
"args": [
|
9 |
+
"--seq-length",
|
10 |
+
"1024",
|
11 |
+
"--sliding-window-size",
|
12 |
+
"4096",
|
13 |
+
"--micro-batch-size",
|
14 |
+
"8",
|
15 |
+
"--global-batch-size",
|
16 |
+
"320",
|
17 |
+
"--train-iters",
|
18 |
+
"20000",
|
19 |
+
"--tokenizer-type",
|
20 |
+
"Llama2Tokenizer",
|
21 |
+
"--tokenizer-model",
|
22 |
+
"/share/pretrained_lm/custom/tiny-mistral/tokenizer.model.v3",
|
23 |
+
"--train-data-path",
|
24 |
+
"4013541",
|
25 |
+
"/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
|
26 |
+
"--valid-data-path",
|
27 |
+
"4013541",
|
28 |
+
"/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
|
29 |
+
"--test-data-path",
|
30 |
+
"4013541",
|
31 |
+
"/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
|
32 |
+
"--lr",
|
33 |
+
"2e-5",
|
34 |
+
"--min-lr",
|
35 |
+
"1e-6",
|
36 |
+
"--lr-decay-style",
|
37 |
+
"cosine",
|
38 |
+
"--lr-warmup-iters",
|
39 |
+
"500",
|
40 |
+
"--lr-decay-iters",
|
41 |
+
"20000",
|
42 |
+
"--weight-decay",
|
43 |
+
"0.1",
|
44 |
+
"--grad-clip-norm",
|
45 |
+
"1.0",
|
46 |
+
"--optimizer",
|
47 |
+
"adam",
|
48 |
+
"--adam-beta1",
|
49 |
+
"0.9",
|
50 |
+
"--adam-beta2",
|
51 |
+
"0.95",
|
52 |
+
"--adam-eps",
|
53 |
+
"1e-6",
|
54 |
+
"--save-interval",
|
55 |
+
"200",
|
56 |
+
"--eval-interval",
|
57 |
+
"200",
|
58 |
+
"--eval-iters",
|
59 |
+
"10",
|
60 |
+
"--bf16",
|
61 |
+
"--mixed-precision",
|
62 |
+
"--base-model",
|
63 |
+
"/share/pretrained_lm/custom/tiny-mistral",
|
64 |
+
"--save",
|
65 |
+
"/work/llm_recipes/models/tiny-mistral-sample",
|
66 |
+
"--load",
|
67 |
+
"/work/llm_recipes/models/tiny-mistral-sample",
|
68 |
+
"--fsdp-activation-checkpointing",
|
69 |
+
"--sharding-strategy",
|
70 |
+
"FULL_SHARD",
|
71 |
+
"--checkpoint-type",
|
72 |
+
"LOCAL_STATE_DICT",
|
73 |
+
"--save-n-checkpoints",
|
74 |
+
"10",
|
75 |
+
"--hf-upload-retry-limit",
|
76 |
+
"2",
|
77 |
+
"--hf-repo-id",
|
78 |
+
"koichi12/tiny-mistral-sample",
|
79 |
+
"--wandb-entity",
|
80 |
+
"iwakawa-koichi-q5-tohoku-nlp6723",
|
81 |
+
"--wandb-project",
|
82 |
+
"llm_tutorial",
|
83 |
+
"--wandb-name",
|
84 |
+
"tiny-mistral-sample_train_2024-08-04-02:14:34"
|
85 |
+
],
|
86 |
+
"state": "running",
|
87 |
+
"program": "/project/examples/finetuning.py",
|
88 |
+
"codePathLocal": "examples/finetuning.py",
|
89 |
+
"codePath": "examples/finetuning.py",
|
90 |
+
"git": {
|
91 |
+
"remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
|
92 |
+
"commit": "3be5353210a678dc7008f237fa16b99f2bdf36ea"
|
93 |
+
},
|
94 |
+
"email": null,
|
95 |
+
"root": "/project",
|
96 |
+
"host": "gpu-koiwa-00",
|
97 |
+
"username": "koiwa",
|
98 |
+
"executable": "/usr/bin/python",
|
99 |
+
"cpu_count": 18,
|
100 |
+
"cpu_count_logical": 18,
|
101 |
+
"cpu_freq": {
|
102 |
+
"current": 2400.034,
|
103 |
+
"min": 0.0,
|
104 |
+
"max": 0.0
|
105 |
+
},
|
106 |
+
"cpu_freq_per_core": [
|
107 |
+
{
|
108 |
+
"current": 2400.034,
|
109 |
+
"min": 0.0,
|
110 |
+
"max": 0.0
|
111 |
+
},
|
112 |
+
{
|
113 |
+
"current": 2400.034,
|
114 |
+
"min": 0.0,
|
115 |
+
"max": 0.0
|
116 |
+
},
|
117 |
+
{
|
118 |
+
"current": 2400.034,
|
119 |
+
"min": 0.0,
|
120 |
+
"max": 0.0
|
121 |
+
},
|
122 |
+
{
|
123 |
+
"current": 2400.034,
|
124 |
+
"min": 0.0,
|
125 |
+
"max": 0.0
|
126 |
+
},
|
127 |
+
{
|
128 |
+
"current": 2400.034,
|
129 |
+
"min": 0.0,
|
130 |
+
"max": 0.0
|
131 |
+
},
|
132 |
+
{
|
133 |
+
"current": 2400.034,
|
134 |
+
"min": 0.0,
|
135 |
+
"max": 0.0
|
136 |
+
},
|
137 |
+
{
|
138 |
+
"current": 2400.034,
|
139 |
+
"min": 0.0,
|
140 |
+
"max": 0.0
|
141 |
+
},
|
142 |
+
{
|
143 |
+
"current": 2400.034,
|
144 |
+
"min": 0.0,
|
145 |
+
"max": 0.0
|
146 |
+
},
|
147 |
+
{
|
148 |
+
"current": 2400.034,
|
149 |
+
"min": 0.0,
|
150 |
+
"max": 0.0
|
151 |
+
},
|
152 |
+
{
|
153 |
+
"current": 2400.034,
|
154 |
+
"min": 0.0,
|
155 |
+
"max": 0.0
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"current": 2400.034,
|
159 |
+
"min": 0.0,
|
160 |
+
"max": 0.0
|
161 |
+
},
|
162 |
+
{
|
163 |
+
"current": 2400.034,
|
164 |
+
"min": 0.0,
|
165 |
+
"max": 0.0
|
166 |
+
},
|
167 |
+
{
|
168 |
+
"current": 2400.034,
|
169 |
+
"min": 0.0,
|
170 |
+
"max": 0.0
|
171 |
+
},
|
172 |
+
{
|
173 |
+
"current": 2400.034,
|
174 |
+
"min": 0.0,
|
175 |
+
"max": 0.0
|
176 |
+
},
|
177 |
+
{
|
178 |
+
"current": 2400.034,
|
179 |
+
"min": 0.0,
|
180 |
+
"max": 0.0
|
181 |
+
},
|
182 |
+
{
|
183 |
+
"current": 2400.034,
|
184 |
+
"min": 0.0,
|
185 |
+
"max": 0.0
|
186 |
+
},
|
187 |
+
{
|
188 |
+
"current": 2400.034,
|
189 |
+
"min": 0.0,
|
190 |
+
"max": 0.0
|
191 |
+
},
|
192 |
+
{
|
193 |
+
"current": 2400.034,
|
194 |
+
"min": 0.0,
|
195 |
+
"max": 0.0
|
196 |
+
}
|
197 |
+
],
|
198 |
+
"disk": {
|
199 |
+
"/": {
|
200 |
+
"total": 0.0625,
|
201 |
+
"used": 1.1444091796875e-05
|
202 |
+
}
|
203 |
+
},
|
204 |
+
"gpu": "NVIDIA A100-SXM4-40GB",
|
205 |
+
"gpu_count": 1,
|
206 |
+
"gpu_devices": [
|
207 |
+
{
|
208 |
+
"name": "NVIDIA A100-SXM4-40GB",
|
209 |
+
"memory_total": 42949672960
|
210 |
+
}
|
211 |
+
],
|
212 |
+
"memory": {
|
213 |
+
"total": 56.48782730102539
|
214 |
+
}
|
215 |
+
}
|
wandb/run-20240804_021444-pk5j08lr/files/wandb-summary.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"_wandb": {"runtime": 5}}
|
wandb/run-20240804_021444-pk5j08lr/logs/debug-internal.log
ADDED
@@ -0,0 +1,191 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
2024-08-04 02:14:44,716 INFO StreamThr :11553 [internal.py:wandb_internal():86] W&B internal server running at pid: 11553, started at: 2024-08-04 02:14:44.715209
|
2 |
+
2024-08-04 02:14:44,717 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: status
|
3 |
+
2024-08-04 02:14:44,719 INFO WriterThread:11553 [datastore.py:open_for_write():87] open: /project/wandb/run-20240804_021444-pk5j08lr/run-pk5j08lr.wandb
|
4 |
+
2024-08-04 02:14:44,720 DEBUG SenderThread:11553 [sender.py:send():382] send: header
|
5 |
+
2024-08-04 02:14:44,733 DEBUG SenderThread:11553 [sender.py:send():382] send: run
|
6 |
+
2024-08-04 02:14:45,190 INFO SenderThread:11553 [dir_watcher.py:__init__():211] watching files in: /project/wandb/run-20240804_021444-pk5j08lr/files
|
7 |
+
2024-08-04 02:14:45,190 INFO SenderThread:11553 [sender.py:_start_run_threads():1136] run started: pk5j08lr with start time 1722705284.714592
|
8 |
+
2024-08-04 02:14:45,195 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: check_version
|
9 |
+
2024-08-04 02:14:45,195 DEBUG SenderThread:11553 [sender.py:send_request():409] send_request: check_version
|
10 |
+
2024-08-04 02:14:45,280 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: run_start
|
11 |
+
2024-08-04 02:14:45,286 DEBUG HandlerThread:11553 [system_info.py:__init__():27] System info init
|
12 |
+
2024-08-04 02:14:45,286 DEBUG HandlerThread:11553 [system_info.py:__init__():42] System info init done
|
13 |
+
2024-08-04 02:14:45,286 INFO HandlerThread:11553 [system_monitor.py:start():194] Starting system monitor
|
14 |
+
2024-08-04 02:14:45,286 INFO SystemMonitor:11553 [system_monitor.py:_start():158] Starting system asset monitoring threads
|
15 |
+
2024-08-04 02:14:45,287 INFO HandlerThread:11553 [system_monitor.py:probe():214] Collecting system info
|
16 |
+
2024-08-04 02:14:45,287 INFO SystemMonitor:11553 [interfaces.py:start():190] Started cpu monitoring
|
17 |
+
2024-08-04 02:14:45,288 INFO SystemMonitor:11553 [interfaces.py:start():190] Started disk monitoring
|
18 |
+
2024-08-04 02:14:45,289 INFO SystemMonitor:11553 [interfaces.py:start():190] Started gpu monitoring
|
19 |
+
2024-08-04 02:14:45,289 INFO SystemMonitor:11553 [interfaces.py:start():190] Started memory monitoring
|
20 |
+
2024-08-04 02:14:45,290 INFO SystemMonitor:11553 [interfaces.py:start():190] Started network monitoring
|
21 |
+
2024-08-04 02:14:45,302 DEBUG HandlerThread:11553 [system_info.py:probe():151] Probing system
|
22 |
+
2024-08-04 02:14:45,304 DEBUG HandlerThread:11553 [system_info.py:_probe_git():136] Probing git
|
23 |
+
2024-08-04 02:14:45,315 DEBUG HandlerThread:11553 [system_info.py:_probe_git():144] Probing git done
|
24 |
+
2024-08-04 02:14:45,315 DEBUG HandlerThread:11553 [system_info.py:probe():199] Probing system done
|
25 |
+
2024-08-04 02:14:45,315 DEBUG HandlerThread:11553 [system_monitor.py:probe():223] {'os': 'Linux-5.15.0-91-generic-x86_64-with-glibc2.35', 'python': '3.10.12', 'heartbeatAt': '2024-08-03T17:14:45.302596', 'startedAt': '2024-08-03T17:14:44.702200', 'docker': None, 'cuda': None, 'args': ('--seq-length', '1024', '--sliding-window-size', '4096', '--micro-batch-size', '8', '--global-batch-size', '320', '--train-iters', '20000', '--tokenizer-type', 'Llama2Tokenizer', '--tokenizer-model', '/share/pretrained_lm/custom/tiny-mistral/tokenizer.model.v3', '--train-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--valid-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--test-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--lr', '2e-5', '--min-lr', '1e-6', '--lr-decay-style', 'cosine', '--lr-warmup-iters', '500', '--lr-decay-iters', '20000', '--weight-decay', '0.1', '--grad-clip-norm', '1.0', '--optimizer', 'adam', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-6', '--save-interval', '200', '--eval-interval', '200', '--eval-iters', '10', '--bf16', '--mixed-precision', '--base-model', '/share/pretrained_lm/custom/tiny-mistral', '--save', '/work/llm_recipes/models/tiny-mistral-sample', '--load', '/work/llm_recipes/models/tiny-mistral-sample', '--fsdp-activation-checkpointing', '--sharding-strategy', 'FULL_SHARD', '--checkpoint-type', 'LOCAL_STATE_DICT', '--save-n-checkpoints', '10', '--hf-upload-retry-limit', '2', '--hf-repo-id', 'koichi12/tiny-mistral-sample', '--wandb-entity', 'iwakawa-koichi-q5-tohoku-nlp6723', '--wandb-project', 'llm_tutorial', '--wandb-name', 'tiny-mistral-sample_train_2024-08-04-02:14:34'), 'state': 'running', 'program': '/project/examples/finetuning.py', 'codePathLocal': 'examples/finetuning.py', 'codePath': 'examples/finetuning.py', 'git': {'remote': 'https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git', 'commit': '3be5353210a678dc7008f237fa16b99f2bdf36ea'}, 'email': None, 'root': '/project', 'host': 'gpu-koiwa-00', 'username': 'koiwa', 'executable': '/usr/bin/python', 'cpu_count': 18, 'cpu_count_logical': 18, 'cpu_freq': {'current': 2400.034, 'min': 0.0, 'max': 0.0}, 'cpu_freq_per_core': [{'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}], 'disk': {'/': {'total': 0.0625, 'used': 1.1444091796875e-05}}, 'gpu': 'NVIDIA A100-SXM4-40GB', 'gpu_count': 1, 'gpu_devices': [{'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}], 'memory': {'total': 56.48782730102539}}
|
26 |
+
2024-08-04 02:14:45,315 INFO HandlerThread:11553 [system_monitor.py:probe():224] Finished collecting system info
|
27 |
+
2024-08-04 02:14:45,316 INFO HandlerThread:11553 [system_monitor.py:probe():227] Publishing system info
|
28 |
+
2024-08-04 02:14:45,317 INFO HandlerThread:11553 [system_monitor.py:probe():229] Finished publishing system info
|
29 |
+
2024-08-04 02:14:45,323 DEBUG SenderThread:11553 [sender.py:send():382] send: files
|
30 |
+
2024-08-04 02:14:45,323 INFO SenderThread:11553 [sender.py:_save_file():1403] saving file wandb-metadata.json with policy now
|
31 |
+
2024-08-04 02:14:45,332 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: python_packages
|
32 |
+
2024-08-04 02:14:45,332 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: stop_status
|
33 |
+
2024-08-04 02:14:45,332 DEBUG SenderThread:11553 [sender.py:send_request():409] send_request: python_packages
|
34 |
+
2024-08-04 02:14:45,333 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: internal_messages
|
35 |
+
2024-08-04 02:14:45,334 DEBUG SenderThread:11553 [sender.py:send_request():409] send_request: stop_status
|
36 |
+
2024-08-04 02:14:45,580 DEBUG SenderThread:11553 [sender.py:send():382] send: telemetry
|
37 |
+
2024-08-04 02:14:46,067 INFO wandb-upload_0:11553 [upload_job.py:push():131] Uploaded file /tmp/tmp8oqwu4dewandb/gzg3ga4a-wandb-metadata.json
|
38 |
+
2024-08-04 02:14:46,191 INFO Thread-12 :11553 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_021444-pk5j08lr/files/wandb-metadata.json
|
39 |
+
2024-08-04 02:14:46,192 INFO Thread-12 :11553 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_021444-pk5j08lr/files/output.log
|
40 |
+
2024-08-04 02:14:46,192 INFO Thread-12 :11553 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_021444-pk5j08lr/files/requirements.txt
|
41 |
+
2024-08-04 02:14:48,192 INFO Thread-12 :11553 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_021444-pk5j08lr/files/output.log
|
42 |
+
2024-08-04 02:14:49,192 INFO Thread-12 :11553 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_021444-pk5j08lr/files/output.log
|
43 |
+
2024-08-04 02:14:50,179 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: status_report
|
44 |
+
2024-08-04 02:14:50,193 INFO Thread-12 :11553 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_021444-pk5j08lr/files/output.log
|
45 |
+
2024-08-04 02:14:50,882 DEBUG SenderThread:11553 [sender.py:send():382] send: config
|
46 |
+
2024-08-04 02:14:50,882 DEBUG SenderThread:11553 [sender.py:send():382] send: config
|
47 |
+
2024-08-04 02:14:51,067 DEBUG SenderThread:11553 [sender.py:send():382] send: exit
|
48 |
+
2024-08-04 02:14:51,067 INFO SenderThread:11553 [sender.py:send_exit():589] handling exit code: 0
|
49 |
+
2024-08-04 02:14:51,067 INFO SenderThread:11553 [sender.py:send_exit():591] handling runtime: 5
|
50 |
+
2024-08-04 02:14:51,068 INFO SenderThread:11553 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
51 |
+
2024-08-04 02:14:51,068 INFO SenderThread:11553 [sender.py:send_exit():597] send defer
|
52 |
+
2024-08-04 02:14:51,068 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: defer
|
53 |
+
2024-08-04 02:14:51,068 INFO HandlerThread:11553 [handler.py:handle_request_defer():172] handle defer: 0
|
54 |
+
2024-08-04 02:14:51,069 DEBUG SenderThread:11553 [sender.py:send_request():409] send_request: defer
|
55 |
+
2024-08-04 02:14:51,069 INFO SenderThread:11553 [sender.py:send_request_defer():613] handle sender defer: 0
|
56 |
+
2024-08-04 02:14:51,069 INFO SenderThread:11553 [sender.py:transition_state():617] send defer: 1
|
57 |
+
2024-08-04 02:14:51,069 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: defer
|
58 |
+
2024-08-04 02:14:51,069 INFO HandlerThread:11553 [handler.py:handle_request_defer():172] handle defer: 1
|
59 |
+
2024-08-04 02:14:51,069 DEBUG SenderThread:11553 [sender.py:send_request():409] send_request: defer
|
60 |
+
2024-08-04 02:14:51,069 INFO SenderThread:11553 [sender.py:send_request_defer():613] handle sender defer: 1
|
61 |
+
2024-08-04 02:14:51,069 INFO SenderThread:11553 [sender.py:transition_state():617] send defer: 2
|
62 |
+
2024-08-04 02:14:51,069 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: defer
|
63 |
+
2024-08-04 02:14:51,069 INFO HandlerThread:11553 [handler.py:handle_request_defer():172] handle defer: 2
|
64 |
+
2024-08-04 02:14:51,069 INFO HandlerThread:11553 [system_monitor.py:finish():203] Stopping system monitor
|
65 |
+
2024-08-04 02:14:51,069 DEBUG SystemMonitor:11553 [system_monitor.py:_start():172] Starting system metrics aggregation loop
|
66 |
+
2024-08-04 02:14:51,070 INFO HandlerThread:11553 [interfaces.py:finish():202] Joined cpu monitor
|
67 |
+
2024-08-04 02:14:51,070 DEBUG SystemMonitor:11553 [system_monitor.py:_start():179] Finished system metrics aggregation loop
|
68 |
+
2024-08-04 02:14:51,070 INFO HandlerThread:11553 [interfaces.py:finish():202] Joined disk monitor
|
69 |
+
2024-08-04 02:14:51,070 DEBUG SystemMonitor:11553 [system_monitor.py:_start():183] Publishing last batch of metrics
|
70 |
+
2024-08-04 02:14:51,103 INFO HandlerThread:11553 [interfaces.py:finish():202] Joined gpu monitor
|
71 |
+
2024-08-04 02:14:51,103 INFO HandlerThread:11553 [interfaces.py:finish():202] Joined memory monitor
|
72 |
+
2024-08-04 02:14:51,103 INFO HandlerThread:11553 [interfaces.py:finish():202] Joined network monitor
|
73 |
+
2024-08-04 02:14:51,104 DEBUG SenderThread:11553 [sender.py:send_request():409] send_request: defer
|
74 |
+
2024-08-04 02:14:51,104 INFO SenderThread:11553 [sender.py:send_request_defer():613] handle sender defer: 2
|
75 |
+
2024-08-04 02:14:51,104 INFO SenderThread:11553 [sender.py:transition_state():617] send defer: 3
|
76 |
+
2024-08-04 02:14:51,104 DEBUG SenderThread:11553 [sender.py:send():382] send: stats
|
77 |
+
2024-08-04 02:14:51,104 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: defer
|
78 |
+
2024-08-04 02:14:51,104 INFO HandlerThread:11553 [handler.py:handle_request_defer():172] handle defer: 3
|
79 |
+
2024-08-04 02:14:51,104 DEBUG SenderThread:11553 [sender.py:send_request():409] send_request: defer
|
80 |
+
2024-08-04 02:14:51,104 INFO SenderThread:11553 [sender.py:send_request_defer():613] handle sender defer: 3
|
81 |
+
2024-08-04 02:14:51,105 INFO SenderThread:11553 [sender.py:transition_state():617] send defer: 4
|
82 |
+
2024-08-04 02:14:51,105 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: defer
|
83 |
+
2024-08-04 02:14:51,105 INFO HandlerThread:11553 [handler.py:handle_request_defer():172] handle defer: 4
|
84 |
+
2024-08-04 02:14:51,105 DEBUG SenderThread:11553 [sender.py:send_request():409] send_request: defer
|
85 |
+
2024-08-04 02:14:51,105 INFO SenderThread:11553 [sender.py:send_request_defer():613] handle sender defer: 4
|
86 |
+
2024-08-04 02:14:51,105 INFO SenderThread:11553 [sender.py:transition_state():617] send defer: 5
|
87 |
+
2024-08-04 02:14:51,105 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: defer
|
88 |
+
2024-08-04 02:14:51,105 INFO HandlerThread:11553 [handler.py:handle_request_defer():172] handle defer: 5
|
89 |
+
2024-08-04 02:14:51,105 DEBUG SenderThread:11553 [sender.py:send():382] send: summary
|
90 |
+
2024-08-04 02:14:51,106 INFO SenderThread:11553 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
91 |
+
2024-08-04 02:14:51,106 DEBUG SenderThread:11553 [sender.py:send_request():409] send_request: defer
|
92 |
+
2024-08-04 02:14:51,106 INFO SenderThread:11553 [sender.py:send_request_defer():613] handle sender defer: 5
|
93 |
+
2024-08-04 02:14:51,106 INFO SenderThread:11553 [sender.py:transition_state():617] send defer: 6
|
94 |
+
2024-08-04 02:14:51,106 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: defer
|
95 |
+
2024-08-04 02:14:51,106 INFO HandlerThread:11553 [handler.py:handle_request_defer():172] handle defer: 6
|
96 |
+
2024-08-04 02:14:51,107 DEBUG SenderThread:11553 [sender.py:send_request():409] send_request: defer
|
97 |
+
2024-08-04 02:14:51,107 INFO SenderThread:11553 [sender.py:send_request_defer():613] handle sender defer: 6
|
98 |
+
2024-08-04 02:14:51,109 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: status_report
|
99 |
+
2024-08-04 02:14:51,194 INFO Thread-12 :11553 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_021444-pk5j08lr/files/output.log
|
100 |
+
2024-08-04 02:14:51,194 INFO Thread-12 :11553 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_021444-pk5j08lr/files/wandb-summary.json
|
101 |
+
2024-08-04 02:14:51,396 INFO SenderThread:11553 [sender.py:transition_state():617] send defer: 7
|
102 |
+
2024-08-04 02:14:51,396 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: defer
|
103 |
+
2024-08-04 02:14:51,396 INFO HandlerThread:11553 [handler.py:handle_request_defer():172] handle defer: 7
|
104 |
+
2024-08-04 02:14:51,396 DEBUG SenderThread:11553 [sender.py:send_request():409] send_request: defer
|
105 |
+
2024-08-04 02:14:51,396 INFO SenderThread:11553 [sender.py:send_request_defer():613] handle sender defer: 7
|
106 |
+
2024-08-04 02:14:52,066 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: poll_exit
|
107 |
+
2024-08-04 02:14:52,195 INFO Thread-12 :11553 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_021444-pk5j08lr/files/config.yaml
|
108 |
+
2024-08-04 02:14:52,195 INFO Thread-12 :11553 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_021444-pk5j08lr/files/output.log
|
109 |
+
2024-08-04 02:14:52,692 INFO SenderThread:11553 [sender.py:transition_state():617] send defer: 8
|
110 |
+
2024-08-04 02:14:52,692 DEBUG SenderThread:11553 [sender.py:send_request():409] send_request: poll_exit
|
111 |
+
2024-08-04 02:14:52,692 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: defer
|
112 |
+
2024-08-04 02:14:52,692 INFO HandlerThread:11553 [handler.py:handle_request_defer():172] handle defer: 8
|
113 |
+
2024-08-04 02:14:52,692 DEBUG SenderThread:11553 [sender.py:send_request():409] send_request: defer
|
114 |
+
2024-08-04 02:14:52,693 INFO SenderThread:11553 [sender.py:send_request_defer():613] handle sender defer: 8
|
115 |
+
2024-08-04 02:14:52,693 INFO SenderThread:11553 [job_builder.py:build():296] Attempting to build job artifact
|
116 |
+
2024-08-04 02:14:52,693 INFO SenderThread:11553 [job_builder.py:_get_source_type():426] is repo sourced job
|
117 |
+
2024-08-04 02:14:52,707 INFO SenderThread:11553 [job_builder.py:build():402] adding wandb-job metadata file
|
118 |
+
2024-08-04 02:14:52,715 INFO SenderThread:11553 [sender.py:transition_state():617] send defer: 9
|
119 |
+
2024-08-04 02:14:52,716 DEBUG SenderThread:11553 [sender.py:send():382] send: artifact
|
120 |
+
2024-08-04 02:14:52,716 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: defer
|
121 |
+
2024-08-04 02:14:52,717 INFO HandlerThread:11553 [handler.py:handle_request_defer():172] handle defer: 9
|
122 |
+
2024-08-04 02:14:53,067 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: poll_exit
|
123 |
+
2024-08-04 02:14:53,195 INFO Thread-12 :11553 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_021444-pk5j08lr/files/output.log
|
124 |
+
2024-08-04 02:14:53,655 INFO SenderThread:11553 [sender.py:send_artifact():1494] sent artifact job-https___github.com_cl-tohoku_llm-recipes-failab-m1-yans.git_examples_finetuning.py - {'id': 'QXJ0aWZhY3Q6MTA5MTk2NTkzOA==', 'state': 'COMMITTED', 'artifactSequence': {'id': 'QXJ0aWZhY3RDb2xsZWN0aW9uOjM2MjY3MjMzNA==', 'latestArtifact': {'id': 'QXJ0aWZhY3Q6MTA5MzUzODM4NQ==', 'versionIndex': 3}}}
|
125 |
+
2024-08-04 02:14:53,655 DEBUG SenderThread:11553 [sender.py:send_request():409] send_request: defer
|
126 |
+
2024-08-04 02:14:53,655 INFO SenderThread:11553 [sender.py:send_request_defer():613] handle sender defer: 9
|
127 |
+
2024-08-04 02:14:53,655 INFO SenderThread:11553 [dir_watcher.py:finish():358] shutting down directory watcher
|
128 |
+
2024-08-04 02:14:54,196 INFO SenderThread:11553 [dir_watcher.py:finish():388] scan: /project/wandb/run-20240804_021444-pk5j08lr/files
|
129 |
+
2024-08-04 02:14:54,197 INFO SenderThread:11553 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_021444-pk5j08lr/files/requirements.txt requirements.txt
|
130 |
+
2024-08-04 02:14:54,197 INFO SenderThread:11553 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_021444-pk5j08lr/files/config.yaml config.yaml
|
131 |
+
2024-08-04 02:14:54,198 INFO SenderThread:11553 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_021444-pk5j08lr/files/wandb-metadata.json wandb-metadata.json
|
132 |
+
2024-08-04 02:14:54,198 INFO SenderThread:11553 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_021444-pk5j08lr/files/wandb-summary.json wandb-summary.json
|
133 |
+
2024-08-04 02:14:54,200 INFO SenderThread:11553 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_021444-pk5j08lr/files/output.log output.log
|
134 |
+
2024-08-04 02:14:54,200 INFO SenderThread:11553 [sender.py:transition_state():617] send defer: 10
|
135 |
+
2024-08-04 02:14:54,202 DEBUG SenderThread:11553 [sender.py:send_request():409] send_request: poll_exit
|
136 |
+
2024-08-04 02:14:54,202 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: defer
|
137 |
+
2024-08-04 02:14:54,205 INFO HandlerThread:11553 [handler.py:handle_request_defer():172] handle defer: 10
|
138 |
+
2024-08-04 02:14:54,206 DEBUG SenderThread:11553 [sender.py:send_request():409] send_request: defer
|
139 |
+
2024-08-04 02:14:54,206 INFO SenderThread:11553 [sender.py:send_request_defer():613] handle sender defer: 10
|
140 |
+
2024-08-04 02:14:54,206 INFO SenderThread:11553 [file_pusher.py:finish():172] shutting down file pusher
|
141 |
+
2024-08-04 02:14:54,605 INFO wandb-upload_1:11553 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_021444-pk5j08lr/files/config.yaml
|
142 |
+
2024-08-04 02:14:54,711 INFO wandb-upload_0:11553 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_021444-pk5j08lr/files/requirements.txt
|
143 |
+
2024-08-04 02:14:54,762 INFO wandb-upload_2:11553 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_021444-pk5j08lr/files/wandb-summary.json
|
144 |
+
2024-08-04 02:14:54,792 INFO wandb-upload_3:11553 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_021444-pk5j08lr/files/output.log
|
145 |
+
2024-08-04 02:14:54,992 INFO Thread-11 (_thread_body):11553 [sender.py:transition_state():617] send defer: 11
|
146 |
+
2024-08-04 02:14:54,992 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: defer
|
147 |
+
2024-08-04 02:14:54,992 INFO HandlerThread:11553 [handler.py:handle_request_defer():172] handle defer: 11
|
148 |
+
2024-08-04 02:14:54,992 DEBUG SenderThread:11553 [sender.py:send_request():409] send_request: defer
|
149 |
+
2024-08-04 02:14:54,992 INFO SenderThread:11553 [sender.py:send_request_defer():613] handle sender defer: 11
|
150 |
+
2024-08-04 02:14:54,993 INFO SenderThread:11553 [file_pusher.py:join():178] waiting for file pusher
|
151 |
+
2024-08-04 02:14:54,993 INFO SenderThread:11553 [sender.py:transition_state():617] send defer: 12
|
152 |
+
2024-08-04 02:14:54,993 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: defer
|
153 |
+
2024-08-04 02:14:54,993 INFO HandlerThread:11553 [handler.py:handle_request_defer():172] handle defer: 12
|
154 |
+
2024-08-04 02:14:54,993 DEBUG SenderThread:11553 [sender.py:send_request():409] send_request: defer
|
155 |
+
2024-08-04 02:14:54,993 INFO SenderThread:11553 [sender.py:send_request_defer():613] handle sender defer: 12
|
156 |
+
2024-08-04 02:14:54,993 INFO SenderThread:11553 [file_stream.py:finish():595] file stream finish called
|
157 |
+
2024-08-04 02:14:55,067 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: poll_exit
|
158 |
+
2024-08-04 02:14:55,176 INFO SenderThread:11553 [file_stream.py:finish():599] file stream finish is done
|
159 |
+
2024-08-04 02:14:55,176 INFO SenderThread:11553 [sender.py:transition_state():617] send defer: 13
|
160 |
+
2024-08-04 02:14:55,176 DEBUG SenderThread:11553 [sender.py:send_request():409] send_request: poll_exit
|
161 |
+
2024-08-04 02:14:55,176 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: defer
|
162 |
+
2024-08-04 02:14:55,177 INFO HandlerThread:11553 [handler.py:handle_request_defer():172] handle defer: 13
|
163 |
+
2024-08-04 02:14:55,177 DEBUG SenderThread:11553 [sender.py:send_request():409] send_request: defer
|
164 |
+
2024-08-04 02:14:55,177 INFO SenderThread:11553 [sender.py:send_request_defer():613] handle sender defer: 13
|
165 |
+
2024-08-04 02:14:55,177 INFO SenderThread:11553 [sender.py:transition_state():617] send defer: 14
|
166 |
+
2024-08-04 02:14:55,177 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: defer
|
167 |
+
2024-08-04 02:14:55,177 DEBUG SenderThread:11553 [sender.py:send():382] send: final
|
168 |
+
2024-08-04 02:14:55,177 INFO HandlerThread:11553 [handler.py:handle_request_defer():172] handle defer: 14
|
169 |
+
2024-08-04 02:14:55,177 DEBUG SenderThread:11553 [sender.py:send():382] send: footer
|
170 |
+
2024-08-04 02:14:55,178 DEBUG SenderThread:11553 [sender.py:send_request():409] send_request: defer
|
171 |
+
2024-08-04 02:14:55,178 INFO SenderThread:11553 [sender.py:send_request_defer():613] handle sender defer: 14
|
172 |
+
2024-08-04 02:14:55,178 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: poll_exit
|
173 |
+
2024-08-04 02:14:55,178 DEBUG SenderThread:11553 [sender.py:send_request():409] send_request: poll_exit
|
174 |
+
2024-08-04 02:14:55,178 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: poll_exit
|
175 |
+
2024-08-04 02:14:55,179 DEBUG SenderThread:11553 [sender.py:send_request():409] send_request: poll_exit
|
176 |
+
2024-08-04 02:14:55,179 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: server_info
|
177 |
+
2024-08-04 02:14:55,179 DEBUG SenderThread:11553 [sender.py:send_request():409] send_request: server_info
|
178 |
+
2024-08-04 02:14:55,180 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: get_summary
|
179 |
+
2024-08-04 02:14:55,181 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: sampled_history
|
180 |
+
2024-08-04 02:14:55,181 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: internal_messages
|
181 |
+
2024-08-04 02:14:55,181 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: job_info
|
182 |
+
2024-08-04 02:14:55,346 DEBUG SenderThread:11553 [sender.py:send_request():409] send_request: job_info
|
183 |
+
2024-08-04 02:14:55,346 INFO MainThread:11553 [wandb_run.py:_footer_history_summary_info():3866] rendering history
|
184 |
+
2024-08-04 02:14:55,346 INFO MainThread:11553 [wandb_run.py:_footer_history_summary_info():3898] rendering summary
|
185 |
+
2024-08-04 02:14:55,346 INFO MainThread:11553 [wandb_run.py:_footer_sync_info():3825] logging synced files
|
186 |
+
2024-08-04 02:14:55,346 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: shutdown
|
187 |
+
2024-08-04 02:14:55,346 INFO HandlerThread:11553 [handler.py:finish():869] shutting down handler
|
188 |
+
2024-08-04 02:14:56,181 INFO WriterThread:11553 [datastore.py:close():296] close: /project/wandb/run-20240804_021444-pk5j08lr/run-pk5j08lr.wandb
|
189 |
+
2024-08-04 02:14:56,346 INFO SenderThread:11553 [sender.py:finish():1572] shutting down sender
|
190 |
+
2024-08-04 02:14:56,346 INFO SenderThread:11553 [file_pusher.py:finish():172] shutting down file pusher
|
191 |
+
2024-08-04 02:14:56,346 INFO SenderThread:11553 [file_pusher.py:join():178] waiting for file pusher
|
wandb/run-20240804_021444-pk5j08lr/logs/debug.log
ADDED
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
2024-08-04 02:14:44,708 INFO MainThread:11482 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
|
2 |
+
2024-08-04 02:14:44,708 INFO MainThread:11482 [wandb_setup.py:_flush():76] Configure stats pid to 11482
|
3 |
+
2024-08-04 02:14:44,708 INFO MainThread:11482 [wandb_setup.py:_flush():76] Loading settings from /singularity_home/.config/wandb/settings
|
4 |
+
2024-08-04 02:14:44,708 INFO MainThread:11482 [wandb_setup.py:_flush():76] Loading settings from /project/wandb/settings
|
5 |
+
2024-08-04 02:14:44,708 INFO MainThread:11482 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'api_key': '***REDACTED***', 'run_notes': 'Train tuny llama sample'}
|
6 |
+
2024-08-04 02:14:44,708 INFO MainThread:11482 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
|
7 |
+
2024-08-04 02:14:44,708 INFO MainThread:11482 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'examples/finetuning.py', 'program_abspath': '/project/examples/finetuning.py', 'program': '/project/examples/finetuning.py'}
|
8 |
+
2024-08-04 02:14:44,708 INFO MainThread:11482 [wandb_init.py:_log_setup():526] Logging user logs to /project/wandb/run-20240804_021444-pk5j08lr/logs/debug.log
|
9 |
+
2024-08-04 02:14:44,708 INFO MainThread:11482 [wandb_init.py:_log_setup():527] Logging internal logs to /project/wandb/run-20240804_021444-pk5j08lr/logs/debug-internal.log
|
10 |
+
2024-08-04 02:14:44,708 INFO MainThread:11482 [wandb_init.py:init():566] calling init triggers
|
11 |
+
2024-08-04 02:14:44,708 INFO MainThread:11482 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
|
12 |
+
config: {'sharding_strategy': 'FULL_SHARD', 'checkpoint_type': 'LOCAL_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'low_cpu_fsdp': False, 'no_meta_device': False, 'data_path': None, 'split': '969, 30, 1', 'train_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'valid_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'test_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'data_cache_path': None, 'vocab_size': None, 'vocab_file': None, 'merge_file': None, 'seq_length': 1024, 'num_workers': 2, 'tokenizer_type': 'Llama2Tokenizer', 'tokenizer_model': '/share/pretrained_lm/custom/tiny-mistral/tokenizer.model.v3', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'retro_return_doc_ids': False, 'short_seq_prob': 0.1, 'vocab_extra_ids': 0, 'seed': 1234, 'use_mpi': False, 'wandb_entity': 'iwakawa-koichi-q5-tohoku-nlp6723', 'wandb_name': 'tiny-mistral-sample_train_2024-08-04-02:14:34', 'wandb_project': 'llm_tutorial', 'quantization': False, 'use_freeze_layers': False, 'freeze_layers': None, 'bf16': True, 'fp16': False, 'mixed_precision': True, 'param_dtype': None, 'load': '/work/llm_recipes/models/tiny-mistral-sample', 'save': '/work/llm_recipes/models/tiny-mistral-sample', 'base_model': '/share/pretrained_lm/custom/tiny-mistral', 'use_better_transformer': False, 'grad_clip_norm': 1.0, 'eval_interval': 200, 'save_interval': 200, 'eval_iters': 10, 'optimizer': 'adam', 'lr': 2e-05, 'lr_decay_style': 'cosine', 'lr_decay_iters': 20000, 'lr_warmup_iters': 500, 'min_lr': 1e-06, 'train_iters': 20000, 'train_samples': None, 'global_batch_size': 320, 'micro_batch_size': 8, 'make_vocab_size_divisible_by': 128, 'sliding_window_size': 4096, 'skip_batch': None, 'no_save_optimizer_state': False, 'continual_pretraining': False, 'instruction_tuning': False, 'direct_preference_optimization': False, 'attention_dropout': 0.1, 'hidden_dropout': 0.1, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-06, 'hf_transformer_model_dir': None, 'instruction_train_data_path': None, 'instruction_valid_data_path': None, 'epoch': None, 'instruction_dataset_size': None, 'save_sampler_state': False, 'label_smoothing': 0.0, 'save_n_checkpoints': 10, 'hf_repo_id': 'koichi12/tiny-mistral-sample', 'create_public_hf_repo': False, 'upload_all_checkpoints_to_hf': False, 'hf_upload_retry_limit': 2, 'exit_duration_in_mins': None, 'source_key': None, 'target_key': None, 'attn_implementation': 'flash_attention_2', 'efficient_instruction_tuning': False, 'remove_padding_masking': False, 'save_start_iter': None, 'rank': 0, 'world_size': 1, 'padded_vocab_size': 32768, 'gradient_accumulation_steps': 40}
|
13 |
+
2024-08-04 02:14:44,708 INFO MainThread:11482 [wandb_init.py:init():616] starting backend
|
14 |
+
2024-08-04 02:14:44,708 INFO MainThread:11482 [wandb_init.py:init():620] setting up manager
|
15 |
+
2024-08-04 02:14:44,713 INFO MainThread:11482 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
|
16 |
+
2024-08-04 02:14:44,714 INFO MainThread:11482 [wandb_init.py:init():628] backend started and connected
|
17 |
+
2024-08-04 02:14:44,719 INFO MainThread:11482 [wandb_init.py:init():720] updated telemetry
|
18 |
+
2024-08-04 02:14:44,729 INFO MainThread:11482 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
|
19 |
+
2024-08-04 02:14:45,194 INFO MainThread:11482 [wandb_run.py:_on_init():2262] communicating current version
|
20 |
+
2024-08-04 02:14:45,273 INFO MainThread:11482 [wandb_run.py:_on_init():2271] got version response upgrade_message: "wandb version 0.17.5 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
|
21 |
+
|
22 |
+
2024-08-04 02:14:45,274 INFO MainThread:11482 [wandb_init.py:init():804] starting run threads in backend
|
23 |
+
2024-08-04 02:14:45,331 INFO MainThread:11482 [wandb_run.py:_console_start():2241] atexit reg
|
24 |
+
2024-08-04 02:14:45,331 INFO MainThread:11482 [wandb_run.py:_redirect():2096] redirect: wrap_raw
|
25 |
+
2024-08-04 02:14:45,332 INFO MainThread:11482 [wandb_run.py:_redirect():2161] Wrapping output streams.
|
26 |
+
2024-08-04 02:14:45,332 INFO MainThread:11482 [wandb_run.py:_redirect():2186] Redirects installed.
|
27 |
+
2024-08-04 02:14:45,333 INFO MainThread:11482 [wandb_init.py:init():847] run started, returning control to user process
|
28 |
+
2024-08-04 02:14:50,881 INFO MainThread:11482 [wandb_run.py:_config_callback():1343] config_cb None None {'activation_function': 'silu', 'hidden_size': 256, 'model_type': 'mistral', 'max_position_embeddings': 1024, 'num_attention_heads': 4, 'num_hidden_layers': 4, 'model_architecture': 'MistralForCausalLM'}
|
29 |
+
2024-08-04 02:14:50,881 INFO MainThread:11482 [wandb_run.py:_config_callback():1343] config_cb None None {'world_size': 1}
|
30 |
+
2024-08-04 02:14:56,347 WARNING MsgRouterThr:11482 [router.py:message_loop():77] message_loop has been closed
|
wandb/run-20240804_021444-pk5j08lr/run-pk5j08lr.wandb
ADDED
Binary file (17.1 kB). View file
|
|
wandb/run-20240804_144007-dds6qqbt/files/config.yaml
ADDED
@@ -0,0 +1,335 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
wandb_version: 1
|
2 |
+
|
3 |
+
sharding_strategy:
|
4 |
+
desc: null
|
5 |
+
value: FULL_SHARD
|
6 |
+
checkpoint_type:
|
7 |
+
desc: null
|
8 |
+
value: LOCAL_STATE_DICT
|
9 |
+
fsdp_activation_checkpointing:
|
10 |
+
desc: null
|
11 |
+
value: true
|
12 |
+
fsdp_cpu_offload:
|
13 |
+
desc: null
|
14 |
+
value: false
|
15 |
+
low_cpu_fsdp:
|
16 |
+
desc: null
|
17 |
+
value: false
|
18 |
+
no_meta_device:
|
19 |
+
desc: null
|
20 |
+
value: false
|
21 |
+
data_path:
|
22 |
+
desc: null
|
23 |
+
value: null
|
24 |
+
split:
|
25 |
+
desc: null
|
26 |
+
value: 969, 30, 1
|
27 |
+
train_data_path:
|
28 |
+
desc: null
|
29 |
+
value:
|
30 |
+
- '4013541'
|
31 |
+
- /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
|
32 |
+
valid_data_path:
|
33 |
+
desc: null
|
34 |
+
value:
|
35 |
+
- '4013541'
|
36 |
+
- /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
|
37 |
+
test_data_path:
|
38 |
+
desc: null
|
39 |
+
value:
|
40 |
+
- '4013541'
|
41 |
+
- /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
|
42 |
+
data_cache_path:
|
43 |
+
desc: null
|
44 |
+
value: null
|
45 |
+
vocab_size:
|
46 |
+
desc: null
|
47 |
+
value: null
|
48 |
+
vocab_file:
|
49 |
+
desc: null
|
50 |
+
value: null
|
51 |
+
merge_file:
|
52 |
+
desc: null
|
53 |
+
value: null
|
54 |
+
seq_length:
|
55 |
+
desc: null
|
56 |
+
value: 512
|
57 |
+
num_workers:
|
58 |
+
desc: null
|
59 |
+
value: 2
|
60 |
+
tokenizer_type:
|
61 |
+
desc: null
|
62 |
+
value: Llama2Tokenizer
|
63 |
+
tokenizer_model:
|
64 |
+
desc: null
|
65 |
+
value: /share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model
|
66 |
+
reset_position_ids:
|
67 |
+
desc: null
|
68 |
+
value: false
|
69 |
+
reset_attention_mask:
|
70 |
+
desc: null
|
71 |
+
value: false
|
72 |
+
eod_mask_loss:
|
73 |
+
desc: null
|
74 |
+
value: false
|
75 |
+
retro_return_doc_ids:
|
76 |
+
desc: null
|
77 |
+
value: false
|
78 |
+
short_seq_prob:
|
79 |
+
desc: null
|
80 |
+
value: 0.1
|
81 |
+
vocab_extra_ids:
|
82 |
+
desc: null
|
83 |
+
value: 0
|
84 |
+
seed:
|
85 |
+
desc: null
|
86 |
+
value: 1234
|
87 |
+
use_mpi:
|
88 |
+
desc: null
|
89 |
+
value: false
|
90 |
+
wandb_entity:
|
91 |
+
desc: null
|
92 |
+
value: iwakawa-koichi-q5-tohoku-nlp6723
|
93 |
+
wandb_name:
|
94 |
+
desc: null
|
95 |
+
value: tiny-llama_train_2024-08-04-14:39:57
|
96 |
+
wandb_project:
|
97 |
+
desc: null
|
98 |
+
value: llm_tutorial
|
99 |
+
quantization:
|
100 |
+
desc: null
|
101 |
+
value: false
|
102 |
+
use_freeze_layers:
|
103 |
+
desc: null
|
104 |
+
value: false
|
105 |
+
freeze_layers:
|
106 |
+
desc: null
|
107 |
+
value: null
|
108 |
+
bf16:
|
109 |
+
desc: null
|
110 |
+
value: true
|
111 |
+
fp16:
|
112 |
+
desc: null
|
113 |
+
value: false
|
114 |
+
mixed_precision:
|
115 |
+
desc: null
|
116 |
+
value: true
|
117 |
+
param_dtype:
|
118 |
+
desc: null
|
119 |
+
value: null
|
120 |
+
load:
|
121 |
+
desc: null
|
122 |
+
value: /work/llm_recipes/models/tiny-llama
|
123 |
+
save:
|
124 |
+
desc: null
|
125 |
+
value: /work/llm_recipes/models/tiny-llama
|
126 |
+
base_model:
|
127 |
+
desc: null
|
128 |
+
value: /share/pretrained_lm/meta-llama/TinyLlama_v1.1
|
129 |
+
use_better_transformer:
|
130 |
+
desc: null
|
131 |
+
value: false
|
132 |
+
grad_clip_norm:
|
133 |
+
desc: null
|
134 |
+
value: 1.0
|
135 |
+
eval_interval:
|
136 |
+
desc: null
|
137 |
+
value: 200
|
138 |
+
save_interval:
|
139 |
+
desc: null
|
140 |
+
value: 200
|
141 |
+
eval_iters:
|
142 |
+
desc: null
|
143 |
+
value: 10
|
144 |
+
optimizer:
|
145 |
+
desc: null
|
146 |
+
value: adam
|
147 |
+
lr:
|
148 |
+
desc: null
|
149 |
+
value: 2.0e-05
|
150 |
+
lr_decay_style:
|
151 |
+
desc: null
|
152 |
+
value: cosine
|
153 |
+
lr_decay_iters:
|
154 |
+
desc: null
|
155 |
+
value: 2000
|
156 |
+
lr_warmup_iters:
|
157 |
+
desc: null
|
158 |
+
value: 500
|
159 |
+
min_lr:
|
160 |
+
desc: null
|
161 |
+
value: 1.0e-06
|
162 |
+
train_iters:
|
163 |
+
desc: null
|
164 |
+
value: 2000
|
165 |
+
train_samples:
|
166 |
+
desc: null
|
167 |
+
value: null
|
168 |
+
global_batch_size:
|
169 |
+
desc: null
|
170 |
+
value: 320
|
171 |
+
micro_batch_size:
|
172 |
+
desc: null
|
173 |
+
value: 8
|
174 |
+
make_vocab_size_divisible_by:
|
175 |
+
desc: null
|
176 |
+
value: 128
|
177 |
+
sliding_window_size:
|
178 |
+
desc: null
|
179 |
+
value: 4096
|
180 |
+
skip_batch:
|
181 |
+
desc: null
|
182 |
+
value: null
|
183 |
+
no_save_optimizer_state:
|
184 |
+
desc: null
|
185 |
+
value: false
|
186 |
+
continual_pretraining:
|
187 |
+
desc: null
|
188 |
+
value: false
|
189 |
+
instruction_tuning:
|
190 |
+
desc: null
|
191 |
+
value: false
|
192 |
+
direct_preference_optimization:
|
193 |
+
desc: null
|
194 |
+
value: false
|
195 |
+
attention_dropout:
|
196 |
+
desc: null
|
197 |
+
value: 0.1
|
198 |
+
hidden_dropout:
|
199 |
+
desc: null
|
200 |
+
value: 0.1
|
201 |
+
weight_decay:
|
202 |
+
desc: null
|
203 |
+
value: 0.1
|
204 |
+
adam_beta1:
|
205 |
+
desc: null
|
206 |
+
value: 0.9
|
207 |
+
adam_beta2:
|
208 |
+
desc: null
|
209 |
+
value: 0.95
|
210 |
+
adam_eps:
|
211 |
+
desc: null
|
212 |
+
value: 1.0e-06
|
213 |
+
hf_transformer_model_dir:
|
214 |
+
desc: null
|
215 |
+
value: null
|
216 |
+
instruction_train_data_path:
|
217 |
+
desc: null
|
218 |
+
value: null
|
219 |
+
instruction_valid_data_path:
|
220 |
+
desc: null
|
221 |
+
value: null
|
222 |
+
epoch:
|
223 |
+
desc: null
|
224 |
+
value: null
|
225 |
+
instruction_dataset_size:
|
226 |
+
desc: null
|
227 |
+
value: null
|
228 |
+
save_sampler_state:
|
229 |
+
desc: null
|
230 |
+
value: false
|
231 |
+
label_smoothing:
|
232 |
+
desc: null
|
233 |
+
value: 0.0
|
234 |
+
save_n_checkpoints:
|
235 |
+
desc: null
|
236 |
+
value: 10
|
237 |
+
hf_repo_id:
|
238 |
+
desc: null
|
239 |
+
value: koichi12/tiny-llama
|
240 |
+
create_public_hf_repo:
|
241 |
+
desc: null
|
242 |
+
value: false
|
243 |
+
upload_all_checkpoints_to_hf:
|
244 |
+
desc: null
|
245 |
+
value: false
|
246 |
+
hf_upload_retry_limit:
|
247 |
+
desc: null
|
248 |
+
value: 2
|
249 |
+
exit_duration_in_mins:
|
250 |
+
desc: null
|
251 |
+
value: null
|
252 |
+
source_key:
|
253 |
+
desc: null
|
254 |
+
value: null
|
255 |
+
target_key:
|
256 |
+
desc: null
|
257 |
+
value: null
|
258 |
+
attn_implementation:
|
259 |
+
desc: null
|
260 |
+
value: flash_attention_2
|
261 |
+
efficient_instruction_tuning:
|
262 |
+
desc: null
|
263 |
+
value: false
|
264 |
+
remove_padding_masking:
|
265 |
+
desc: null
|
266 |
+
value: false
|
267 |
+
save_start_iter:
|
268 |
+
desc: null
|
269 |
+
value: null
|
270 |
+
rank:
|
271 |
+
desc: null
|
272 |
+
value: 0
|
273 |
+
world_size:
|
274 |
+
desc: null
|
275 |
+
value: 1
|
276 |
+
padded_vocab_size:
|
277 |
+
desc: null
|
278 |
+
value: 32000
|
279 |
+
gradient_accumulation_steps:
|
280 |
+
desc: null
|
281 |
+
value: 40
|
282 |
+
_wandb:
|
283 |
+
desc: null
|
284 |
+
value:
|
285 |
+
python_version: 3.10.12
|
286 |
+
cli_version: 0.16.3
|
287 |
+
framework: huggingface
|
288 |
+
huggingface_version: 4.43.3
|
289 |
+
is_jupyter_run: false
|
290 |
+
is_kaggle_kernel: false
|
291 |
+
start_time: 1722750007.607754
|
292 |
+
t:
|
293 |
+
1:
|
294 |
+
- 1
|
295 |
+
- 11
|
296 |
+
- 49
|
297 |
+
- 55
|
298 |
+
- 71
|
299 |
+
2:
|
300 |
+
- 1
|
301 |
+
- 11
|
302 |
+
- 49
|
303 |
+
- 55
|
304 |
+
- 71
|
305 |
+
3:
|
306 |
+
- 13
|
307 |
+
- 16
|
308 |
+
- 23
|
309 |
+
4: 3.10.12
|
310 |
+
5: 0.16.3
|
311 |
+
6: 4.43.3
|
312 |
+
8:
|
313 |
+
- 5
|
314 |
+
13: linux-x86_64
|
315 |
+
activation_function:
|
316 |
+
desc: null
|
317 |
+
value: silu
|
318 |
+
hidden_size:
|
319 |
+
desc: null
|
320 |
+
value: 2048
|
321 |
+
model_type:
|
322 |
+
desc: null
|
323 |
+
value: llama
|
324 |
+
max_position_embeddings:
|
325 |
+
desc: null
|
326 |
+
value: 2048
|
327 |
+
num_attention_heads:
|
328 |
+
desc: null
|
329 |
+
value: 32
|
330 |
+
num_hidden_layers:
|
331 |
+
desc: null
|
332 |
+
value: 22
|
333 |
+
model_architecture:
|
334 |
+
desc: null
|
335 |
+
value: LlamaForCausalLM
|
wandb/run-20240804_144007-dds6qqbt/files/output.log
ADDED
@@ -0,0 +1,135 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Created Hugging Face repository with ID koichi12/tiny-llama.
|
2 |
+
Clearing GPU cache for all ranks
|
3 |
+
--> Running with torch torch_distributed debug set to detail
|
4 |
+
File not found: /work/llm_recipes/models/tiny-llama/latest_iteration.txt
|
5 |
+
Unable to read latest iteration from /work/llm_recipes/models/tiny-llama/latest_iteration.txt
|
6 |
+
File not found: /work/llm_recipes/models/tiny-llama/latest_iteration.txt
|
7 |
+
Unable to read latest iteration from /work/llm_recipes/models/tiny-llama/latest_iteration.txt
|
8 |
+
File not found: /work/llm_recipes/models/tiny-llama/latest_iteration.txt
|
9 |
+
Unable to read latest iteration from /work/llm_recipes/models/tiny-llama/latest_iteration.txt
|
10 |
+
No checkpoint found in /work/llm_recipes/models/tiny-llama, skipping model loading
|
11 |
+
--> Model /share/pretrained_lm/meta-llama/TinyLlama_v1.1
|
12 |
+
--> /share/pretrained_lm/meta-llama/TinyLlama_v1.1 has 1100.048384 Million params
|
13 |
+
You are attempting to use Flash Attention 2.0 without specifying a torch dtype. This might lead to unexpected behaviour
|
14 |
+
You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
|
15 |
+
Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in LlamaForCausalLM is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="flash_attention_2", torch_dtype=torch.float16)`
|
16 |
+
Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in LlamaModel is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="flash_attention_2", torch_dtype=torch.float16)`
|
17 |
+
/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_init_utils.py:441: UserWarning: FSDP is switching to use `NO_SHARD` instead of ShardingStrategy.FULL_SHARD since the world size is 1.
|
18 |
+
warnings.warn(
|
19 |
+
BFloat16 enabled for mixed precision - using bfSixteen policy
|
20 |
+
--> applying fsdp activation checkpointing...
|
21 |
+
> datasets target sizes (minimum size):
|
22 |
+
train: 640000
|
23 |
+
validation: 35200
|
24 |
+
test: 3200
|
25 |
+
> building train, validation, and test datasets for GPT ...
|
26 |
+
> finished creating GPT datasets ...
|
27 |
+
File not found: /work/llm_recipes/models/tiny-llama/latest_iteration.txt
|
28 |
+
Unable to read latest iteration from /work/llm_recipes/models/tiny-llama/latest_iteration.txt
|
29 |
+
No checkpoint found in /work/llm_recipes/models/tiny-llama, skipping optimizer loading
|
30 |
+
File not found: /work/llm_recipes/models/tiny-llama/latest_iteration.txt
|
31 |
+
Unable to read latest iteration from /work/llm_recipes/models/tiny-llama/latest_iteration.txt
|
32 |
+
model info: FullyShardedDataParallel(
|
33 |
+
(_fsdp_wrapped_module): LlamaForCausalLM(
|
34 |
+
(model): LlamaModel(
|
35 |
+
(embed_tokens): Embedding(32000, 2048)
|
36 |
+
(layers): ModuleList(
|
37 |
+
(0-21): 22 x FullyShardedDataParallel(
|
38 |
+
(_fsdp_wrapped_module): CheckpointWrapper(
|
39 |
+
(_checkpoint_wrapped_module): LlamaDecoderLayer(
|
40 |
+
(self_attn): LlamaFlashAttention2(
|
41 |
+
(q_proj): Linear(in_features=2048, out_features=2048, bias=False)
|
42 |
+
(k_proj): Linear(in_features=2048, out_features=256, bias=False)
|
43 |
+
(v_proj): Linear(in_features=2048, out_features=256, bias=False)
|
44 |
+
(o_proj): Linear(in_features=2048, out_features=2048, bias=False)
|
45 |
+
(rotary_emb): LlamaRotaryEmbedding()
|
46 |
+
)
|
47 |
+
(mlp): LlamaMLP(
|
48 |
+
(gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
|
49 |
+
(up_proj): Linear(in_features=2048, out_features=5632, bias=False)
|
50 |
+
(down_proj): Linear(in_features=5632, out_features=2048, bias=False)
|
51 |
+
(act_fn): SiLU()
|
52 |
+
)
|
53 |
+
(input_layernorm): LlamaRMSNorm()
|
54 |
+
(post_attention_layernorm): LlamaRMSNorm()
|
55 |
+
)
|
56 |
+
)
|
57 |
+
)
|
58 |
+
)
|
59 |
+
(norm): LlamaRMSNorm()
|
60 |
+
(rotary_emb): LlamaRotaryEmbedding()
|
61 |
+
)
|
62 |
+
(lm_head): Linear(in_features=2048, out_features=32000, bias=False)
|
63 |
+
)
|
64 |
+
)
|
65 |
+
model config: LlamaConfig {
|
66 |
+
"_name_or_path": "/share/pretrained_lm/meta-llama/TinyLlama_v1.1",
|
67 |
+
"architectures": [
|
68 |
+
"LlamaForCausalLM"
|
69 |
+
],
|
70 |
+
"attention_bias": false,
|
71 |
+
"attention_dropout": 0.0,
|
72 |
+
"bos_token_id": 1,
|
73 |
+
"eos_token_id": 2,
|
74 |
+
"hidden_act": "silu",
|
75 |
+
"hidden_size": 2048,
|
76 |
+
"initializer_range": 0.02,
|
77 |
+
"intermediate_size": 5632,
|
78 |
+
"label_smoothing": 0.0,
|
79 |
+
"max_position_embeddings": 2048,
|
80 |
+
"mlp_bias": false,
|
81 |
+
"model_type": "llama",
|
82 |
+
"num_attention_heads": 32,
|
83 |
+
"num_hidden_layers": 22,
|
84 |
+
"num_key_value_heads": 4,
|
85 |
+
"pretraining_tp": 1,
|
86 |
+
"rms_norm_eps": 1e-05,
|
87 |
+
"rope_scaling": null,
|
88 |
+
"rope_theta": 10000.0,
|
89 |
+
"tie_word_embeddings": false,
|
90 |
+
"torch_dtype": "float32",
|
91 |
+
"transformers_version": "4.43.3",
|
92 |
+
"use_cache": false,
|
93 |
+
"vocab_size": 32000
|
94 |
+
}
|
95 |
+
Let split = None
|
96 |
+
Building a BlendedDataset for a single MegatronDataset
|
97 |
+
Unable to save the indexes because path_to_cache is None
|
98 |
+
Building a BlendedDataset for a single MegatronDataset
|
99 |
+
Unable to save the indexes because path_to_cache is None
|
100 |
+
Building a BlendedDataset for a single MegatronDataset
|
101 |
+
Unable to save the indexes because path_to_cache is None
|
102 |
+
Traceback (most recent call last):
|
103 |
+
File "/project/examples/finetuning.py", line 13, in <module>
|
104 |
+
main()
|
105 |
+
File "/project/src/llama_recipes/finetuning.py", line 281, in main
|
106 |
+
train(
|
107 |
+
File "/project/src/llama_recipes/utils/train_utils.py", line 104, in train
|
108 |
+
batch = next(train_dataloader)
|
109 |
+
File "/project/src/llama_recipes/utils/train_utils.py", line 24, in cyclic_iter
|
110 |
+
for x in iter:
|
111 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 631, in __next__
|
112 |
+
data = self._next_data()
|
113 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1346, in _next_data
|
114 |
+
return self._process_data(data)
|
115 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1372, in _process_data
|
116 |
+
data.reraise()
|
117 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/_utils.py", line 705, in reraise
|
118 |
+
raise exception
|
119 |
+
RuntimeError: Caught RuntimeError in DataLoader worker process 0.
|
120 |
+
Original Traceback (most recent call last):
|
121 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/worker.py", line 308, in _worker_loop
|
122 |
+
data = fetcher.fetch(index)
|
123 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/fetch.py", line 54, in fetch
|
124 |
+
return self.collate_fn(data)
|
125 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py", line 277, in default_collate
|
126 |
+
return collate(batch, collate_fn_map=default_collate_fn_map)
|
127 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py", line 129, in collate
|
128 |
+
return elem_type({key: collate([d[key] for d in batch], collate_fn_map=collate_fn_map) for key in elem})
|
129 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py", line 129, in <dictcomp>
|
130 |
+
return elem_type({key: collate([d[key] for d in batch], collate_fn_map=collate_fn_map) for key in elem})
|
131 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py", line 121, in collate
|
132 |
+
return collate_fn_map[elem_type](batch, collate_fn_map=collate_fn_map)
|
133 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py", line 174, in collate_tensor_fn
|
134 |
+
return torch.stack(batch, 0, out=out)
|
135 |
+
RuntimeError: stack expects each tensor to be equal size, but got [513] at entry 0 and [543] at entry 1
|
wandb/run-20240804_144007-dds6qqbt/files/requirements.txt
ADDED
@@ -0,0 +1,271 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
absl-py==2.1.0
|
2 |
+
accelerate==0.33.0
|
3 |
+
aiohttp==3.9.1
|
4 |
+
aiosignal==1.3.1
|
5 |
+
annotated-types==0.6.0
|
6 |
+
apex==0.1
|
7 |
+
appdirs==1.4.4
|
8 |
+
argon2-cffi-bindings==21.2.0
|
9 |
+
argon2-cffi==23.1.0
|
10 |
+
asttokens==2.4.1
|
11 |
+
astunparse==1.6.3
|
12 |
+
async-timeout==4.0.3
|
13 |
+
attrs==23.2.0
|
14 |
+
audioread==3.0.1
|
15 |
+
beautifulsoup4==4.12.3
|
16 |
+
bleach==6.1.0
|
17 |
+
blis==0.7.11
|
18 |
+
cachetools==5.3.2
|
19 |
+
catalogue==2.0.10
|
20 |
+
certifi==2024.2.2
|
21 |
+
cffi==1.16.0
|
22 |
+
charset-normalizer==3.3.2
|
23 |
+
click==8.1.7
|
24 |
+
cloudpathlib==0.16.0
|
25 |
+
cloudpickle==3.0.0
|
26 |
+
cmake==3.28.1
|
27 |
+
colorama==0.4.6
|
28 |
+
comm==0.2.1
|
29 |
+
confection==0.1.4
|
30 |
+
contourpy==1.2.0
|
31 |
+
cubinlinker==0.3.0+2.g405ac64
|
32 |
+
cuda-python==12.3.0rc4+9.gdb8c48a.dirty
|
33 |
+
cudf==23.12.0
|
34 |
+
cugraph-dgl==23.12.0
|
35 |
+
cugraph-service-client==23.12.0
|
36 |
+
cugraph-service-server==23.12.0
|
37 |
+
cugraph==23.12.0
|
38 |
+
cuml==23.12.0
|
39 |
+
cupy-cuda12x==12.3.0
|
40 |
+
cycler==0.12.1
|
41 |
+
cymem==2.0.8
|
42 |
+
cython==3.0.8
|
43 |
+
dask-cuda==23.12.0
|
44 |
+
dask-cudf==23.12.0
|
45 |
+
dask==2023.11.0
|
46 |
+
debugpy==1.8.1
|
47 |
+
decorator==5.1.1
|
48 |
+
defusedxml==0.7.1
|
49 |
+
distributed==2023.11.0
|
50 |
+
dm-tree==0.1.8
|
51 |
+
docker-pycreds==0.4.0
|
52 |
+
einops==0.7.0
|
53 |
+
exceptiongroup==1.2.0
|
54 |
+
execnet==2.0.2
|
55 |
+
executing==2.0.1
|
56 |
+
expecttest==0.1.3
|
57 |
+
fastjsonschema==2.19.1
|
58 |
+
fastrlock==0.8.2
|
59 |
+
filelock==3.13.1
|
60 |
+
flash-attn==2.4.2
|
61 |
+
fonttools==4.48.1
|
62 |
+
frozenlist==1.4.1
|
63 |
+
fsspec==2023.12.2
|
64 |
+
gast==0.5.4
|
65 |
+
gitdb==4.0.11
|
66 |
+
gitpython==3.1.43
|
67 |
+
google-auth-oauthlib==0.4.6
|
68 |
+
google-auth==2.27.0
|
69 |
+
graphsurgeon==0.4.6
|
70 |
+
grpcio==1.60.1
|
71 |
+
huggingface-hub==0.24.5
|
72 |
+
hypothesis==5.35.1
|
73 |
+
idna==3.6
|
74 |
+
importlib-metadata==7.0.1
|
75 |
+
iniconfig==2.0.0
|
76 |
+
intel-openmp==2021.4.0
|
77 |
+
ipadic==1.0.0
|
78 |
+
ipykernel==6.29.2
|
79 |
+
ipython-genutils==0.2.0
|
80 |
+
ipython==8.21.0
|
81 |
+
jedi==0.19.1
|
82 |
+
jinja2==3.1.3
|
83 |
+
joblib==1.3.2
|
84 |
+
json5==0.9.14
|
85 |
+
jsonnet==0.19.1
|
86 |
+
jsonschema-specifications==2023.12.1
|
87 |
+
jsonschema==4.21.1
|
88 |
+
jupyter-client==8.6.0
|
89 |
+
jupyter-core==5.7.1
|
90 |
+
jupyter-tensorboard==0.2.0
|
91 |
+
jupyterlab-pygments==0.3.0
|
92 |
+
jupyterlab-server==1.2.0
|
93 |
+
jupyterlab==2.3.2
|
94 |
+
jupytext==1.16.1
|
95 |
+
kiwisolver==1.4.5
|
96 |
+
langcodes==3.3.0
|
97 |
+
lazy-loader==0.3
|
98 |
+
librosa==0.10.1
|
99 |
+
llvmlite==0.40.1
|
100 |
+
locket==1.0.0
|
101 |
+
logzero==1.7.0
|
102 |
+
lxml==5.2.2
|
103 |
+
markdown-it-py==3.0.0
|
104 |
+
markdown==3.5.2
|
105 |
+
markupsafe==2.1.4
|
106 |
+
matplotlib-inline==0.1.6
|
107 |
+
matplotlib==3.8.2
|
108 |
+
mdit-py-plugins==0.4.0
|
109 |
+
mdurl==0.1.2
|
110 |
+
mecab-python3==1.0.6
|
111 |
+
mistune==3.0.2
|
112 |
+
mkl-devel==2021.1.1
|
113 |
+
mkl-include==2021.1.1
|
114 |
+
mkl==2021.1.1
|
115 |
+
mock==5.1.0
|
116 |
+
more-itertools==9.1.0
|
117 |
+
mpmath==1.3.0
|
118 |
+
msgpack==1.0.7
|
119 |
+
multidict==6.0.4
|
120 |
+
murmurhash==1.0.10
|
121 |
+
nbclient==0.9.0
|
122 |
+
nbconvert==7.16.0
|
123 |
+
nbformat==5.9.2
|
124 |
+
nest-asyncio==1.6.0
|
125 |
+
networkx==2.6.3
|
126 |
+
ninja==1.11.1.1
|
127 |
+
nltk==3.8.1
|
128 |
+
notebook==6.4.10
|
129 |
+
numba==0.57.1+1.g1ff679645
|
130 |
+
numpy==1.24.4
|
131 |
+
nvfuser==0.1.4a0+d0bb811
|
132 |
+
nvidia-dali-cuda120==1.34.0
|
133 |
+
nvidia-pyindex==1.0.9
|
134 |
+
nvtx==0.2.5
|
135 |
+
oauthlib==3.2.2
|
136 |
+
onnx==1.15.0rc2
|
137 |
+
opencv==4.7.0
|
138 |
+
optree==0.10.0
|
139 |
+
packaging==23.2
|
140 |
+
pandas==1.5.3
|
141 |
+
pandocfilters==1.5.1
|
142 |
+
parso==0.8.3
|
143 |
+
partd==1.4.1
|
144 |
+
peft==0.11.1
|
145 |
+
pexpect==4.9.0
|
146 |
+
pillow==10.2.0
|
147 |
+
pip==24.0
|
148 |
+
platformdirs==4.2.0
|
149 |
+
pluggy==1.4.0
|
150 |
+
ply==3.11
|
151 |
+
polygraphy==0.49.4
|
152 |
+
pooch==1.8.0
|
153 |
+
portalocker==2.10.1
|
154 |
+
preshed==3.0.9
|
155 |
+
prettytable==3.9.0
|
156 |
+
prometheus-client==0.19.0
|
157 |
+
prompt-toolkit==3.0.43
|
158 |
+
protobuf==4.24.4
|
159 |
+
psutil==5.9.4
|
160 |
+
ptxcompiler==0.8.1+2.g0d406d6
|
161 |
+
ptyprocess==0.7.0
|
162 |
+
pure-eval==0.2.2
|
163 |
+
pyarrow==14.0.1.dev0+gba5374836.d20240125
|
164 |
+
pyasn1-modules==0.3.0
|
165 |
+
pyasn1==0.5.1
|
166 |
+
pybind11-global==2.11.1
|
167 |
+
pybind11==2.11.1
|
168 |
+
pycocotools==2.0+nv0.8.0
|
169 |
+
pycparser==2.21
|
170 |
+
pydantic-core==2.16.2
|
171 |
+
pydantic==2.6.1
|
172 |
+
pygments==2.17.2
|
173 |
+
pylibcugraph==23.12.0
|
174 |
+
pylibcugraphops==23.12.0
|
175 |
+
pylibraft==23.12.0
|
176 |
+
pynvml==11.4.1
|
177 |
+
pyparsing==3.1.1
|
178 |
+
pytest-flakefinder==1.1.0
|
179 |
+
pytest-rerunfailures==13.0
|
180 |
+
pytest-shard==0.1.2
|
181 |
+
pytest-xdist==3.5.0
|
182 |
+
pytest==8.0.0
|
183 |
+
python-dateutil==2.8.2
|
184 |
+
python-dotenv==1.0.0
|
185 |
+
python-hostlist==1.23.0
|
186 |
+
pytorch-quantization==2.1.2
|
187 |
+
pytz==2023.3.post1
|
188 |
+
pyyaml==6.0.1
|
189 |
+
pyzmq==25.1.2
|
190 |
+
raft-dask==23.12.0
|
191 |
+
rapids-dask-dependency==23.12.1
|
192 |
+
referencing==0.33.0
|
193 |
+
regex==2023.12.25
|
194 |
+
requests-oauthlib==1.3.1
|
195 |
+
requests==2.31.0
|
196 |
+
rich==13.7.0
|
197 |
+
rmm==23.12.0
|
198 |
+
rpds-py==0.17.1
|
199 |
+
rsa==4.9
|
200 |
+
sacrebleu==2.4.0
|
201 |
+
safetensors==0.4.3
|
202 |
+
scikit-learn==1.2.0
|
203 |
+
scipy==1.12.0
|
204 |
+
send2trash==1.8.2
|
205 |
+
sentencepiece==0.1.99
|
206 |
+
sentry-sdk==2.12.0
|
207 |
+
setproctitle==1.3.3
|
208 |
+
setuptools==68.2.2
|
209 |
+
six==1.16.0
|
210 |
+
smart-open==6.4.0
|
211 |
+
smmap==5.0.1
|
212 |
+
sortedcontainers==2.4.0
|
213 |
+
soundfile==0.12.1
|
214 |
+
soupsieve==2.5
|
215 |
+
soxr==0.3.7
|
216 |
+
spacy-legacy==3.0.12
|
217 |
+
spacy-loggers==1.0.5
|
218 |
+
spacy==3.7.2
|
219 |
+
sphinx-glpi-theme==0.6
|
220 |
+
srsly==2.4.8
|
221 |
+
stack-data==0.6.3
|
222 |
+
sympy==1.12
|
223 |
+
tabulate==0.9.0
|
224 |
+
tbb==2021.11.0
|
225 |
+
tblib==3.0.0
|
226 |
+
tensorboard-data-server==0.6.1
|
227 |
+
tensorboard-plugin-wit==1.8.1
|
228 |
+
tensorboard==2.9.0
|
229 |
+
tensorrt==8.6.3
|
230 |
+
terminado==0.18.0
|
231 |
+
termplotlib==0.3.9
|
232 |
+
thinc==8.2.3
|
233 |
+
threadpoolctl==3.2.0
|
234 |
+
thriftpy2==0.4.17
|
235 |
+
tinycss2==1.2.1
|
236 |
+
tokenizers==0.19.1
|
237 |
+
toml==0.10.2
|
238 |
+
tomli==2.0.1
|
239 |
+
toolz==0.12.1
|
240 |
+
torch-tensorrt==2.3.0a0
|
241 |
+
torch==2.3.0a0+ebedce2
|
242 |
+
torchdata==0.7.1a0
|
243 |
+
torchtext==0.17.0a0
|
244 |
+
torchvision==0.18.0a0
|
245 |
+
tornado==6.4
|
246 |
+
tqdm==4.66.1
|
247 |
+
traitlets==5.9.0
|
248 |
+
transformer-engine==1.3.0+5b90b7f
|
249 |
+
transformers==4.43.3
|
250 |
+
treelite-runtime==3.9.1
|
251 |
+
treelite==3.9.1
|
252 |
+
triton==2.2.0+e28a256
|
253 |
+
typer==0.9.0
|
254 |
+
types-dataclasses==0.6.6
|
255 |
+
typing-extensions==4.9.0
|
256 |
+
ucx-py==0.35.0
|
257 |
+
uff==0.6.9
|
258 |
+
ujson==5.8.0
|
259 |
+
urllib3==1.26.18
|
260 |
+
wandb==0.16.3
|
261 |
+
wasabi==1.1.2
|
262 |
+
wcwidth==0.2.13
|
263 |
+
weasel==0.3.4
|
264 |
+
webencodings==0.5.1
|
265 |
+
werkzeug==3.0.1
|
266 |
+
wheel==0.42.0
|
267 |
+
xdoctest==1.0.2
|
268 |
+
xgboost==1.7.6
|
269 |
+
yarl==1.9.4
|
270 |
+
zict==3.0.0
|
271 |
+
zipp==3.17.0
|
wandb/run-20240804_144007-dds6qqbt/files/wandb-metadata.json
ADDED
@@ -0,0 +1,215 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
|
3 |
+
"python": "3.10.12",
|
4 |
+
"heartbeatAt": "2024-08-04T05:40:08.224323",
|
5 |
+
"startedAt": "2024-08-04T05:40:07.595226",
|
6 |
+
"docker": null,
|
7 |
+
"cuda": null,
|
8 |
+
"args": [
|
9 |
+
"--seq-length",
|
10 |
+
"512",
|
11 |
+
"--sliding-window-size",
|
12 |
+
"4096",
|
13 |
+
"--micro-batch-size",
|
14 |
+
"8",
|
15 |
+
"--global-batch-size",
|
16 |
+
"320",
|
17 |
+
"--train-iters",
|
18 |
+
"2000",
|
19 |
+
"--tokenizer-type",
|
20 |
+
"Llama2Tokenizer",
|
21 |
+
"--tokenizer-model",
|
22 |
+
"/share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model",
|
23 |
+
"--train-data-path",
|
24 |
+
"4013541",
|
25 |
+
"/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
|
26 |
+
"--valid-data-path",
|
27 |
+
"4013541",
|
28 |
+
"/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
|
29 |
+
"--test-data-path",
|
30 |
+
"4013541",
|
31 |
+
"/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
|
32 |
+
"--lr",
|
33 |
+
"2e-5",
|
34 |
+
"--min-lr",
|
35 |
+
"1e-6",
|
36 |
+
"--lr-decay-style",
|
37 |
+
"cosine",
|
38 |
+
"--lr-warmup-iters",
|
39 |
+
"500",
|
40 |
+
"--lr-decay-iters",
|
41 |
+
"2000",
|
42 |
+
"--weight-decay",
|
43 |
+
"0.1",
|
44 |
+
"--grad-clip-norm",
|
45 |
+
"1.0",
|
46 |
+
"--optimizer",
|
47 |
+
"adam",
|
48 |
+
"--adam-beta1",
|
49 |
+
"0.9",
|
50 |
+
"--adam-beta2",
|
51 |
+
"0.95",
|
52 |
+
"--adam-eps",
|
53 |
+
"1e-6",
|
54 |
+
"--save-interval",
|
55 |
+
"200",
|
56 |
+
"--eval-interval",
|
57 |
+
"200",
|
58 |
+
"--eval-iters",
|
59 |
+
"10",
|
60 |
+
"--bf16",
|
61 |
+
"--mixed-precision",
|
62 |
+
"--base-model",
|
63 |
+
"/share/pretrained_lm/meta-llama/TinyLlama_v1.1",
|
64 |
+
"--save",
|
65 |
+
"/work/llm_recipes/models/tiny-llama",
|
66 |
+
"--load",
|
67 |
+
"/work/llm_recipes/models/tiny-llama",
|
68 |
+
"--fsdp-activation-checkpointing",
|
69 |
+
"--sharding-strategy",
|
70 |
+
"FULL_SHARD",
|
71 |
+
"--checkpoint-type",
|
72 |
+
"LOCAL_STATE_DICT",
|
73 |
+
"--save-n-checkpoints",
|
74 |
+
"10",
|
75 |
+
"--hf-upload-retry-limit",
|
76 |
+
"2",
|
77 |
+
"--hf-repo-id",
|
78 |
+
"koichi12/tiny-llama",
|
79 |
+
"--wandb-entity",
|
80 |
+
"iwakawa-koichi-q5-tohoku-nlp6723",
|
81 |
+
"--wandb-project",
|
82 |
+
"llm_tutorial",
|
83 |
+
"--wandb-name",
|
84 |
+
"tiny-llama_train_2024-08-04-14:39:57"
|
85 |
+
],
|
86 |
+
"state": "running",
|
87 |
+
"program": "/project/examples/finetuning.py",
|
88 |
+
"codePathLocal": "examples/finetuning.py",
|
89 |
+
"codePath": "examples/finetuning.py",
|
90 |
+
"git": {
|
91 |
+
"remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
|
92 |
+
"commit": "3be5353210a678dc7008f237fa16b99f2bdf36ea"
|
93 |
+
},
|
94 |
+
"email": null,
|
95 |
+
"root": "/project",
|
96 |
+
"host": "gpu-koiwa-00",
|
97 |
+
"username": "koiwa",
|
98 |
+
"executable": "/usr/bin/python",
|
99 |
+
"cpu_count": 18,
|
100 |
+
"cpu_count_logical": 18,
|
101 |
+
"cpu_freq": {
|
102 |
+
"current": 2400.0389999999993,
|
103 |
+
"min": 0.0,
|
104 |
+
"max": 0.0
|
105 |
+
},
|
106 |
+
"cpu_freq_per_core": [
|
107 |
+
{
|
108 |
+
"current": 2400.039,
|
109 |
+
"min": 0.0,
|
110 |
+
"max": 0.0
|
111 |
+
},
|
112 |
+
{
|
113 |
+
"current": 2400.039,
|
114 |
+
"min": 0.0,
|
115 |
+
"max": 0.0
|
116 |
+
},
|
117 |
+
{
|
118 |
+
"current": 2400.039,
|
119 |
+
"min": 0.0,
|
120 |
+
"max": 0.0
|
121 |
+
},
|
122 |
+
{
|
123 |
+
"current": 2400.039,
|
124 |
+
"min": 0.0,
|
125 |
+
"max": 0.0
|
126 |
+
},
|
127 |
+
{
|
128 |
+
"current": 2400.039,
|
129 |
+
"min": 0.0,
|
130 |
+
"max": 0.0
|
131 |
+
},
|
132 |
+
{
|
133 |
+
"current": 2400.039,
|
134 |
+
"min": 0.0,
|
135 |
+
"max": 0.0
|
136 |
+
},
|
137 |
+
{
|
138 |
+
"current": 2400.039,
|
139 |
+
"min": 0.0,
|
140 |
+
"max": 0.0
|
141 |
+
},
|
142 |
+
{
|
143 |
+
"current": 2400.039,
|
144 |
+
"min": 0.0,
|
145 |
+
"max": 0.0
|
146 |
+
},
|
147 |
+
{
|
148 |
+
"current": 2400.039,
|
149 |
+
"min": 0.0,
|
150 |
+
"max": 0.0
|
151 |
+
},
|
152 |
+
{
|
153 |
+
"current": 2400.039,
|
154 |
+
"min": 0.0,
|
155 |
+
"max": 0.0
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"current": 2400.039,
|
159 |
+
"min": 0.0,
|
160 |
+
"max": 0.0
|
161 |
+
},
|
162 |
+
{
|
163 |
+
"current": 2400.039,
|
164 |
+
"min": 0.0,
|
165 |
+
"max": 0.0
|
166 |
+
},
|
167 |
+
{
|
168 |
+
"current": 2400.039,
|
169 |
+
"min": 0.0,
|
170 |
+
"max": 0.0
|
171 |
+
},
|
172 |
+
{
|
173 |
+
"current": 2400.039,
|
174 |
+
"min": 0.0,
|
175 |
+
"max": 0.0
|
176 |
+
},
|
177 |
+
{
|
178 |
+
"current": 2400.039,
|
179 |
+
"min": 0.0,
|
180 |
+
"max": 0.0
|
181 |
+
},
|
182 |
+
{
|
183 |
+
"current": 2400.039,
|
184 |
+
"min": 0.0,
|
185 |
+
"max": 0.0
|
186 |
+
},
|
187 |
+
{
|
188 |
+
"current": 2400.039,
|
189 |
+
"min": 0.0,
|
190 |
+
"max": 0.0
|
191 |
+
},
|
192 |
+
{
|
193 |
+
"current": 2400.039,
|
194 |
+
"min": 0.0,
|
195 |
+
"max": 0.0
|
196 |
+
}
|
197 |
+
],
|
198 |
+
"disk": {
|
199 |
+
"/": {
|
200 |
+
"total": 0.0625,
|
201 |
+
"used": 1.1444091796875e-05
|
202 |
+
}
|
203 |
+
},
|
204 |
+
"gpu": "NVIDIA A100-SXM4-40GB",
|
205 |
+
"gpu_count": 1,
|
206 |
+
"gpu_devices": [
|
207 |
+
{
|
208 |
+
"name": "NVIDIA A100-SXM4-40GB",
|
209 |
+
"memory_total": 42949672960
|
210 |
+
}
|
211 |
+
],
|
212 |
+
"memory": {
|
213 |
+
"total": 56.48781967163086
|
214 |
+
}
|
215 |
+
}
|
wandb/run-20240804_144007-dds6qqbt/files/wandb-summary.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"_wandb": {"runtime": 2}}
|
wandb/run-20240804_144007-dds6qqbt/logs/debug-internal.log
ADDED
@@ -0,0 +1,186 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
2024-08-04 14:40:07,609 INFO StreamThr :11999 [internal.py:wandb_internal():86] W&B internal server running at pid: 11999, started at: 2024-08-04 14:40:07.608480
|
2 |
+
2024-08-04 14:40:07,610 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: status
|
3 |
+
2024-08-04 14:40:07,612 INFO WriterThread:11999 [datastore.py:open_for_write():87] open: /project/wandb/run-20240804_144007-dds6qqbt/run-dds6qqbt.wandb
|
4 |
+
2024-08-04 14:40:07,613 DEBUG SenderThread:11999 [sender.py:send():382] send: header
|
5 |
+
2024-08-04 14:40:07,627 DEBUG SenderThread:11999 [sender.py:send():382] send: run
|
6 |
+
2024-08-04 14:40:08,110 INFO SenderThread:11999 [dir_watcher.py:__init__():211] watching files in: /project/wandb/run-20240804_144007-dds6qqbt/files
|
7 |
+
2024-08-04 14:40:08,111 INFO SenderThread:11999 [sender.py:_start_run_threads():1136] run started: dds6qqbt with start time 1722750007.607754
|
8 |
+
2024-08-04 14:40:08,116 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: check_version
|
9 |
+
2024-08-04 14:40:08,116 DEBUG SenderThread:11999 [sender.py:send_request():409] send_request: check_version
|
10 |
+
2024-08-04 14:40:08,204 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: run_start
|
11 |
+
2024-08-04 14:40:08,210 DEBUG HandlerThread:11999 [system_info.py:__init__():27] System info init
|
12 |
+
2024-08-04 14:40:08,210 DEBUG HandlerThread:11999 [system_info.py:__init__():42] System info init done
|
13 |
+
2024-08-04 14:40:08,211 INFO HandlerThread:11999 [system_monitor.py:start():194] Starting system monitor
|
14 |
+
2024-08-04 14:40:08,211 INFO SystemMonitor:11999 [system_monitor.py:_start():158] Starting system asset monitoring threads
|
15 |
+
2024-08-04 14:40:08,211 INFO HandlerThread:11999 [system_monitor.py:probe():214] Collecting system info
|
16 |
+
2024-08-04 14:40:08,211 INFO SystemMonitor:11999 [interfaces.py:start():190] Started cpu monitoring
|
17 |
+
2024-08-04 14:40:08,212 INFO SystemMonitor:11999 [interfaces.py:start():190] Started disk monitoring
|
18 |
+
2024-08-04 14:40:08,213 INFO SystemMonitor:11999 [interfaces.py:start():190] Started gpu monitoring
|
19 |
+
2024-08-04 14:40:08,214 INFO SystemMonitor:11999 [interfaces.py:start():190] Started memory monitoring
|
20 |
+
2024-08-04 14:40:08,214 INFO SystemMonitor:11999 [interfaces.py:start():190] Started network monitoring
|
21 |
+
2024-08-04 14:40:08,224 DEBUG HandlerThread:11999 [system_info.py:probe():151] Probing system
|
22 |
+
2024-08-04 14:40:08,226 DEBUG HandlerThread:11999 [system_info.py:_probe_git():136] Probing git
|
23 |
+
2024-08-04 14:40:08,238 DEBUG HandlerThread:11999 [system_info.py:_probe_git():144] Probing git done
|
24 |
+
2024-08-04 14:40:08,238 DEBUG HandlerThread:11999 [system_info.py:probe():199] Probing system done
|
25 |
+
2024-08-04 14:40:08,238 DEBUG HandlerThread:11999 [system_monitor.py:probe():223] {'os': 'Linux-5.15.0-91-generic-x86_64-with-glibc2.35', 'python': '3.10.12', 'heartbeatAt': '2024-08-04T05:40:08.224323', 'startedAt': '2024-08-04T05:40:07.595226', 'docker': None, 'cuda': None, 'args': ('--seq-length', '512', '--sliding-window-size', '4096', '--micro-batch-size', '8', '--global-batch-size', '320', '--train-iters', '2000', '--tokenizer-type', 'Llama2Tokenizer', '--tokenizer-model', '/share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model', '--train-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--valid-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--test-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--lr', '2e-5', '--min-lr', '1e-6', '--lr-decay-style', 'cosine', '--lr-warmup-iters', '500', '--lr-decay-iters', '2000', '--weight-decay', '0.1', '--grad-clip-norm', '1.0', '--optimizer', 'adam', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-6', '--save-interval', '200', '--eval-interval', '200', '--eval-iters', '10', '--bf16', '--mixed-precision', '--base-model', '/share/pretrained_lm/meta-llama/TinyLlama_v1.1', '--save', '/work/llm_recipes/models/tiny-llama', '--load', '/work/llm_recipes/models/tiny-llama', '--fsdp-activation-checkpointing', '--sharding-strategy', 'FULL_SHARD', '--checkpoint-type', 'LOCAL_STATE_DICT', '--save-n-checkpoints', '10', '--hf-upload-retry-limit', '2', '--hf-repo-id', 'koichi12/tiny-llama', '--wandb-entity', 'iwakawa-koichi-q5-tohoku-nlp6723', '--wandb-project', 'llm_tutorial', '--wandb-name', 'tiny-llama_train_2024-08-04-14:39:57'), 'state': 'running', 'program': '/project/examples/finetuning.py', 'codePathLocal': 'examples/finetuning.py', 'codePath': 'examples/finetuning.py', 'git': {'remote': 'https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git', 'commit': '3be5353210a678dc7008f237fa16b99f2bdf36ea'}, 'email': None, 'root': '/project', 'host': 'gpu-koiwa-00', 'username': 'koiwa', 'executable': '/usr/bin/python', 'cpu_count': 18, 'cpu_count_logical': 18, 'cpu_freq': {'current': 2400.0389999999993, 'min': 0.0, 'max': 0.0}, 'cpu_freq_per_core': [{'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}], 'disk': {'/': {'total': 0.0625, 'used': 1.1444091796875e-05}}, 'gpu': 'NVIDIA A100-SXM4-40GB', 'gpu_count': 1, 'gpu_devices': [{'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}], 'memory': {'total': 56.48781967163086}}
|
26 |
+
2024-08-04 14:40:08,238 INFO HandlerThread:11999 [system_monitor.py:probe():224] Finished collecting system info
|
27 |
+
2024-08-04 14:40:08,238 INFO HandlerThread:11999 [system_monitor.py:probe():227] Publishing system info
|
28 |
+
2024-08-04 14:40:08,239 INFO HandlerThread:11999 [system_monitor.py:probe():229] Finished publishing system info
|
29 |
+
2024-08-04 14:40:08,245 DEBUG SenderThread:11999 [sender.py:send():382] send: files
|
30 |
+
2024-08-04 14:40:08,246 INFO SenderThread:11999 [sender.py:_save_file():1403] saving file wandb-metadata.json with policy now
|
31 |
+
2024-08-04 14:40:08,255 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: python_packages
|
32 |
+
2024-08-04 14:40:08,255 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: stop_status
|
33 |
+
2024-08-04 14:40:08,255 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: internal_messages
|
34 |
+
2024-08-04 14:40:08,255 DEBUG SenderThread:11999 [sender.py:send_request():409] send_request: python_packages
|
35 |
+
2024-08-04 14:40:08,257 DEBUG SenderThread:11999 [sender.py:send_request():409] send_request: stop_status
|
36 |
+
2024-08-04 14:40:08,521 DEBUG SenderThread:11999 [sender.py:send():382] send: telemetry
|
37 |
+
2024-08-04 14:40:08,889 INFO wandb-upload_0:11999 [upload_job.py:push():131] Uploaded file /tmp/tmp5bbx13axwandb/8bl0rtdu-wandb-metadata.json
|
38 |
+
2024-08-04 14:40:09,112 INFO Thread-12 :11999 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_144007-dds6qqbt/files/requirements.txt
|
39 |
+
2024-08-04 14:40:09,113 INFO Thread-12 :11999 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_144007-dds6qqbt/files/output.log
|
40 |
+
2024-08-04 14:40:09,113 INFO Thread-12 :11999 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_144007-dds6qqbt/files/wandb-metadata.json
|
41 |
+
2024-08-04 14:40:10,756 DEBUG SenderThread:11999 [sender.py:send():382] send: config
|
42 |
+
2024-08-04 14:40:10,756 DEBUG SenderThread:11999 [sender.py:send():382] send: config
|
43 |
+
2024-08-04 14:40:10,842 DEBUG SenderThread:11999 [sender.py:send():382] send: exit
|
44 |
+
2024-08-04 14:40:10,842 INFO SenderThread:11999 [sender.py:send_exit():589] handling exit code: 1
|
45 |
+
2024-08-04 14:40:10,842 INFO SenderThread:11999 [sender.py:send_exit():591] handling runtime: 2
|
46 |
+
2024-08-04 14:40:10,843 INFO SenderThread:11999 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
47 |
+
2024-08-04 14:40:10,843 INFO SenderThread:11999 [sender.py:send_exit():597] send defer
|
48 |
+
2024-08-04 14:40:10,844 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: defer
|
49 |
+
2024-08-04 14:40:10,844 INFO HandlerThread:11999 [handler.py:handle_request_defer():172] handle defer: 0
|
50 |
+
2024-08-04 14:40:10,844 DEBUG SenderThread:11999 [sender.py:send_request():409] send_request: defer
|
51 |
+
2024-08-04 14:40:10,844 INFO SenderThread:11999 [sender.py:send_request_defer():613] handle sender defer: 0
|
52 |
+
2024-08-04 14:40:10,844 INFO SenderThread:11999 [sender.py:transition_state():617] send defer: 1
|
53 |
+
2024-08-04 14:40:10,844 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: defer
|
54 |
+
2024-08-04 14:40:10,844 INFO HandlerThread:11999 [handler.py:handle_request_defer():172] handle defer: 1
|
55 |
+
2024-08-04 14:40:10,844 DEBUG SenderThread:11999 [sender.py:send_request():409] send_request: defer
|
56 |
+
2024-08-04 14:40:10,844 INFO SenderThread:11999 [sender.py:send_request_defer():613] handle sender defer: 1
|
57 |
+
2024-08-04 14:40:10,844 INFO SenderThread:11999 [sender.py:transition_state():617] send defer: 2
|
58 |
+
2024-08-04 14:40:10,844 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: defer
|
59 |
+
2024-08-04 14:40:10,844 INFO HandlerThread:11999 [handler.py:handle_request_defer():172] handle defer: 2
|
60 |
+
2024-08-04 14:40:10,844 INFO HandlerThread:11999 [system_monitor.py:finish():203] Stopping system monitor
|
61 |
+
2024-08-04 14:40:10,845 DEBUG SystemMonitor:11999 [system_monitor.py:_start():172] Starting system metrics aggregation loop
|
62 |
+
2024-08-04 14:40:10,845 INFO HandlerThread:11999 [interfaces.py:finish():202] Joined cpu monitor
|
63 |
+
2024-08-04 14:40:10,845 DEBUG SystemMonitor:11999 [system_monitor.py:_start():179] Finished system metrics aggregation loop
|
64 |
+
2024-08-04 14:40:10,845 INFO HandlerThread:11999 [interfaces.py:finish():202] Joined disk monitor
|
65 |
+
2024-08-04 14:40:10,845 DEBUG SystemMonitor:11999 [system_monitor.py:_start():183] Publishing last batch of metrics
|
66 |
+
2024-08-04 14:40:10,878 INFO HandlerThread:11999 [interfaces.py:finish():202] Joined gpu monitor
|
67 |
+
2024-08-04 14:40:10,878 INFO HandlerThread:11999 [interfaces.py:finish():202] Joined memory monitor
|
68 |
+
2024-08-04 14:40:10,878 INFO HandlerThread:11999 [interfaces.py:finish():202] Joined network monitor
|
69 |
+
2024-08-04 14:40:10,878 DEBUG SenderThread:11999 [sender.py:send_request():409] send_request: defer
|
70 |
+
2024-08-04 14:40:10,878 INFO SenderThread:11999 [sender.py:send_request_defer():613] handle sender defer: 2
|
71 |
+
2024-08-04 14:40:10,878 INFO SenderThread:11999 [sender.py:transition_state():617] send defer: 3
|
72 |
+
2024-08-04 14:40:10,879 DEBUG SenderThread:11999 [sender.py:send():382] send: stats
|
73 |
+
2024-08-04 14:40:10,879 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: defer
|
74 |
+
2024-08-04 14:40:10,879 INFO HandlerThread:11999 [handler.py:handle_request_defer():172] handle defer: 3
|
75 |
+
2024-08-04 14:40:10,879 DEBUG SenderThread:11999 [sender.py:send_request():409] send_request: defer
|
76 |
+
2024-08-04 14:40:10,879 INFO SenderThread:11999 [sender.py:send_request_defer():613] handle sender defer: 3
|
77 |
+
2024-08-04 14:40:10,879 INFO SenderThread:11999 [sender.py:transition_state():617] send defer: 4
|
78 |
+
2024-08-04 14:40:10,879 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: defer
|
79 |
+
2024-08-04 14:40:10,879 INFO HandlerThread:11999 [handler.py:handle_request_defer():172] handle defer: 4
|
80 |
+
2024-08-04 14:40:10,879 DEBUG SenderThread:11999 [sender.py:send_request():409] send_request: defer
|
81 |
+
2024-08-04 14:40:10,879 INFO SenderThread:11999 [sender.py:send_request_defer():613] handle sender defer: 4
|
82 |
+
2024-08-04 14:40:10,879 INFO SenderThread:11999 [sender.py:transition_state():617] send defer: 5
|
83 |
+
2024-08-04 14:40:10,879 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: defer
|
84 |
+
2024-08-04 14:40:10,880 INFO HandlerThread:11999 [handler.py:handle_request_defer():172] handle defer: 5
|
85 |
+
2024-08-04 14:40:10,880 DEBUG SenderThread:11999 [sender.py:send():382] send: summary
|
86 |
+
2024-08-04 14:40:10,881 INFO SenderThread:11999 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
87 |
+
2024-08-04 14:40:10,881 DEBUG SenderThread:11999 [sender.py:send_request():409] send_request: defer
|
88 |
+
2024-08-04 14:40:10,881 INFO SenderThread:11999 [sender.py:send_request_defer():613] handle sender defer: 5
|
89 |
+
2024-08-04 14:40:10,881 INFO SenderThread:11999 [sender.py:transition_state():617] send defer: 6
|
90 |
+
2024-08-04 14:40:10,881 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: defer
|
91 |
+
2024-08-04 14:40:10,881 INFO HandlerThread:11999 [handler.py:handle_request_defer():172] handle defer: 6
|
92 |
+
2024-08-04 14:40:10,881 DEBUG SenderThread:11999 [sender.py:send_request():409] send_request: defer
|
93 |
+
2024-08-04 14:40:10,881 INFO SenderThread:11999 [sender.py:send_request_defer():613] handle sender defer: 6
|
94 |
+
2024-08-04 14:40:10,884 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: status_report
|
95 |
+
2024-08-04 14:40:11,083 INFO SenderThread:11999 [sender.py:transition_state():617] send defer: 7
|
96 |
+
2024-08-04 14:40:11,083 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: defer
|
97 |
+
2024-08-04 14:40:11,083 INFO HandlerThread:11999 [handler.py:handle_request_defer():172] handle defer: 7
|
98 |
+
2024-08-04 14:40:11,083 DEBUG SenderThread:11999 [sender.py:send_request():409] send_request: defer
|
99 |
+
2024-08-04 14:40:11,083 INFO SenderThread:11999 [sender.py:send_request_defer():613] handle sender defer: 7
|
100 |
+
2024-08-04 14:40:11,113 INFO Thread-12 :11999 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_144007-dds6qqbt/files/output.log
|
101 |
+
2024-08-04 14:40:11,114 INFO Thread-12 :11999 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_144007-dds6qqbt/files/config.yaml
|
102 |
+
2024-08-04 14:40:11,114 INFO Thread-12 :11999 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_144007-dds6qqbt/files/wandb-summary.json
|
103 |
+
2024-08-04 14:40:11,842 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: poll_exit
|
104 |
+
2024-08-04 14:40:12,953 INFO SenderThread:11999 [sender.py:transition_state():617] send defer: 8
|
105 |
+
2024-08-04 14:40:12,953 DEBUG SenderThread:11999 [sender.py:send_request():409] send_request: poll_exit
|
106 |
+
2024-08-04 14:40:12,953 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: defer
|
107 |
+
2024-08-04 14:40:12,954 INFO HandlerThread:11999 [handler.py:handle_request_defer():172] handle defer: 8
|
108 |
+
2024-08-04 14:40:12,954 DEBUG SenderThread:11999 [sender.py:send_request():409] send_request: defer
|
109 |
+
2024-08-04 14:40:12,954 INFO SenderThread:11999 [sender.py:send_request_defer():613] handle sender defer: 8
|
110 |
+
2024-08-04 14:40:12,954 INFO SenderThread:11999 [job_builder.py:build():296] Attempting to build job artifact
|
111 |
+
2024-08-04 14:40:12,955 INFO SenderThread:11999 [job_builder.py:_get_source_type():426] is repo sourced job
|
112 |
+
2024-08-04 14:40:12,969 INFO SenderThread:11999 [job_builder.py:build():402] adding wandb-job metadata file
|
113 |
+
2024-08-04 14:40:12,987 INFO SenderThread:11999 [sender.py:transition_state():617] send defer: 9
|
114 |
+
2024-08-04 14:40:12,987 DEBUG SenderThread:11999 [sender.py:send():382] send: artifact
|
115 |
+
2024-08-04 14:40:12,988 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: defer
|
116 |
+
2024-08-04 14:40:12,989 INFO HandlerThread:11999 [handler.py:handle_request_defer():172] handle defer: 9
|
117 |
+
2024-08-04 14:40:13,115 INFO Thread-12 :11999 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_144007-dds6qqbt/files/output.log
|
118 |
+
2024-08-04 14:40:13,842 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: poll_exit
|
119 |
+
2024-08-04 14:40:13,848 INFO SenderThread:11999 [sender.py:send_artifact():1494] sent artifact job-https___github.com_cl-tohoku_llm-recipes-failab-m1-yans.git_examples_finetuning.py - {'id': 'QXJ0aWZhY3Q6MTA5MTk2NTkzOA==', 'state': 'COMMITTED', 'artifactSequence': {'id': 'QXJ0aWZhY3RDb2xsZWN0aW9uOjM2MjY3MjMzNA==', 'latestArtifact': {'id': 'QXJ0aWZhY3Q6MTA5MzUzODM4NQ==', 'versionIndex': 3}}}
|
120 |
+
2024-08-04 14:40:13,848 DEBUG SenderThread:11999 [sender.py:send_request():409] send_request: defer
|
121 |
+
2024-08-04 14:40:13,848 INFO SenderThread:11999 [sender.py:send_request_defer():613] handle sender defer: 9
|
122 |
+
2024-08-04 14:40:13,848 INFO SenderThread:11999 [dir_watcher.py:finish():358] shutting down directory watcher
|
123 |
+
2024-08-04 14:40:14,116 INFO SenderThread:11999 [dir_watcher.py:finish():388] scan: /project/wandb/run-20240804_144007-dds6qqbt/files
|
124 |
+
2024-08-04 14:40:14,116 INFO SenderThread:11999 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_144007-dds6qqbt/files/requirements.txt requirements.txt
|
125 |
+
2024-08-04 14:40:14,116 INFO SenderThread:11999 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_144007-dds6qqbt/files/config.yaml config.yaml
|
126 |
+
2024-08-04 14:40:14,118 INFO SenderThread:11999 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_144007-dds6qqbt/files/wandb-metadata.json wandb-metadata.json
|
127 |
+
2024-08-04 14:40:14,118 INFO SenderThread:11999 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_144007-dds6qqbt/files/wandb-summary.json wandb-summary.json
|
128 |
+
2024-08-04 14:40:14,119 INFO SenderThread:11999 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_144007-dds6qqbt/files/output.log output.log
|
129 |
+
2024-08-04 14:40:14,121 INFO SenderThread:11999 [sender.py:transition_state():617] send defer: 10
|
130 |
+
2024-08-04 14:40:14,121 DEBUG SenderThread:11999 [sender.py:send_request():409] send_request: poll_exit
|
131 |
+
2024-08-04 14:40:14,121 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: defer
|
132 |
+
2024-08-04 14:40:14,121 INFO HandlerThread:11999 [handler.py:handle_request_defer():172] handle defer: 10
|
133 |
+
2024-08-04 14:40:14,123 DEBUG SenderThread:11999 [sender.py:send_request():409] send_request: defer
|
134 |
+
2024-08-04 14:40:14,123 INFO SenderThread:11999 [sender.py:send_request_defer():613] handle sender defer: 10
|
135 |
+
2024-08-04 14:40:14,123 INFO SenderThread:11999 [file_pusher.py:finish():172] shutting down file pusher
|
136 |
+
2024-08-04 14:40:14,515 INFO wandb-upload_0:11999 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_144007-dds6qqbt/files/requirements.txt
|
137 |
+
2024-08-04 14:40:14,617 INFO wandb-upload_1:11999 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_144007-dds6qqbt/files/config.yaml
|
138 |
+
2024-08-04 14:40:14,698 INFO wandb-upload_2:11999 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_144007-dds6qqbt/files/wandb-summary.json
|
139 |
+
2024-08-04 14:40:14,843 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: poll_exit
|
140 |
+
2024-08-04 14:40:14,843 DEBUG SenderThread:11999 [sender.py:send_request():409] send_request: poll_exit
|
141 |
+
2024-08-04 14:40:15,184 INFO wandb-upload_3:11999 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_144007-dds6qqbt/files/output.log
|
142 |
+
2024-08-04 14:40:15,384 INFO Thread-11 (_thread_body):11999 [sender.py:transition_state():617] send defer: 11
|
143 |
+
2024-08-04 14:40:15,384 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: defer
|
144 |
+
2024-08-04 14:40:15,385 INFO HandlerThread:11999 [handler.py:handle_request_defer():172] handle defer: 11
|
145 |
+
2024-08-04 14:40:15,385 DEBUG SenderThread:11999 [sender.py:send_request():409] send_request: defer
|
146 |
+
2024-08-04 14:40:15,385 INFO SenderThread:11999 [sender.py:send_request_defer():613] handle sender defer: 11
|
147 |
+
2024-08-04 14:40:15,385 INFO SenderThread:11999 [file_pusher.py:join():178] waiting for file pusher
|
148 |
+
2024-08-04 14:40:15,385 INFO SenderThread:11999 [sender.py:transition_state():617] send defer: 12
|
149 |
+
2024-08-04 14:40:15,385 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: defer
|
150 |
+
2024-08-04 14:40:15,385 INFO HandlerThread:11999 [handler.py:handle_request_defer():172] handle defer: 12
|
151 |
+
2024-08-04 14:40:15,385 DEBUG SenderThread:11999 [sender.py:send_request():409] send_request: defer
|
152 |
+
2024-08-04 14:40:15,386 INFO SenderThread:11999 [sender.py:send_request_defer():613] handle sender defer: 12
|
153 |
+
2024-08-04 14:40:15,386 INFO SenderThread:11999 [file_stream.py:finish():595] file stream finish called
|
154 |
+
2024-08-04 14:40:15,573 INFO SenderThread:11999 [file_stream.py:finish():599] file stream finish is done
|
155 |
+
2024-08-04 14:40:15,573 INFO SenderThread:11999 [sender.py:transition_state():617] send defer: 13
|
156 |
+
2024-08-04 14:40:15,573 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: defer
|
157 |
+
2024-08-04 14:40:15,573 INFO HandlerThread:11999 [handler.py:handle_request_defer():172] handle defer: 13
|
158 |
+
2024-08-04 14:40:15,573 DEBUG SenderThread:11999 [sender.py:send_request():409] send_request: defer
|
159 |
+
2024-08-04 14:40:15,574 INFO SenderThread:11999 [sender.py:send_request_defer():613] handle sender defer: 13
|
160 |
+
2024-08-04 14:40:15,574 INFO SenderThread:11999 [sender.py:transition_state():617] send defer: 14
|
161 |
+
2024-08-04 14:40:15,574 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: defer
|
162 |
+
2024-08-04 14:40:15,574 DEBUG SenderThread:11999 [sender.py:send():382] send: final
|
163 |
+
2024-08-04 14:40:15,574 INFO HandlerThread:11999 [handler.py:handle_request_defer():172] handle defer: 14
|
164 |
+
2024-08-04 14:40:15,574 DEBUG SenderThread:11999 [sender.py:send():382] send: footer
|
165 |
+
2024-08-04 14:40:15,574 DEBUG SenderThread:11999 [sender.py:send_request():409] send_request: defer
|
166 |
+
2024-08-04 14:40:15,574 INFO SenderThread:11999 [sender.py:send_request_defer():613] handle sender defer: 14
|
167 |
+
2024-08-04 14:40:15,575 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: poll_exit
|
168 |
+
2024-08-04 14:40:15,575 DEBUG SenderThread:11999 [sender.py:send_request():409] send_request: poll_exit
|
169 |
+
2024-08-04 14:40:15,575 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: poll_exit
|
170 |
+
2024-08-04 14:40:15,576 DEBUG SenderThread:11999 [sender.py:send_request():409] send_request: poll_exit
|
171 |
+
2024-08-04 14:40:15,576 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: server_info
|
172 |
+
2024-08-04 14:40:15,576 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: get_summary
|
173 |
+
2024-08-04 14:40:15,576 DEBUG SenderThread:11999 [sender.py:send_request():409] send_request: server_info
|
174 |
+
2024-08-04 14:40:15,578 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: sampled_history
|
175 |
+
2024-08-04 14:40:15,578 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: internal_messages
|
176 |
+
2024-08-04 14:40:15,578 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: job_info
|
177 |
+
2024-08-04 14:40:15,734 DEBUG SenderThread:11999 [sender.py:send_request():409] send_request: job_info
|
178 |
+
2024-08-04 14:40:15,735 INFO MainThread:11999 [wandb_run.py:_footer_history_summary_info():3866] rendering history
|
179 |
+
2024-08-04 14:40:15,735 INFO MainThread:11999 [wandb_run.py:_footer_history_summary_info():3898] rendering summary
|
180 |
+
2024-08-04 14:40:15,735 INFO MainThread:11999 [wandb_run.py:_footer_sync_info():3825] logging synced files
|
181 |
+
2024-08-04 14:40:15,735 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: shutdown
|
182 |
+
2024-08-04 14:40:15,735 INFO HandlerThread:11999 [handler.py:finish():869] shutting down handler
|
183 |
+
2024-08-04 14:40:16,578 INFO WriterThread:11999 [datastore.py:close():296] close: /project/wandb/run-20240804_144007-dds6qqbt/run-dds6qqbt.wandb
|
184 |
+
2024-08-04 14:40:16,735 INFO SenderThread:11999 [sender.py:finish():1572] shutting down sender
|
185 |
+
2024-08-04 14:40:16,735 INFO SenderThread:11999 [file_pusher.py:finish():172] shutting down file pusher
|
186 |
+
2024-08-04 14:40:16,735 INFO SenderThread:11999 [file_pusher.py:join():178] waiting for file pusher
|
wandb/run-20240804_144007-dds6qqbt/logs/debug.log
ADDED
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
2024-08-04 14:40:07,600 INFO MainThread:11928 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
|
2 |
+
2024-08-04 14:40:07,601 INFO MainThread:11928 [wandb_setup.py:_flush():76] Configure stats pid to 11928
|
3 |
+
2024-08-04 14:40:07,601 INFO MainThread:11928 [wandb_setup.py:_flush():76] Loading settings from /singularity_home/.config/wandb/settings
|
4 |
+
2024-08-04 14:40:07,601 INFO MainThread:11928 [wandb_setup.py:_flush():76] Loading settings from /project/wandb/settings
|
5 |
+
2024-08-04 14:40:07,601 INFO MainThread:11928 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'api_key': '***REDACTED***', 'run_notes': 'Train tiny llama sample'}
|
6 |
+
2024-08-04 14:40:07,601 INFO MainThread:11928 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
|
7 |
+
2024-08-04 14:40:07,601 INFO MainThread:11928 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'examples/finetuning.py', 'program_abspath': '/project/examples/finetuning.py', 'program': '/project/examples/finetuning.py'}
|
8 |
+
2024-08-04 14:40:07,601 INFO MainThread:11928 [wandb_init.py:_log_setup():526] Logging user logs to /project/wandb/run-20240804_144007-dds6qqbt/logs/debug.log
|
9 |
+
2024-08-04 14:40:07,601 INFO MainThread:11928 [wandb_init.py:_log_setup():527] Logging internal logs to /project/wandb/run-20240804_144007-dds6qqbt/logs/debug-internal.log
|
10 |
+
2024-08-04 14:40:07,601 INFO MainThread:11928 [wandb_init.py:init():566] calling init triggers
|
11 |
+
2024-08-04 14:40:07,601 INFO MainThread:11928 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
|
12 |
+
config: {'sharding_strategy': 'FULL_SHARD', 'checkpoint_type': 'LOCAL_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'low_cpu_fsdp': False, 'no_meta_device': False, 'data_path': None, 'split': '969, 30, 1', 'train_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'valid_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'test_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'data_cache_path': None, 'vocab_size': None, 'vocab_file': None, 'merge_file': None, 'seq_length': 512, 'num_workers': 2, 'tokenizer_type': 'Llama2Tokenizer', 'tokenizer_model': '/share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'retro_return_doc_ids': False, 'short_seq_prob': 0.1, 'vocab_extra_ids': 0, 'seed': 1234, 'use_mpi': False, 'wandb_entity': 'iwakawa-koichi-q5-tohoku-nlp6723', 'wandb_name': 'tiny-llama_train_2024-08-04-14:39:57', 'wandb_project': 'llm_tutorial', 'quantization': False, 'use_freeze_layers': False, 'freeze_layers': None, 'bf16': True, 'fp16': False, 'mixed_precision': True, 'param_dtype': None, 'load': '/work/llm_recipes/models/tiny-llama', 'save': '/work/llm_recipes/models/tiny-llama', 'base_model': '/share/pretrained_lm/meta-llama/TinyLlama_v1.1', 'use_better_transformer': False, 'grad_clip_norm': 1.0, 'eval_interval': 200, 'save_interval': 200, 'eval_iters': 10, 'optimizer': 'adam', 'lr': 2e-05, 'lr_decay_style': 'cosine', 'lr_decay_iters': 2000, 'lr_warmup_iters': 500, 'min_lr': 1e-06, 'train_iters': 2000, 'train_samples': None, 'global_batch_size': 320, 'micro_batch_size': 8, 'make_vocab_size_divisible_by': 128, 'sliding_window_size': 4096, 'skip_batch': None, 'no_save_optimizer_state': False, 'continual_pretraining': False, 'instruction_tuning': False, 'direct_preference_optimization': False, 'attention_dropout': 0.1, 'hidden_dropout': 0.1, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-06, 'hf_transformer_model_dir': None, 'instruction_train_data_path': None, 'instruction_valid_data_path': None, 'epoch': None, 'instruction_dataset_size': None, 'save_sampler_state': False, 'label_smoothing': 0.0, 'save_n_checkpoints': 10, 'hf_repo_id': 'koichi12/tiny-llama', 'create_public_hf_repo': False, 'upload_all_checkpoints_to_hf': False, 'hf_upload_retry_limit': 2, 'exit_duration_in_mins': None, 'source_key': None, 'target_key': None, 'attn_implementation': 'flash_attention_2', 'efficient_instruction_tuning': False, 'remove_padding_masking': False, 'save_start_iter': None, 'rank': 0, 'world_size': 1, 'padded_vocab_size': 32000, 'gradient_accumulation_steps': 40}
|
13 |
+
2024-08-04 14:40:07,602 INFO MainThread:11928 [wandb_init.py:init():616] starting backend
|
14 |
+
2024-08-04 14:40:07,602 INFO MainThread:11928 [wandb_init.py:init():620] setting up manager
|
15 |
+
2024-08-04 14:40:07,606 INFO MainThread:11928 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
|
16 |
+
2024-08-04 14:40:07,607 INFO MainThread:11928 [wandb_init.py:init():628] backend started and connected
|
17 |
+
2024-08-04 14:40:07,612 INFO MainThread:11928 [wandb_init.py:init():720] updated telemetry
|
18 |
+
2024-08-04 14:40:07,623 INFO MainThread:11928 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
|
19 |
+
2024-08-04 14:40:08,115 INFO MainThread:11928 [wandb_run.py:_on_init():2262] communicating current version
|
20 |
+
2024-08-04 14:40:08,197 INFO MainThread:11928 [wandb_run.py:_on_init():2271] got version response upgrade_message: "wandb version 0.17.5 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
|
21 |
+
|
22 |
+
2024-08-04 14:40:08,197 INFO MainThread:11928 [wandb_init.py:init():804] starting run threads in backend
|
23 |
+
2024-08-04 14:40:08,254 INFO MainThread:11928 [wandb_run.py:_console_start():2241] atexit reg
|
24 |
+
2024-08-04 14:40:08,254 INFO MainThread:11928 [wandb_run.py:_redirect():2096] redirect: wrap_raw
|
25 |
+
2024-08-04 14:40:08,254 INFO MainThread:11928 [wandb_run.py:_redirect():2161] Wrapping output streams.
|
26 |
+
2024-08-04 14:40:08,255 INFO MainThread:11928 [wandb_run.py:_redirect():2186] Redirects installed.
|
27 |
+
2024-08-04 14:40:08,255 INFO MainThread:11928 [wandb_init.py:init():847] run started, returning control to user process
|
28 |
+
2024-08-04 14:40:10,755 INFO MainThread:11928 [wandb_run.py:_config_callback():1343] config_cb None None {'activation_function': 'silu', 'hidden_size': 2048, 'model_type': 'llama', 'max_position_embeddings': 2048, 'num_attention_heads': 32, 'num_hidden_layers': 22, 'model_architecture': 'LlamaForCausalLM'}
|
29 |
+
2024-08-04 14:40:10,755 INFO MainThread:11928 [wandb_run.py:_config_callback():1343] config_cb None None {'world_size': 1}
|
30 |
+
2024-08-04 14:40:16,736 WARNING MsgRouterThr:11928 [router.py:message_loop():77] message_loop has been closed
|
wandb/run-20240804_144007-dds6qqbt/run-dds6qqbt.wandb
ADDED
Binary file (20.5 kB). View file
|
|
wandb/run-20240804_222226-kh5katc1/files/config.yaml
ADDED
@@ -0,0 +1,335 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
wandb_version: 1
|
2 |
+
|
3 |
+
sharding_strategy:
|
4 |
+
desc: null
|
5 |
+
value: FULL_SHARD
|
6 |
+
checkpoint_type:
|
7 |
+
desc: null
|
8 |
+
value: LOCAL_STATE_DICT
|
9 |
+
fsdp_activation_checkpointing:
|
10 |
+
desc: null
|
11 |
+
value: true
|
12 |
+
fsdp_cpu_offload:
|
13 |
+
desc: null
|
14 |
+
value: false
|
15 |
+
low_cpu_fsdp:
|
16 |
+
desc: null
|
17 |
+
value: false
|
18 |
+
no_meta_device:
|
19 |
+
desc: null
|
20 |
+
value: false
|
21 |
+
data_path:
|
22 |
+
desc: null
|
23 |
+
value: null
|
24 |
+
split:
|
25 |
+
desc: null
|
26 |
+
value: 969, 30, 1
|
27 |
+
train_data_path:
|
28 |
+
desc: null
|
29 |
+
value:
|
30 |
+
- '235289369'
|
31 |
+
- /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document
|
32 |
+
valid_data_path:
|
33 |
+
desc: null
|
34 |
+
value:
|
35 |
+
- '235289369'
|
36 |
+
- /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document
|
37 |
+
test_data_path:
|
38 |
+
desc: null
|
39 |
+
value:
|
40 |
+
- '235289369'
|
41 |
+
- /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document
|
42 |
+
data_cache_path:
|
43 |
+
desc: null
|
44 |
+
value: null
|
45 |
+
vocab_size:
|
46 |
+
desc: null
|
47 |
+
value: null
|
48 |
+
vocab_file:
|
49 |
+
desc: null
|
50 |
+
value: null
|
51 |
+
merge_file:
|
52 |
+
desc: null
|
53 |
+
value: null
|
54 |
+
seq_length:
|
55 |
+
desc: null
|
56 |
+
value: 4096
|
57 |
+
num_workers:
|
58 |
+
desc: null
|
59 |
+
value: 2
|
60 |
+
tokenizer_type:
|
61 |
+
desc: null
|
62 |
+
value: HFPreTrainedTokenizer
|
63 |
+
tokenizer_model:
|
64 |
+
desc: null
|
65 |
+
value: /share/pretrained_lm/google/gemma-2-2b
|
66 |
+
reset_position_ids:
|
67 |
+
desc: null
|
68 |
+
value: false
|
69 |
+
reset_attention_mask:
|
70 |
+
desc: null
|
71 |
+
value: false
|
72 |
+
eod_mask_loss:
|
73 |
+
desc: null
|
74 |
+
value: false
|
75 |
+
retro_return_doc_ids:
|
76 |
+
desc: null
|
77 |
+
value: false
|
78 |
+
short_seq_prob:
|
79 |
+
desc: null
|
80 |
+
value: 0.1
|
81 |
+
vocab_extra_ids:
|
82 |
+
desc: null
|
83 |
+
value: 0
|
84 |
+
seed:
|
85 |
+
desc: null
|
86 |
+
value: 1234
|
87 |
+
use_mpi:
|
88 |
+
desc: null
|
89 |
+
value: false
|
90 |
+
wandb_entity:
|
91 |
+
desc: null
|
92 |
+
value: iwakawa-koichi-q5-tohoku-nlp6723
|
93 |
+
wandb_name:
|
94 |
+
desc: null
|
95 |
+
value: yans-sample-gemma-2-2b_train_2024-08-04-22:22:15
|
96 |
+
wandb_project:
|
97 |
+
desc: null
|
98 |
+
value: llm_tutorial
|
99 |
+
quantization:
|
100 |
+
desc: null
|
101 |
+
value: false
|
102 |
+
use_freeze_layers:
|
103 |
+
desc: null
|
104 |
+
value: false
|
105 |
+
freeze_layers:
|
106 |
+
desc: null
|
107 |
+
value: null
|
108 |
+
bf16:
|
109 |
+
desc: null
|
110 |
+
value: true
|
111 |
+
fp16:
|
112 |
+
desc: null
|
113 |
+
value: false
|
114 |
+
mixed_precision:
|
115 |
+
desc: null
|
116 |
+
value: true
|
117 |
+
param_dtype:
|
118 |
+
desc: null
|
119 |
+
value: null
|
120 |
+
load:
|
121 |
+
desc: null
|
122 |
+
value: /work/llm_recipes/models/yans-sample-gemma-2-2b
|
123 |
+
save:
|
124 |
+
desc: null
|
125 |
+
value: /work/llm_recipes/models/yans-sample-gemma-2-2b
|
126 |
+
base_model:
|
127 |
+
desc: null
|
128 |
+
value: /share/pretrained_lm/google/gemma-2-2b
|
129 |
+
use_better_transformer:
|
130 |
+
desc: null
|
131 |
+
value: false
|
132 |
+
grad_clip_norm:
|
133 |
+
desc: null
|
134 |
+
value: 1.0
|
135 |
+
eval_interval:
|
136 |
+
desc: null
|
137 |
+
value: 200
|
138 |
+
save_interval:
|
139 |
+
desc: null
|
140 |
+
value: 200
|
141 |
+
eval_iters:
|
142 |
+
desc: null
|
143 |
+
value: 10
|
144 |
+
optimizer:
|
145 |
+
desc: null
|
146 |
+
value: anyprecision
|
147 |
+
lr:
|
148 |
+
desc: null
|
149 |
+
value: 2.0e-05
|
150 |
+
lr_decay_style:
|
151 |
+
desc: null
|
152 |
+
value: cosine
|
153 |
+
lr_decay_iters:
|
154 |
+
desc: null
|
155 |
+
value: 20000
|
156 |
+
lr_warmup_iters:
|
157 |
+
desc: null
|
158 |
+
value: 500
|
159 |
+
min_lr:
|
160 |
+
desc: null
|
161 |
+
value: 1.0e-06
|
162 |
+
train_iters:
|
163 |
+
desc: null
|
164 |
+
value: 20000
|
165 |
+
train_samples:
|
166 |
+
desc: null
|
167 |
+
value: null
|
168 |
+
global_batch_size:
|
169 |
+
desc: null
|
170 |
+
value: 320
|
171 |
+
micro_batch_size:
|
172 |
+
desc: null
|
173 |
+
value: 1
|
174 |
+
make_vocab_size_divisible_by:
|
175 |
+
desc: null
|
176 |
+
value: 128
|
177 |
+
sliding_window_size:
|
178 |
+
desc: null
|
179 |
+
value: 4096
|
180 |
+
skip_batch:
|
181 |
+
desc: null
|
182 |
+
value: null
|
183 |
+
no_save_optimizer_state:
|
184 |
+
desc: null
|
185 |
+
value: false
|
186 |
+
continual_pretraining:
|
187 |
+
desc: null
|
188 |
+
value: false
|
189 |
+
instruction_tuning:
|
190 |
+
desc: null
|
191 |
+
value: false
|
192 |
+
direct_preference_optimization:
|
193 |
+
desc: null
|
194 |
+
value: false
|
195 |
+
attention_dropout:
|
196 |
+
desc: null
|
197 |
+
value: 0.1
|
198 |
+
hidden_dropout:
|
199 |
+
desc: null
|
200 |
+
value: 0.1
|
201 |
+
weight_decay:
|
202 |
+
desc: null
|
203 |
+
value: 0.1
|
204 |
+
adam_beta1:
|
205 |
+
desc: null
|
206 |
+
value: 0.9
|
207 |
+
adam_beta2:
|
208 |
+
desc: null
|
209 |
+
value: 0.95
|
210 |
+
adam_eps:
|
211 |
+
desc: null
|
212 |
+
value: 1.0e-06
|
213 |
+
hf_transformer_model_dir:
|
214 |
+
desc: null
|
215 |
+
value: null
|
216 |
+
instruction_train_data_path:
|
217 |
+
desc: null
|
218 |
+
value: null
|
219 |
+
instruction_valid_data_path:
|
220 |
+
desc: null
|
221 |
+
value: null
|
222 |
+
epoch:
|
223 |
+
desc: null
|
224 |
+
value: null
|
225 |
+
instruction_dataset_size:
|
226 |
+
desc: null
|
227 |
+
value: null
|
228 |
+
save_sampler_state:
|
229 |
+
desc: null
|
230 |
+
value: false
|
231 |
+
label_smoothing:
|
232 |
+
desc: null
|
233 |
+
value: 0.0
|
234 |
+
save_n_checkpoints:
|
235 |
+
desc: null
|
236 |
+
value: 10
|
237 |
+
hf_repo_id:
|
238 |
+
desc: null
|
239 |
+
value: koichi12/yans-sample-gemma-2-2b
|
240 |
+
create_public_hf_repo:
|
241 |
+
desc: null
|
242 |
+
value: false
|
243 |
+
upload_all_checkpoints_to_hf:
|
244 |
+
desc: null
|
245 |
+
value: false
|
246 |
+
hf_upload_retry_limit:
|
247 |
+
desc: null
|
248 |
+
value: 2
|
249 |
+
exit_duration_in_mins:
|
250 |
+
desc: null
|
251 |
+
value: null
|
252 |
+
source_key:
|
253 |
+
desc: null
|
254 |
+
value: null
|
255 |
+
target_key:
|
256 |
+
desc: null
|
257 |
+
value: null
|
258 |
+
attn_implementation:
|
259 |
+
desc: null
|
260 |
+
value: flash_attention_2
|
261 |
+
efficient_instruction_tuning:
|
262 |
+
desc: null
|
263 |
+
value: false
|
264 |
+
remove_padding_masking:
|
265 |
+
desc: null
|
266 |
+
value: false
|
267 |
+
save_start_iter:
|
268 |
+
desc: null
|
269 |
+
value: null
|
270 |
+
rank:
|
271 |
+
desc: null
|
272 |
+
value: 0
|
273 |
+
world_size:
|
274 |
+
desc: null
|
275 |
+
value: 1
|
276 |
+
padded_vocab_size:
|
277 |
+
desc: null
|
278 |
+
value: 256000
|
279 |
+
gradient_accumulation_steps:
|
280 |
+
desc: null
|
281 |
+
value: 320
|
282 |
+
_wandb:
|
283 |
+
desc: null
|
284 |
+
value:
|
285 |
+
python_version: 3.10.12
|
286 |
+
cli_version: 0.16.3
|
287 |
+
framework: huggingface
|
288 |
+
huggingface_version: 4.43.3
|
289 |
+
is_jupyter_run: false
|
290 |
+
is_kaggle_kernel: false
|
291 |
+
start_time: 1722777746.267116
|
292 |
+
t:
|
293 |
+
1:
|
294 |
+
- 1
|
295 |
+
- 11
|
296 |
+
- 49
|
297 |
+
- 55
|
298 |
+
- 71
|
299 |
+
2:
|
300 |
+
- 1
|
301 |
+
- 11
|
302 |
+
- 49
|
303 |
+
- 55
|
304 |
+
- 71
|
305 |
+
3:
|
306 |
+
- 13
|
307 |
+
- 16
|
308 |
+
- 23
|
309 |
+
4: 3.10.12
|
310 |
+
5: 0.16.3
|
311 |
+
6: 4.43.3
|
312 |
+
8:
|
313 |
+
- 5
|
314 |
+
13: linux-x86_64
|
315 |
+
activation_function:
|
316 |
+
desc: null
|
317 |
+
value: gelu_pytorch_tanh
|
318 |
+
hidden_size:
|
319 |
+
desc: null
|
320 |
+
value: 2304
|
321 |
+
model_type:
|
322 |
+
desc: null
|
323 |
+
value: gemma2
|
324 |
+
max_position_embeddings:
|
325 |
+
desc: null
|
326 |
+
value: 4096
|
327 |
+
num_attention_heads:
|
328 |
+
desc: null
|
329 |
+
value: 8
|
330 |
+
num_hidden_layers:
|
331 |
+
desc: null
|
332 |
+
value: 26
|
333 |
+
model_architecture:
|
334 |
+
desc: null
|
335 |
+
value: Gemma2ForCausalLM
|
wandb/run-20240804_222226-kh5katc1/files/output.log
ADDED
@@ -0,0 +1,468 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Created Hugging Face repository with ID koichi12/yans-sample-gemma-2-2b.
|
2 |
+
Clearing GPU cache for all ranks
|
3 |
+
--> Running with torch torch_distributed debug set to detail
|
4 |
+
File not found: /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
|
5 |
+
Unable to read latest iteration from /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
|
6 |
+
File not found: /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
|
7 |
+
Unable to read latest iteration from /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
|
8 |
+
You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
|
9 |
+
|
10 |
+
Loading checkpoint shards: 67%|██████▋ | 2/3 [00:03<00:01, 1.62s/it]
|
11 |
+
File not found: /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
|
12 |
+
Unable to read latest iteration from /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
|
13 |
+
No checkpoint found in /work/llm_recipes/models/yans-sample-gemma-2-2b, skipping model loading
|
14 |
+
--> Model /share/pretrained_lm/google/gemma-2-2b
|
15 |
+
--> /share/pretrained_lm/google/gemma-2-2b has 2614.341888 Million params
|
16 |
+
BFloat16 enabled for mixed precision - using bfSixteen policy
|
17 |
+
--> applying fsdp activation checkpointing...
|
18 |
+
> datasets target sizes (minimum size):
|
19 |
+
train: 6400000
|
20 |
+
validation: 323200
|
21 |
+
test: 3200
|
22 |
+
> building train, validation, and test datasets for GPT ...
|
23 |
+
> finished creating GPT datasets ...
|
24 |
+
File not found: /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
|
25 |
+
Unable to read latest iteration from /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
|
26 |
+
No checkpoint found in /work/llm_recipes/models/yans-sample-gemma-2-2b, skipping optimizer loading
|
27 |
+
File not found: /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
|
28 |
+
Unable to read latest iteration from /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
|
29 |
+
model info: FullyShardedDataParallel(
|
30 |
+
(_fsdp_wrapped_module): Gemma2ForCausalLM(
|
31 |
+
(model): Gemma2Model(
|
32 |
+
(embed_tokens): Embedding(256000, 2304, padding_idx=0)
|
33 |
+
(layers): ModuleList(
|
34 |
+
(0-25): 26 x FullyShardedDataParallel(
|
35 |
+
(_fsdp_wrapped_module): CheckpointWrapper(
|
36 |
+
(_checkpoint_wrapped_module): Gemma2DecoderLayer(
|
37 |
+
(self_attn): Gemma2FlashAttention2(
|
38 |
+
(q_proj): Linear(in_features=2304, out_features=2048, bias=False)
|
39 |
+
(k_proj): Linear(in_features=2304, out_features=1024, bias=False)
|
40 |
+
(v_proj): Linear(in_features=2304, out_features=1024, bias=False)
|
41 |
+
(o_proj): Linear(in_features=2048, out_features=2304, bias=False)
|
42 |
+
(rotary_emb): Gemma2RotaryEmbedding()
|
43 |
+
)
|
44 |
+
(mlp): Gemma2MLP(
|
45 |
+
(gate_proj): Linear(in_features=2304, out_features=9216, bias=False)
|
46 |
+
(up_proj): Linear(in_features=2304, out_features=9216, bias=False)
|
47 |
+
(down_proj): Linear(in_features=9216, out_features=2304, bias=False)
|
48 |
+
(act_fn): PytorchGELUTanh()
|
49 |
+
)
|
50 |
+
(input_layernorm): Gemma2RMSNorm()
|
51 |
+
(post_attention_layernorm): Gemma2RMSNorm()
|
52 |
+
(pre_feedforward_layernorm): Gemma2RMSNorm()
|
53 |
+
(post_feedforward_layernorm): Gemma2RMSNorm()
|
54 |
+
)
|
55 |
+
)
|
56 |
+
)
|
57 |
+
)
|
58 |
+
(norm): Gemma2RMSNorm()
|
59 |
+
)
|
60 |
+
(lm_head): Linear(in_features=2304, out_features=256000, bias=False)
|
61 |
+
)
|
62 |
+
)
|
63 |
+
model config: Gemma2Config {
|
64 |
+
"_name_or_path": "/share/pretrained_lm/google/gemma-2-2b",
|
65 |
+
"architectures": [
|
66 |
+
"Gemma2ForCausalLM"
|
67 |
+
],
|
68 |
+
"attention_bias": false,
|
69 |
+
"attention_dropout": 0.0,
|
70 |
+
"attn_logit_softcapping": 50.0,
|
71 |
+
"bos_token_id": 2,
|
72 |
+
"cache_implementation": "hybrid",
|
73 |
+
"eos_token_id": 1,
|
74 |
+
"final_logit_softcapping": 30.0,
|
75 |
+
"head_dim": 256,
|
76 |
+
"hidden_act": "gelu_pytorch_tanh",
|
77 |
+
"hidden_activation": "gelu_pytorch_tanh",
|
78 |
+
"hidden_size": 2304,
|
79 |
+
"initializer_range": 0.02,
|
80 |
+
"intermediate_size": 9216,
|
81 |
+
"label_smoothing": 0.0,
|
82 |
+
"max_position_embeddings": 4096,
|
83 |
+
"model_type": "gemma2",
|
84 |
+
"num_attention_heads": 8,
|
85 |
+
"num_hidden_layers": 26,
|
86 |
+
"num_key_value_heads": 4,
|
87 |
+
"pad_token_id": 0,
|
88 |
+
"query_pre_attn_scalar": 256,
|
89 |
+
"rms_norm_eps": 1e-06,
|
90 |
+
"rope_theta": 10000.0,
|
91 |
+
"sliding_window": 4096,
|
92 |
+
"torch_dtype": "float32",
|
93 |
+
"transformers_version": "4.43.3",
|
94 |
+
"use_cache": false,
|
95 |
+
"vocab_size": 256000
|
96 |
+
Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00, 1.16s/it]
|
97 |
+
/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_init_utils.py:441: UserWarning: FSDP is switching to use `NO_SHARD` instead of ShardingStrategy.FULL_SHARD since the world size is 1.
|
98 |
+
warnings.warn(
|
99 |
+
Let split = None
|
100 |
+
Building a BlendedDataset for a single MegatronDataset
|
101 |
+
Unable to save the indexes because path_to_cache is None
|
102 |
+
Building a BlendedDataset for a single MegatronDataset
|
103 |
+
Unable to save the indexes because path_to_cache is None
|
104 |
+
Building a BlendedDataset for a single MegatronDataset
|
105 |
+
Unable to save the indexes because path_to_cache is None
|
106 |
+
It is strongly recommended to train Gemma2 models with the `eager` attention implementation instead of `flash_attention_2`. Use `eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`.
|
107 |
+
------------------------------------------------------------------
|
108 |
+
iteration: 1 , TFLOPS: 86.75197547568487, Tokens per sec: 5563.411303067021, Loss: 4.171908378601074
|
109 |
+
------------------------------------------------------------------
|
110 |
+
------------------------------------------------------------------
|
111 |
+
iteration: 2 , TFLOPS: 66.89933870537911, Tokens per sec: 4290.26007857923, Loss: 4.01677942276001
|
112 |
+
------------------------------------------------------------------
|
113 |
+
------------------------------------------------------------------
|
114 |
+
iteration: 3 , TFLOPS: 67.16726893555325, Tokens per sec: 4307.442466217215, Loss: 3.9401252269744873
|
115 |
+
------------------------------------------------------------------
|
116 |
+
------------------------------------------------------------------
|
117 |
+
iteration: 4 , TFLOPS: 67.25290490347041, Tokens per sec: 4312.934307864013, Loss: 3.754024028778076
|
118 |
+
------------------------------------------------------------------
|
119 |
+
------------------------------------------------------------------
|
120 |
+
iteration: 5 , TFLOPS: 67.291445985822, Tokens per sec: 4315.405950636545, Loss: 3.8183631896972656
|
121 |
+
------------------------------------------------------------------
|
122 |
+
------------------------------------------------------------------
|
123 |
+
iteration: 6 , TFLOPS: 67.19993814817916, Tokens per sec: 4309.537545502599, Loss: 3.913503408432007
|
124 |
+
------------------------------------------------------------------
|
125 |
+
------------------------------------------------------------------
|
126 |
+
iteration: 7 , TFLOPS: 67.30122810400093, Tokens per sec: 4316.033278677735, Loss: 3.851064682006836
|
127 |
+
------------------------------------------------------------------
|
128 |
+
------------------------------------------------------------------
|
129 |
+
iteration: 8 , TFLOPS: 67.16795653479824, Tokens per sec: 4307.486562013197, Loss: 3.6646127700805664
|
130 |
+
------------------------------------------------------------------
|
131 |
+
------------------------------------------------------------------
|
132 |
+
iteration: 9 , TFLOPS: 67.23016958415664, Tokens per sec: 4311.4762899715615, Loss: 3.7966654300689697
|
133 |
+
------------------------------------------------------------------
|
134 |
+
------------------------------------------------------------------
|
135 |
+
iteration: 10 , TFLOPS: 67.23271391538408, Tokens per sec: 4311.639458141876, Loss: 3.5526936054229736
|
136 |
+
------------------------------------------------------------------
|
137 |
+
------------------------------------------------------------------
|
138 |
+
iteration: 11 , TFLOPS: 67.17798338980677, Tokens per sec: 4308.129585047344, Loss: 3.6002132892608643
|
139 |
+
------------------------------------------------------------------
|
140 |
+
------------------------------------------------------------------
|
141 |
+
iteration: 12 , TFLOPS: 67.30360350705875, Tokens per sec: 4316.185613470676, Loss: 3.5705204010009766
|
142 |
+
------------------------------------------------------------------
|
143 |
+
------------------------------------------------------------------
|
144 |
+
iteration: 13 , TFLOPS: 67.13811947997524, Tokens per sec: 4305.573109240019, Loss: 3.5447990894317627
|
145 |
+
------------------------------------------------------------------
|
146 |
+
------------------------------------------------------------------
|
147 |
+
iteration: 14 , TFLOPS: 67.15854019228757, Tokens per sec: 4306.882691195614, Loss: 3.450416088104248
|
148 |
+
------------------------------------------------------------------
|
149 |
+
------------------------------------------------------------------
|
150 |
+
iteration: 15 , TFLOPS: 67.19845754951105, Tokens per sec: 4309.442594588568, Loss: 3.443570613861084
|
151 |
+
------------------------------------------------------------------
|
152 |
+
------------------------------------------------------------------
|
153 |
+
iteration: 16 , TFLOPS: 67.23455541812397, Tokens per sec: 4311.757553863634, Loss: 3.3366641998291016
|
154 |
+
------------------------------------------------------------------
|
155 |
+
------------------------------------------------------------------
|
156 |
+
iteration: 17 , TFLOPS: 67.30688001895524, Tokens per sec: 4316.395736447352, Loss: 3.332282066345215
|
157 |
+
------------------------------------------------------------------
|
158 |
+
------------------------------------------------------------------
|
159 |
+
iteration: 18 , TFLOPS: 67.36120902746241, Tokens per sec: 4319.879860219242, Loss: 3.34403395652771
|
160 |
+
------------------------------------------------------------------
|
161 |
+
------------------------------------------------------------------
|
162 |
+
iteration: 19 , TFLOPS: 67.26840440584516, Tokens per sec: 4313.928292222649, Loss: 3.256293773651123
|
163 |
+
------------------------------------------------------------------
|
164 |
+
------------------------------------------------------------------
|
165 |
+
iteration: 20 , TFLOPS: 67.17348341042366, Tokens per sec: 4307.8410010003945, Loss: 3.3122451305389404
|
166 |
+
------------------------------------------------------------------
|
167 |
+
------------------------------------------------------------------
|
168 |
+
iteration: 21 , TFLOPS: 67.2001168793811, Tokens per sec: 4309.54900754924, Loss: 3.2204227447509766
|
169 |
+
------------------------------------------------------------------
|
170 |
+
------------------------------------------------------------------
|
171 |
+
iteration: 22 , TFLOPS: 67.23699865533753, Tokens per sec: 4311.914238866545, Loss: 3.2488620281219482
|
172 |
+
------------------------------------------------------------------
|
173 |
+
------------------------------------------------------------------
|
174 |
+
iteration: 23 , TFLOPS: 67.2425865851171, Tokens per sec: 4312.272593261658, Loss: 3.163287401199341
|
175 |
+
------------------------------------------------------------------
|
176 |
+
------------------------------------------------------------------
|
177 |
+
iteration: 24 , TFLOPS: 67.21941753377986, Tokens per sec: 4310.786760098965, Loss: 3.2160401344299316
|
178 |
+
------------------------------------------------------------------
|
179 |
+
------------------------------------------------------------------
|
180 |
+
iteration: 25 , TFLOPS: 67.09871135713247, Tokens per sec: 4303.04586308967, Loss: 3.0935139656066895
|
181 |
+
------------------------------------------------------------------
|
182 |
+
------------------------------------------------------------------
|
183 |
+
iteration: 26 , TFLOPS: 67.20080576079224, Tokens per sec: 4309.593185570642, Loss: 3.047175168991089
|
184 |
+
------------------------------------------------------------------
|
185 |
+
------------------------------------------------------------------
|
186 |
+
iteration: 27 , TFLOPS: 67.27441115034365, Tokens per sec: 4314.313505240039, Loss: 3.0304696559906006
|
187 |
+
------------------------------------------------------------------
|
188 |
+
------------------------------------------------------------------
|
189 |
+
iteration: 28 , TFLOPS: 67.26365793583362, Tokens per sec: 4313.623900711482, Loss: 3.0319135189056396
|
190 |
+
------------------------------------------------------------------
|
191 |
+
------------------------------------------------------------------
|
192 |
+
iteration: 29 , TFLOPS: 67.16464708688589, Tokens per sec: 4307.27432684712, Loss: 2.959254264831543
|
193 |
+
------------------------------------------------------------------
|
194 |
+
------------------------------------------------------------------
|
195 |
+
iteration: 30 , TFLOPS: 67.3000542568793, Tokens per sec: 4315.957999765541, Loss: 2.913499116897583
|
196 |
+
------------------------------------------------------------------
|
197 |
+
------------------------------------------------------------------
|
198 |
+
iteration: 31 , TFLOPS: 67.18211917043104, Tokens per sec: 4308.3948129980145, Loss: 2.940014362335205
|
199 |
+
------------------------------------------------------------------
|
200 |
+
------------------------------------------------------------------
|
201 |
+
iteration: 32 , TFLOPS: 67.25841762372463, Tokens per sec: 4313.287839066096, Loss: 2.8469998836517334
|
202 |
+
------------------------------------------------------------------
|
203 |
+
------------------------------------------------------------------
|
204 |
+
iteration: 33 , TFLOPS: 67.33731321073192, Tokens per sec: 4318.347419532266, Loss: 2.829812526702881
|
205 |
+
------------------------------------------------------------------
|
206 |
+
------------------------------------------------------------------
|
207 |
+
iteration: 34 , TFLOPS: 67.24161982046462, Tokens per sec: 4312.210594565195, Loss: 2.8521993160247803
|
208 |
+
------------------------------------------------------------------
|
209 |
+
------------------------------------------------------------------
|
210 |
+
iteration: 35 , TFLOPS: 67.24248740627992, Tokens per sec: 4312.266232914695, Loss: 2.8338708877563477
|
211 |
+
------------------------------------------------------------------
|
212 |
+
------------------------------------------------------------------
|
213 |
+
iteration: 36 , TFLOPS: 67.24777489174788, Tokens per sec: 4312.60531979146, Loss: 2.787545680999756
|
214 |
+
------------------------------------------------------------------
|
215 |
+
------------------------------------------------------------------
|
216 |
+
iteration: 37 , TFLOPS: 67.30205154448893, Tokens per sec: 4316.086085983773, Loss: 2.81471848487854
|
217 |
+
------------------------------------------------------------------
|
218 |
+
------------------------------------------------------------------
|
219 |
+
iteration: 38 , TFLOPS: 67.13737290861587, Tokens per sec: 4305.525231557506, Loss: 2.7764387130737305
|
220 |
+
------------------------------------------------------------------
|
221 |
+
------------------------------------------------------------------
|
222 |
+
iteration: 39 , TFLOPS: 67.22735358248879, Tokens per sec: 4311.295699553621, Loss: 2.7642412185668945
|
223 |
+
------------------------------------------------------------------
|
224 |
+
------------------------------------------------------------------
|
225 |
+
iteration: 40 , TFLOPS: 67.26715109677696, Tokens per sec: 4313.847917409303, Loss: 2.7132599353790283
|
226 |
+
------------------------------------------------------------------
|
227 |
+
------------------------------------------------------------------
|
228 |
+
iteration: 41 , TFLOPS: 67.23918606123682, Tokens per sec: 4312.054517386288, Loss: 2.668989896774292
|
229 |
+
------------------------------------------------------------------
|
230 |
+
------------------------------------------------------------------
|
231 |
+
iteration: 42 , TFLOPS: 67.13128246048267, Tokens per sec: 4305.134650619155, Loss: 2.6973328590393066
|
232 |
+
------------------------------------------------------------------
|
233 |
+
------------------------------------------------------------------
|
234 |
+
iteration: 43 , TFLOPS: 67.23091373690416, Tokens per sec: 4311.524012548299, Loss: 2.685912609100342
|
235 |
+
------------------------------------------------------------------
|
236 |
+
------------------------------------------------------------------
|
237 |
+
iteration: 44 , TFLOPS: 67.27693115124784, Tokens per sec: 4314.475113104727, Loss: 2.662001371383667
|
238 |
+
------------------------------------------------------------------
|
239 |
+
------------------------------------------------------------------
|
240 |
+
iteration: 45 , TFLOPS: 67.27965002709941, Tokens per sec: 4314.649474836105, Loss: 2.6665873527526855
|
241 |
+
------------------------------------------------------------------
|
242 |
+
------------------------------------------------------------------
|
243 |
+
iteration: 46 , TFLOPS: 67.15514015419501, Tokens per sec: 4306.664646473851, Loss: 2.6501307487487793
|
244 |
+
------------------------------------------------------------------
|
245 |
+
------------------------------------------------------------------
|
246 |
+
iteration: 47 , TFLOPS: 67.2760527329066, Tokens per sec: 4314.418780064453, Loss: 2.6316823959350586
|
247 |
+
------------------------------------------------------------------
|
248 |
+
------------------------------------------------------------------
|
249 |
+
iteration: 48 , TFLOPS: 67.25548187637087, Tokens per sec: 4313.099569347494, Loss: 2.6278648376464844
|
250 |
+
------------------------------------------------------------------
|
251 |
+
------------------------------------------------------------------
|
252 |
+
iteration: 49 , TFLOPS: 67.35263957774154, Tokens per sec: 4319.330300705736, Loss: 2.6157166957855225
|
253 |
+
------------------------------------------------------------------
|
254 |
+
------------------------------------------------------------------
|
255 |
+
iteration: 50 , TFLOPS: 67.32408825677271, Tokens per sec: 4317.499302150089, Loss: 2.5965774059295654
|
256 |
+
------------------------------------------------------------------
|
257 |
+
------------------------------------------------------------------
|
258 |
+
iteration: 51 , TFLOPS: 67.1953666892378, Tokens per sec: 4309.244377465717, Loss: 2.578054904937744
|
259 |
+
------------------------------------------------------------------
|
260 |
+
------------------------------------------------------------------
|
261 |
+
iteration: 52 , TFLOPS: 67.25156682148656, Tokens per sec: 4312.848496556634, Loss: 2.5468966960906982
|
262 |
+
------------------------------------------------------------------
|
263 |
+
------------------------------------------------------------------
|
264 |
+
iteration: 53 , TFLOPS: 67.32404734871982, Tokens per sec: 4317.496678713301, Loss: 2.53428316116333
|
265 |
+
------------------------------------------------------------------
|
266 |
+
------------------------------------------------------------------
|
267 |
+
iteration: 54 , TFLOPS: 67.15867426285547, Tokens per sec: 4306.89128915213, Loss: 2.545722246170044
|
268 |
+
------------------------------------------------------------------
|
269 |
+
------------------------------------------------------------------
|
270 |
+
iteration: 55 , TFLOPS: 67.27601676163123, Tokens per sec: 4314.416473223611, Loss: 2.5279200077056885
|
271 |
+
------------------------------------------------------------------
|
272 |
+
------------------------------------------------------------------
|
273 |
+
iteration: 56 , TFLOPS: 67.19740155918589, Tokens per sec: 4309.374873842397, Loss: 2.534917116165161
|
274 |
+
------------------------------------------------------------------
|
275 |
+
------------------------------------------------------------------
|
276 |
+
iteration: 57 , TFLOPS: 67.2461120484207, Tokens per sec: 4312.498681512492, Loss: 2.5658233165740967
|
277 |
+
------------------------------------------------------------------
|
278 |
+
------------------------------------------------------------------
|
279 |
+
iteration: 58 , TFLOPS: 67.2920938769174, Tokens per sec: 4315.447499945635, Loss: 2.5472288131713867
|
280 |
+
------------------------------------------------------------------
|
281 |
+
------------------------------------------------------------------
|
282 |
+
iteration: 59 , TFLOPS: 67.27804058384706, Tokens per sec: 4314.546261108317, Loss: 2.4994900226593018
|
283 |
+
------------------------------------------------------------------
|
284 |
+
------------------------------------------------------------------
|
285 |
+
iteration: 60 , TFLOPS: 67.28150855801171, Tokens per sec: 4314.768662575956, Loss: 2.502976417541504
|
286 |
+
------------------------------------------------------------------
|
287 |
+
------------------------------------------------------------------
|
288 |
+
iteration: 61 , TFLOPS: 67.3506410671317, Tokens per sec: 4319.2021360563995, Loss: 2.5281176567077637
|
289 |
+
------------------------------------------------------------------
|
290 |
+
------------------------------------------------------------------
|
291 |
+
iteration: 62 , TFLOPS: 67.23894764547772, Tokens per sec: 4312.039227764101, Loss: 2.514285087585449
|
292 |
+
------------------------------------------------------------------
|
293 |
+
------------------------------------------------------------------
|
294 |
+
iteration: 63 , TFLOPS: 67.26110814707724, Tokens per sec: 4313.460382549388, Loss: 2.482907772064209
|
295 |
+
------------------------------------------------------------------
|
296 |
+
------------------------------------------------------------------
|
297 |
+
iteration: 64 , TFLOPS: 67.16648997644158, Tokens per sec: 4307.39251150549, Loss: 2.4810938835144043
|
298 |
+
------------------------------------------------------------------
|
299 |
+
------------------------------------------------------------------
|
300 |
+
iteration: 65 , TFLOPS: 67.13380749324574, Tokens per sec: 4305.2965811773665, Loss: 2.4889049530029297
|
301 |
+
------------------------------------------------------------------
|
302 |
+
------------------------------------------------------------------
|
303 |
+
iteration: 66 , TFLOPS: 67.29568135916668, Tokens per sec: 4315.677565476544, Loss: 2.4739832878112793
|
304 |
+
------------------------------------------------------------------
|
305 |
+
------------------------------------------------------------------
|
306 |
+
iteration: 67 , TFLOPS: 67.2353824902874, Tokens per sec: 4311.810594069316, Loss: 2.4979248046875
|
307 |
+
------------------------------------------------------------------
|
308 |
+
------------------------------------------------------------------
|
309 |
+
iteration: 68 , TFLOPS: 67.16737608801321, Tokens per sec: 4307.449337913261, Loss: 2.4705636501312256
|
310 |
+
------------------------------------------------------------------
|
311 |
+
------------------------------------------------------------------
|
312 |
+
iteration: 69 , TFLOPS: 67.17368447741053, Tokens per sec: 4307.853895442756, Loss: 2.431494951248169
|
313 |
+
------------------------------------------------------------------
|
314 |
+
------------------------------------------------------------------
|
315 |
+
iteration: 70 , TFLOPS: 67.27513003078525, Tokens per sec: 4314.3596071017255, Loss: 2.4638864994049072
|
316 |
+
------------------------------------------------------------------
|
317 |
+
------------------------------------------------------------------
|
318 |
+
iteration: 71 , TFLOPS: 67.13314091760232, Tokens per sec: 4305.253833626679, Loss: 2.4194881916046143
|
319 |
+
------------------------------------------------------------------
|
320 |
+
------------------------------------------------------------------
|
321 |
+
iteration: 72 , TFLOPS: 67.35945536468331, Tokens per sec: 4319.767397681375, Loss: 2.4741766452789307
|
322 |
+
------------------------------------------------------------------
|
323 |
+
------------------------------------------------------------------
|
324 |
+
iteration: 73 , TFLOPS: 67.22132247798172, Tokens per sec: 4310.908924326882, Loss: 2.438474416732788
|
325 |
+
------------------------------------------------------------------
|
326 |
+
------------------------------------------------------------------
|
327 |
+
iteration: 74 , TFLOPS: 67.20619442505729, Tokens per sec: 4309.9387610519625, Loss: 2.466714859008789
|
328 |
+
------------------------------------------------------------------
|
329 |
+
------------------------------------------------------------------
|
330 |
+
iteration: 75 , TFLOPS: 67.2254479385552, Tokens per sec: 4311.17349045185, Loss: 2.4174747467041016
|
331 |
+
------------------------------------------------------------------
|
332 |
+
------------------------------------------------------------------
|
333 |
+
iteration: 76 , TFLOPS: 67.24521841222351, Tokens per sec: 4312.441372549867, Loss: 2.424267053604126
|
334 |
+
------------------------------------------------------------------
|
335 |
+
------------------------------------------------------------------
|
336 |
+
iteration: 77 , TFLOPS: 67.22922395995721, Tokens per sec: 4311.415647014088, Loss: 2.404212474822998
|
337 |
+
------------------------------------------------------------------
|
338 |
+
------------------------------------------------------------------
|
339 |
+
iteration: 78 , TFLOPS: 67.23452652330809, Tokens per sec: 4311.755700836721, Loss: 2.450658082962036
|
340 |
+
------------------------------------------------------------------
|
341 |
+
------------------------------------------------------------------
|
342 |
+
iteration: 79 , TFLOPS: 67.0846114872016, Tokens per sec: 4302.141637274464, Loss: 2.4231417179107666
|
343 |
+
------------------------------------------------------------------
|
344 |
+
------------------------------------------------------------------
|
345 |
+
iteration: 80 , TFLOPS: 67.17704276320255, Tokens per sec: 4308.069262586061, Loss: 2.413994312286377
|
346 |
+
------------------------------------------------------------------
|
347 |
+
------------------------------------------------------------------
|
348 |
+
iteration: 81 , TFLOPS: 67.2345689529718, Tokens per sec: 4311.758421854535, Loss: 2.4133667945861816
|
349 |
+
------------------------------------------------------------------
|
350 |
+
------------------------------------------------------------------
|
351 |
+
iteration: 82 , TFLOPS: 67.18505033340458, Tokens per sec: 4308.582788719936, Loss: 2.389362335205078
|
352 |
+
------------------------------------------------------------------
|
353 |
+
------------------------------------------------------------------
|
354 |
+
iteration: 83 , TFLOPS: 67.28162310992364, Tokens per sec: 4314.776008799464, Loss: 2.4374401569366455
|
355 |
+
------------------------------------------------------------------
|
356 |
+
------------------------------------------------------------------
|
357 |
+
iteration: 84 , TFLOPS: 67.2334157092426, Tokens per sec: 4311.684464239587, Loss: 2.3909661769866943
|
358 |
+
------------------------------------------------------------------
|
359 |
+
------------------------------------------------------------------
|
360 |
+
iteration: 85 , TFLOPS: 67.31368056601009, Tokens per sec: 4316.831856087792, Loss: 2.411787748336792
|
361 |
+
------------------------------------------------------------------
|
362 |
+
------------------------------------------------------------------
|
363 |
+
iteration: 86 , TFLOPS: 67.11865914241415, Tokens per sec: 4304.325116195997, Loss: 2.4398515224456787
|
364 |
+
------------------------------------------------------------------
|
365 |
+
------------------------------------------------------------------
|
366 |
+
iteration: 87 , TFLOPS: 67.24083693352927, Tokens per sec: 4312.160387961816, Loss: 2.3902275562286377
|
367 |
+
------------------------------------------------------------------
|
368 |
+
------------------------------------------------------------------
|
369 |
+
iteration: 88 , TFLOPS: 67.3222851144248, Tokens per sec: 4317.383666483415, Loss: 2.3877973556518555
|
370 |
+
------------------------------------------------------------------
|
371 |
+
------------------------------------------------------------------
|
372 |
+
iteration: 89 , TFLOPS: 67.14511488288893, Tokens per sec: 4306.021725002672, Loss: 2.376176357269287
|
373 |
+
------------------------------------------------------------------
|
374 |
+
------------------------------------------------------------------
|
375 |
+
iteration: 90 , TFLOPS: 67.29125521000229, Tokens per sec: 4315.3937161675785, Loss: 2.3973848819732666
|
376 |
+
------------------------------------------------------------------
|
377 |
+
------------------------------------------------------------------
|
378 |
+
iteration: 91 , TFLOPS: 67.1356528047859, Tokens per sec: 4305.414921157799, Loss: 2.388991355895996
|
379 |
+
------------------------------------------------------------------
|
380 |
+
------------------------------------------------------------------
|
381 |
+
iteration: 92 , TFLOPS: 67.25754211457983, Tokens per sec: 4313.231692592827, Loss: 2.383312463760376
|
382 |
+
------------------------------------------------------------------
|
383 |
+
------------------------------------------------------------------
|
384 |
+
iteration: 93 , TFLOPS: 67.15498729921683, Tokens per sec: 4306.654843871562, Loss: 2.3923604488372803
|
385 |
+
------------------------------------------------------------------
|
386 |
+
------------------------------------------------------------------
|
387 |
+
iteration: 94 , TFLOPS: 67.32478814446085, Tokens per sec: 4317.544186004938, Loss: 2.3716728687286377
|
388 |
+
------------------------------------------------------------------
|
389 |
+
------------------------------------------------------------------
|
390 |
+
iteration: 95 , TFLOPS: 67.3161465459375, Tokens per sec: 4316.989999582809, Loss: 2.405150890350342
|
391 |
+
------------------------------------------------------------------
|
392 |
+
------------------------------------------------------------------
|
393 |
+
iteration: 96 , TFLOPS: 67.20162737067454, Tokens per sec: 4309.645875479786, Loss: 2.365361213684082
|
394 |
+
------------------------------------------------------------------
|
395 |
+
------------------------------------------------------------------
|
396 |
+
iteration: 97 , TFLOPS: 67.17173577081181, Tokens per sec: 4307.728924728738, Loss: 2.3839645385742188
|
397 |
+
------------------------------------------------------------------
|
398 |
+
------------------------------------------------------------------
|
399 |
+
iteration: 98 , TFLOPS: 67.20004987934048, Tokens per sec: 4309.544710831139, Loss: 2.3723373413085938
|
400 |
+
------------------------------------------------------------------
|
401 |
+
------------------------------------------------------------------
|
402 |
+
iteration: 99 , TFLOPS: 67.30991336059388, Tokens per sec: 4316.590264895447, Loss: 2.3913819789886475
|
403 |
+
------------------------------------------------------------------
|
404 |
+
------------------------------------------------------------------
|
405 |
+
iteration: 100 , TFLOPS: 67.23987549288418, Tokens per sec: 4312.098730694383, Loss: 2.3768458366394043
|
406 |
+
------------------------------------------------------------------
|
407 |
+
------------------------------------------------------------------
|
408 |
+
iteration: 101 , TFLOPS: 67.33907694033823, Tokens per sec: 4318.460527656589, Loss: 2.3836305141448975
|
409 |
+
------------------------------------------------------------------
|
410 |
+
------------------------------------------------------------------
|
411 |
+
iteration: 102 , TFLOPS: 67.30975607840512, Tokens per sec: 4316.580178375781, Loss: 2.3950178623199463
|
412 |
+
------------------------------------------------------------------
|
413 |
+
------------------------------------------------------------------
|
414 |
+
iteration: 103 , TFLOPS: 67.1982354002556, Tokens per sec: 4309.428348138593, Loss: 2.361278772354126
|
415 |
+
------------------------------------------------------------------
|
416 |
+
------------------------------------------------------------------
|
417 |
+
iteration: 104 , TFLOPS: 67.20376894334782, Tokens per sec: 4309.783214710986, Loss: 2.3559556007385254
|
418 |
+
------------------------------------------------------------------
|
419 |
+
------------------------------------------------------------------
|
420 |
+
iteration: 105 , TFLOPS: 67.23013357946196, Tokens per sec: 4311.47398098754, Loss: 2.349632740020752
|
421 |
+
------------------------------------------------------------------
|
422 |
+
------------------------------------------------------------------
|
423 |
+
iteration: 106 , TFLOPS: 67.23129147862021, Tokens per sec: 4311.548237155534, Loss: 2.379448652267456
|
424 |
+
------------------------------------------------------------------
|
425 |
+
------------------------------------------------------------------
|
426 |
+
iteration: 107 , TFLOPS: 67.16429762559119, Tokens per sec: 4307.251915865627, Loss: 2.4072415828704834
|
427 |
+
------------------------------------------------------------------
|
428 |
+
------------------------------------------------------------------
|
429 |
+
iteration: 108 , TFLOPS: 67.26025670890765, Tokens per sec: 4313.405779749734, Loss: 2.3945987224578857
|
430 |
+
------------------------------------------------------------------
|
431 |
+
------------------------------------------------------------------
|
432 |
+
iteration: 109 , TFLOPS: 67.13558642664209, Tokens per sec: 4305.410664321992, Loss: 2.3535115718841553
|
433 |
+
------------------------------------------------------------------
|
434 |
+
------------------------------------------------------------------
|
435 |
+
iteration: 110 , TFLOPS: 67.27982379366702, Tokens per sec: 4314.660618500338, Loss: 2.3627665042877197
|
436 |
+
------------------------------------------------------------------
|
437 |
+
------------------------------------------------------------------
|
438 |
+
iteration: 111 , TFLOPS: 67.26391532288811, Tokens per sec: 4313.640406964398, Loss: 2.3859591484069824
|
439 |
+
------------------------------------------------------------------
|
440 |
+
------------------------------------------------------------------
|
441 |
+
iteration: 112 , TFLOPS: 67.27053505647855, Tokens per sec: 4314.064931022535, Loss: 2.3465442657470703
|
442 |
+
------------------------------------------------------------------
|
443 |
+
------------------------------------------------------------------
|
444 |
+
iteration: 113 , TFLOPS: 67.22654753561278, Tokens per sec: 4311.244007701346, Loss: 2.396284818649292
|
445 |
+
------------------------------------------------------------------
|
446 |
+
------------------------------------------------------------------
|
447 |
+
iteration: 114 , TFLOPS: 67.12289176484347, Tokens per sec: 4304.596554619569, Loss: 2.3716585636138916
|
448 |
+
------------------------------------------------------------------
|
449 |
+
------------------------------------------------------------------
|
450 |
+
iteration: 115 , TFLOPS: 67.13262769476694, Tokens per sec: 4305.220920604149, Loss: 2.3369154930114746
|
451 |
+
------------------------------------------------------------------
|
452 |
+
------------------------------------------------------------------
|
453 |
+
iteration: 116 , TFLOPS: 67.17146478693049, Tokens per sec: 4307.711546510201, Loss: 2.302396535873413
|
454 |
+
------------------------------------------------------------------
|
455 |
+
Traceback (most recent call last):
|
456 |
+
File "/project/examples/finetuning.py", line 13, in <module>
|
457 |
+
main()
|
458 |
+
File "/project/src/llama_recipes/finetuning.py", line 281, in main
|
459 |
+
train(
|
460 |
+
File "/project/src/llama_recipes/utils/train_utils.py", line 118, in train
|
461 |
+
loss.backward()
|
462 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/_tensor.py", line 522, in backward
|
463 |
+
torch.autograd.backward(
|
464 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/autograd/__init__.py", line 267, in backward
|
465 |
+
_engine_run_backward(
|
466 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/autograd/graph.py", line 681, in _engine_run_backward
|
467 |
+
return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
|
468 |
+
KeyboardInterrupt
|
wandb/run-20240804_222226-kh5katc1/files/requirements.txt
ADDED
@@ -0,0 +1,271 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
absl-py==2.1.0
|
2 |
+
accelerate==0.33.0
|
3 |
+
aiohttp==3.9.1
|
4 |
+
aiosignal==1.3.1
|
5 |
+
annotated-types==0.6.0
|
6 |
+
apex==0.1
|
7 |
+
appdirs==1.4.4
|
8 |
+
argon2-cffi-bindings==21.2.0
|
9 |
+
argon2-cffi==23.1.0
|
10 |
+
asttokens==2.4.1
|
11 |
+
astunparse==1.6.3
|
12 |
+
async-timeout==4.0.3
|
13 |
+
attrs==23.2.0
|
14 |
+
audioread==3.0.1
|
15 |
+
beautifulsoup4==4.12.3
|
16 |
+
bleach==6.1.0
|
17 |
+
blis==0.7.11
|
18 |
+
cachetools==5.3.2
|
19 |
+
catalogue==2.0.10
|
20 |
+
certifi==2024.2.2
|
21 |
+
cffi==1.16.0
|
22 |
+
charset-normalizer==3.3.2
|
23 |
+
click==8.1.7
|
24 |
+
cloudpathlib==0.16.0
|
25 |
+
cloudpickle==3.0.0
|
26 |
+
cmake==3.28.1
|
27 |
+
colorama==0.4.6
|
28 |
+
comm==0.2.1
|
29 |
+
confection==0.1.4
|
30 |
+
contourpy==1.2.0
|
31 |
+
cubinlinker==0.3.0+2.g405ac64
|
32 |
+
cuda-python==12.3.0rc4+9.gdb8c48a.dirty
|
33 |
+
cudf==23.12.0
|
34 |
+
cugraph-dgl==23.12.0
|
35 |
+
cugraph-service-client==23.12.0
|
36 |
+
cugraph-service-server==23.12.0
|
37 |
+
cugraph==23.12.0
|
38 |
+
cuml==23.12.0
|
39 |
+
cupy-cuda12x==12.3.0
|
40 |
+
cycler==0.12.1
|
41 |
+
cymem==2.0.8
|
42 |
+
cython==3.0.8
|
43 |
+
dask-cuda==23.12.0
|
44 |
+
dask-cudf==23.12.0
|
45 |
+
dask==2023.11.0
|
46 |
+
debugpy==1.8.1
|
47 |
+
decorator==5.1.1
|
48 |
+
defusedxml==0.7.1
|
49 |
+
distributed==2023.11.0
|
50 |
+
dm-tree==0.1.8
|
51 |
+
docker-pycreds==0.4.0
|
52 |
+
einops==0.7.0
|
53 |
+
exceptiongroup==1.2.0
|
54 |
+
execnet==2.0.2
|
55 |
+
executing==2.0.1
|
56 |
+
expecttest==0.1.3
|
57 |
+
fastjsonschema==2.19.1
|
58 |
+
fastrlock==0.8.2
|
59 |
+
filelock==3.13.1
|
60 |
+
flash-attn==2.4.2
|
61 |
+
fonttools==4.48.1
|
62 |
+
frozenlist==1.4.1
|
63 |
+
fsspec==2023.12.2
|
64 |
+
gast==0.5.4
|
65 |
+
gitdb==4.0.11
|
66 |
+
gitpython==3.1.43
|
67 |
+
google-auth-oauthlib==0.4.6
|
68 |
+
google-auth==2.27.0
|
69 |
+
graphsurgeon==0.4.6
|
70 |
+
grpcio==1.60.1
|
71 |
+
huggingface-hub==0.24.5
|
72 |
+
hypothesis==5.35.1
|
73 |
+
idna==3.6
|
74 |
+
importlib-metadata==7.0.1
|
75 |
+
iniconfig==2.0.0
|
76 |
+
intel-openmp==2021.4.0
|
77 |
+
ipadic==1.0.0
|
78 |
+
ipykernel==6.29.2
|
79 |
+
ipython-genutils==0.2.0
|
80 |
+
ipython==8.21.0
|
81 |
+
jedi==0.19.1
|
82 |
+
jinja2==3.1.3
|
83 |
+
joblib==1.3.2
|
84 |
+
json5==0.9.14
|
85 |
+
jsonnet==0.19.1
|
86 |
+
jsonschema-specifications==2023.12.1
|
87 |
+
jsonschema==4.21.1
|
88 |
+
jupyter-client==8.6.0
|
89 |
+
jupyter-core==5.7.1
|
90 |
+
jupyter-tensorboard==0.2.0
|
91 |
+
jupyterlab-pygments==0.3.0
|
92 |
+
jupyterlab-server==1.2.0
|
93 |
+
jupyterlab==2.3.2
|
94 |
+
jupytext==1.16.1
|
95 |
+
kiwisolver==1.4.5
|
96 |
+
langcodes==3.3.0
|
97 |
+
lazy-loader==0.3
|
98 |
+
librosa==0.10.1
|
99 |
+
llvmlite==0.40.1
|
100 |
+
locket==1.0.0
|
101 |
+
logzero==1.7.0
|
102 |
+
lxml==5.2.2
|
103 |
+
markdown-it-py==3.0.0
|
104 |
+
markdown==3.5.2
|
105 |
+
markupsafe==2.1.4
|
106 |
+
matplotlib-inline==0.1.6
|
107 |
+
matplotlib==3.8.2
|
108 |
+
mdit-py-plugins==0.4.0
|
109 |
+
mdurl==0.1.2
|
110 |
+
mecab-python3==1.0.6
|
111 |
+
mistune==3.0.2
|
112 |
+
mkl-devel==2021.1.1
|
113 |
+
mkl-include==2021.1.1
|
114 |
+
mkl==2021.1.1
|
115 |
+
mock==5.1.0
|
116 |
+
more-itertools==9.1.0
|
117 |
+
mpmath==1.3.0
|
118 |
+
msgpack==1.0.7
|
119 |
+
multidict==6.0.4
|
120 |
+
murmurhash==1.0.10
|
121 |
+
nbclient==0.9.0
|
122 |
+
nbconvert==7.16.0
|
123 |
+
nbformat==5.9.2
|
124 |
+
nest-asyncio==1.6.0
|
125 |
+
networkx==2.6.3
|
126 |
+
ninja==1.11.1.1
|
127 |
+
nltk==3.8.1
|
128 |
+
notebook==6.4.10
|
129 |
+
numba==0.57.1+1.g1ff679645
|
130 |
+
numpy==1.24.4
|
131 |
+
nvfuser==0.1.4a0+d0bb811
|
132 |
+
nvidia-dali-cuda120==1.34.0
|
133 |
+
nvidia-pyindex==1.0.9
|
134 |
+
nvtx==0.2.5
|
135 |
+
oauthlib==3.2.2
|
136 |
+
onnx==1.15.0rc2
|
137 |
+
opencv==4.7.0
|
138 |
+
optree==0.10.0
|
139 |
+
packaging==23.2
|
140 |
+
pandas==1.5.3
|
141 |
+
pandocfilters==1.5.1
|
142 |
+
parso==0.8.3
|
143 |
+
partd==1.4.1
|
144 |
+
peft==0.11.1
|
145 |
+
pexpect==4.9.0
|
146 |
+
pillow==10.2.0
|
147 |
+
pip==24.0
|
148 |
+
platformdirs==4.2.0
|
149 |
+
pluggy==1.4.0
|
150 |
+
ply==3.11
|
151 |
+
polygraphy==0.49.4
|
152 |
+
pooch==1.8.0
|
153 |
+
portalocker==2.10.1
|
154 |
+
preshed==3.0.9
|
155 |
+
prettytable==3.9.0
|
156 |
+
prometheus-client==0.19.0
|
157 |
+
prompt-toolkit==3.0.43
|
158 |
+
protobuf==4.24.4
|
159 |
+
psutil==5.9.4
|
160 |
+
ptxcompiler==0.8.1+2.g0d406d6
|
161 |
+
ptyprocess==0.7.0
|
162 |
+
pure-eval==0.2.2
|
163 |
+
pyarrow==14.0.1.dev0+gba5374836.d20240125
|
164 |
+
pyasn1-modules==0.3.0
|
165 |
+
pyasn1==0.5.1
|
166 |
+
pybind11-global==2.11.1
|
167 |
+
pybind11==2.11.1
|
168 |
+
pycocotools==2.0+nv0.8.0
|
169 |
+
pycparser==2.21
|
170 |
+
pydantic-core==2.16.2
|
171 |
+
pydantic==2.6.1
|
172 |
+
pygments==2.17.2
|
173 |
+
pylibcugraph==23.12.0
|
174 |
+
pylibcugraphops==23.12.0
|
175 |
+
pylibraft==23.12.0
|
176 |
+
pynvml==11.4.1
|
177 |
+
pyparsing==3.1.1
|
178 |
+
pytest-flakefinder==1.1.0
|
179 |
+
pytest-rerunfailures==13.0
|
180 |
+
pytest-shard==0.1.2
|
181 |
+
pytest-xdist==3.5.0
|
182 |
+
pytest==8.0.0
|
183 |
+
python-dateutil==2.8.2
|
184 |
+
python-dotenv==1.0.0
|
185 |
+
python-hostlist==1.23.0
|
186 |
+
pytorch-quantization==2.1.2
|
187 |
+
pytz==2023.3.post1
|
188 |
+
pyyaml==6.0.1
|
189 |
+
pyzmq==25.1.2
|
190 |
+
raft-dask==23.12.0
|
191 |
+
rapids-dask-dependency==23.12.1
|
192 |
+
referencing==0.33.0
|
193 |
+
regex==2023.12.25
|
194 |
+
requests-oauthlib==1.3.1
|
195 |
+
requests==2.31.0
|
196 |
+
rich==13.7.0
|
197 |
+
rmm==23.12.0
|
198 |
+
rpds-py==0.17.1
|
199 |
+
rsa==4.9
|
200 |
+
sacrebleu==2.4.0
|
201 |
+
safetensors==0.4.3
|
202 |
+
scikit-learn==1.2.0
|
203 |
+
scipy==1.12.0
|
204 |
+
send2trash==1.8.2
|
205 |
+
sentencepiece==0.1.99
|
206 |
+
sentry-sdk==2.12.0
|
207 |
+
setproctitle==1.3.3
|
208 |
+
setuptools==68.2.2
|
209 |
+
six==1.16.0
|
210 |
+
smart-open==6.4.0
|
211 |
+
smmap==5.0.1
|
212 |
+
sortedcontainers==2.4.0
|
213 |
+
soundfile==0.12.1
|
214 |
+
soupsieve==2.5
|
215 |
+
soxr==0.3.7
|
216 |
+
spacy-legacy==3.0.12
|
217 |
+
spacy-loggers==1.0.5
|
218 |
+
spacy==3.7.2
|
219 |
+
sphinx-glpi-theme==0.6
|
220 |
+
srsly==2.4.8
|
221 |
+
stack-data==0.6.3
|
222 |
+
sympy==1.12
|
223 |
+
tabulate==0.9.0
|
224 |
+
tbb==2021.11.0
|
225 |
+
tblib==3.0.0
|
226 |
+
tensorboard-data-server==0.6.1
|
227 |
+
tensorboard-plugin-wit==1.8.1
|
228 |
+
tensorboard==2.9.0
|
229 |
+
tensorrt==8.6.3
|
230 |
+
terminado==0.18.0
|
231 |
+
termplotlib==0.3.9
|
232 |
+
thinc==8.2.3
|
233 |
+
threadpoolctl==3.2.0
|
234 |
+
thriftpy2==0.4.17
|
235 |
+
tinycss2==1.2.1
|
236 |
+
tokenizers==0.19.1
|
237 |
+
toml==0.10.2
|
238 |
+
tomli==2.0.1
|
239 |
+
toolz==0.12.1
|
240 |
+
torch-tensorrt==2.3.0a0
|
241 |
+
torch==2.3.0a0+ebedce2
|
242 |
+
torchdata==0.7.1a0
|
243 |
+
torchtext==0.17.0a0
|
244 |
+
torchvision==0.18.0a0
|
245 |
+
tornado==6.4
|
246 |
+
tqdm==4.66.1
|
247 |
+
traitlets==5.9.0
|
248 |
+
transformer-engine==1.3.0+5b90b7f
|
249 |
+
transformers==4.43.3
|
250 |
+
treelite-runtime==3.9.1
|
251 |
+
treelite==3.9.1
|
252 |
+
triton==2.2.0+e28a256
|
253 |
+
typer==0.9.0
|
254 |
+
types-dataclasses==0.6.6
|
255 |
+
typing-extensions==4.9.0
|
256 |
+
ucx-py==0.35.0
|
257 |
+
uff==0.6.9
|
258 |
+
ujson==5.8.0
|
259 |
+
urllib3==1.26.18
|
260 |
+
wandb==0.16.3
|
261 |
+
wasabi==1.1.2
|
262 |
+
wcwidth==0.2.13
|
263 |
+
weasel==0.3.4
|
264 |
+
webencodings==0.5.1
|
265 |
+
werkzeug==3.0.1
|
266 |
+
wheel==0.42.0
|
267 |
+
xdoctest==1.0.2
|
268 |
+
xgboost==1.7.6
|
269 |
+
yarl==1.9.4
|
270 |
+
zict==3.0.0
|
271 |
+
zipp==3.17.0
|
wandb/run-20240804_222226-kh5katc1/files/wandb-metadata.json
ADDED
@@ -0,0 +1,215 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
|
3 |
+
"python": "3.10.12",
|
4 |
+
"heartbeatAt": "2024-08-04T13:22:26.872566",
|
5 |
+
"startedAt": "2024-08-04T13:22:26.250232",
|
6 |
+
"docker": null,
|
7 |
+
"cuda": null,
|
8 |
+
"args": [
|
9 |
+
"--seq-length",
|
10 |
+
"4096",
|
11 |
+
"--sliding-window-size",
|
12 |
+
"4096",
|
13 |
+
"--micro-batch-size",
|
14 |
+
"1",
|
15 |
+
"--global-batch-size",
|
16 |
+
"320",
|
17 |
+
"--train-iters",
|
18 |
+
"20000",
|
19 |
+
"--tokenizer-type",
|
20 |
+
"HFPreTrainedTokenizer",
|
21 |
+
"--tokenizer-model",
|
22 |
+
"/share/pretrained_lm/google/gemma-2-2b",
|
23 |
+
"--train-data-path",
|
24 |
+
"235289369",
|
25 |
+
"/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document",
|
26 |
+
"--valid-data-path",
|
27 |
+
"235289369",
|
28 |
+
"/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document",
|
29 |
+
"--test-data-path",
|
30 |
+
"235289369",
|
31 |
+
"/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document",
|
32 |
+
"--lr",
|
33 |
+
"2e-5",
|
34 |
+
"--min-lr",
|
35 |
+
"1e-6",
|
36 |
+
"--lr-decay-style",
|
37 |
+
"cosine",
|
38 |
+
"--lr-warmup-iters",
|
39 |
+
"500",
|
40 |
+
"--lr-decay-iters",
|
41 |
+
"20000",
|
42 |
+
"--weight-decay",
|
43 |
+
"0.1",
|
44 |
+
"--grad-clip-norm",
|
45 |
+
"1.0",
|
46 |
+
"--optimizer",
|
47 |
+
"anyprecision",
|
48 |
+
"--adam-beta1",
|
49 |
+
"0.9",
|
50 |
+
"--adam-beta2",
|
51 |
+
"0.95",
|
52 |
+
"--adam-eps",
|
53 |
+
"1e-6",
|
54 |
+
"--save-interval",
|
55 |
+
"200",
|
56 |
+
"--eval-interval",
|
57 |
+
"200",
|
58 |
+
"--eval-iters",
|
59 |
+
"10",
|
60 |
+
"--bf16",
|
61 |
+
"--mixed-precision",
|
62 |
+
"--base-model",
|
63 |
+
"/share/pretrained_lm/google/gemma-2-2b",
|
64 |
+
"--save",
|
65 |
+
"/work/llm_recipes/models/yans-sample-gemma-2-2b",
|
66 |
+
"--load",
|
67 |
+
"/work/llm_recipes/models/yans-sample-gemma-2-2b",
|
68 |
+
"--fsdp-activation-checkpointing",
|
69 |
+
"--sharding-strategy",
|
70 |
+
"FULL_SHARD",
|
71 |
+
"--checkpoint-type",
|
72 |
+
"LOCAL_STATE_DICT",
|
73 |
+
"--save-n-checkpoints",
|
74 |
+
"10",
|
75 |
+
"--hf-upload-retry-limit",
|
76 |
+
"2",
|
77 |
+
"--hf-repo-id",
|
78 |
+
"koichi12/yans-sample-gemma-2-2b",
|
79 |
+
"--wandb-entity",
|
80 |
+
"iwakawa-koichi-q5-tohoku-nlp6723",
|
81 |
+
"--wandb-project",
|
82 |
+
"llm_tutorial",
|
83 |
+
"--wandb-name",
|
84 |
+
"yans-sample-gemma-2-2b_train_2024-08-04-22:22:15"
|
85 |
+
],
|
86 |
+
"state": "running",
|
87 |
+
"program": "/project/examples/finetuning.py",
|
88 |
+
"codePathLocal": "examples/finetuning.py",
|
89 |
+
"codePath": "examples/finetuning.py",
|
90 |
+
"git": {
|
91 |
+
"remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
|
92 |
+
"commit": "0336bd6c20fe25d78eda1d14afa66c1ae2e6d687"
|
93 |
+
},
|
94 |
+
"email": null,
|
95 |
+
"root": "/project",
|
96 |
+
"host": "gpu-koiwa-00",
|
97 |
+
"username": "koiwa",
|
98 |
+
"executable": "/usr/bin/python",
|
99 |
+
"cpu_count": 18,
|
100 |
+
"cpu_count_logical": 18,
|
101 |
+
"cpu_freq": {
|
102 |
+
"current": 2400.044999999999,
|
103 |
+
"min": 0.0,
|
104 |
+
"max": 0.0
|
105 |
+
},
|
106 |
+
"cpu_freq_per_core": [
|
107 |
+
{
|
108 |
+
"current": 2400.045,
|
109 |
+
"min": 0.0,
|
110 |
+
"max": 0.0
|
111 |
+
},
|
112 |
+
{
|
113 |
+
"current": 2400.045,
|
114 |
+
"min": 0.0,
|
115 |
+
"max": 0.0
|
116 |
+
},
|
117 |
+
{
|
118 |
+
"current": 2400.045,
|
119 |
+
"min": 0.0,
|
120 |
+
"max": 0.0
|
121 |
+
},
|
122 |
+
{
|
123 |
+
"current": 2400.045,
|
124 |
+
"min": 0.0,
|
125 |
+
"max": 0.0
|
126 |
+
},
|
127 |
+
{
|
128 |
+
"current": 2400.045,
|
129 |
+
"min": 0.0,
|
130 |
+
"max": 0.0
|
131 |
+
},
|
132 |
+
{
|
133 |
+
"current": 2400.045,
|
134 |
+
"min": 0.0,
|
135 |
+
"max": 0.0
|
136 |
+
},
|
137 |
+
{
|
138 |
+
"current": 2400.045,
|
139 |
+
"min": 0.0,
|
140 |
+
"max": 0.0
|
141 |
+
},
|
142 |
+
{
|
143 |
+
"current": 2400.045,
|
144 |
+
"min": 0.0,
|
145 |
+
"max": 0.0
|
146 |
+
},
|
147 |
+
{
|
148 |
+
"current": 2400.045,
|
149 |
+
"min": 0.0,
|
150 |
+
"max": 0.0
|
151 |
+
},
|
152 |
+
{
|
153 |
+
"current": 2400.045,
|
154 |
+
"min": 0.0,
|
155 |
+
"max": 0.0
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"current": 2400.045,
|
159 |
+
"min": 0.0,
|
160 |
+
"max": 0.0
|
161 |
+
},
|
162 |
+
{
|
163 |
+
"current": 2400.045,
|
164 |
+
"min": 0.0,
|
165 |
+
"max": 0.0
|
166 |
+
},
|
167 |
+
{
|
168 |
+
"current": 2400.045,
|
169 |
+
"min": 0.0,
|
170 |
+
"max": 0.0
|
171 |
+
},
|
172 |
+
{
|
173 |
+
"current": 2400.045,
|
174 |
+
"min": 0.0,
|
175 |
+
"max": 0.0
|
176 |
+
},
|
177 |
+
{
|
178 |
+
"current": 2400.045,
|
179 |
+
"min": 0.0,
|
180 |
+
"max": 0.0
|
181 |
+
},
|
182 |
+
{
|
183 |
+
"current": 2400.045,
|
184 |
+
"min": 0.0,
|
185 |
+
"max": 0.0
|
186 |
+
},
|
187 |
+
{
|
188 |
+
"current": 2400.045,
|
189 |
+
"min": 0.0,
|
190 |
+
"max": 0.0
|
191 |
+
},
|
192 |
+
{
|
193 |
+
"current": 2400.045,
|
194 |
+
"min": 0.0,
|
195 |
+
"max": 0.0
|
196 |
+
}
|
197 |
+
],
|
198 |
+
"disk": {
|
199 |
+
"/": {
|
200 |
+
"total": 0.0625,
|
201 |
+
"used": 1.1444091796875e-05
|
202 |
+
}
|
203 |
+
},
|
204 |
+
"gpu": "NVIDIA A100-SXM4-40GB",
|
205 |
+
"gpu_count": 1,
|
206 |
+
"gpu_devices": [
|
207 |
+
{
|
208 |
+
"name": "NVIDIA A100-SXM4-40GB",
|
209 |
+
"memory_total": 42949672960
|
210 |
+
}
|
211 |
+
],
|
212 |
+
"memory": {
|
213 |
+
"total": 56.48782730102539
|
214 |
+
}
|
215 |
+
}
|
wandb/run-20240804_222226-kh5katc1/files/wandb-summary.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"training/loss": 2.302396535873413, "training/perplexity": 9.99811460655144, "utils/batch_size": 1, "utils/global_batch_size": 320, "utils/seq_len": 4097, "utils/gradient_accumulation_steps": 320, "utils/iteration": 116, "optimizer/lr": 5.4080000000000006e-06, "optimizer/variance_l2": 0.0030219239359304895, "optimizer/variance_sqrt_l2": 0.8405880490942215, "optimizer/momentum_l2": 0.36270596473675665, "optimizer/weight_l2": 1167.8420269882395, "optimizer/variance_l1": 0.70648193359375, "optimizer/variance_sqrt_l1": 19948.0, "optimizer/momentum_l1": 5862.0, "optimizer/weight_l1": 29775872.0, "optimizer/variance_abs_max": 0.001068115234375, "optimizer/variance_sqrt_abs_max": 0.03271484375, "optimizer/momentum_abs_max": 0.0250244140625, "optimizer/weight_abs_max": 12.9375, "stats/1_iteration_time": 304.34721216700564, "stats/tokens_per_sec": 4307.711546510201, "stats/tokens_per_sec_per_gpu": 4307.711546510201, "stats/tflops": 67.17146478693049, "_timestamp": 1722812960.6351748, "_runtime": 35214.36805868149, "_step": 116, "_wandb": {"runtime": 35371}}
|
wandb/run-20240804_222226-kh5katc1/logs/debug-internal.log
ADDED
The diff for this file is too large to render.
See raw diff
|
|
wandb/run-20240804_222226-kh5katc1/logs/debug.log
ADDED
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
2024-08-04 22:22:26,260 INFO MainThread:12896 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
|
2 |
+
2024-08-04 22:22:26,261 INFO MainThread:12896 [wandb_setup.py:_flush():76] Configure stats pid to 12896
|
3 |
+
2024-08-04 22:22:26,261 INFO MainThread:12896 [wandb_setup.py:_flush():76] Loading settings from /singularity_home/.config/wandb/settings
|
4 |
+
2024-08-04 22:22:26,261 INFO MainThread:12896 [wandb_setup.py:_flush():76] Loading settings from /project/wandb/settings
|
5 |
+
2024-08-04 22:22:26,261 INFO MainThread:12896 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'api_key': '***REDACTED***', 'run_notes': 'Train sample'}
|
6 |
+
2024-08-04 22:22:26,261 INFO MainThread:12896 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
|
7 |
+
2024-08-04 22:22:26,261 INFO MainThread:12896 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'examples/finetuning.py', 'program_abspath': '/project/examples/finetuning.py', 'program': '/project/examples/finetuning.py'}
|
8 |
+
2024-08-04 22:22:26,261 INFO MainThread:12896 [wandb_init.py:_log_setup():526] Logging user logs to /project/wandb/run-20240804_222226-kh5katc1/logs/debug.log
|
9 |
+
2024-08-04 22:22:26,261 INFO MainThread:12896 [wandb_init.py:_log_setup():527] Logging internal logs to /project/wandb/run-20240804_222226-kh5katc1/logs/debug-internal.log
|
10 |
+
2024-08-04 22:22:26,261 INFO MainThread:12896 [wandb_init.py:init():566] calling init triggers
|
11 |
+
2024-08-04 22:22:26,261 INFO MainThread:12896 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
|
12 |
+
config: {'sharding_strategy': 'FULL_SHARD', 'checkpoint_type': 'LOCAL_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'low_cpu_fsdp': False, 'no_meta_device': False, 'data_path': None, 'split': '969, 30, 1', 'train_data_path': ['235289369', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document'], 'valid_data_path': ['235289369', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document'], 'test_data_path': ['235289369', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document'], 'data_cache_path': None, 'vocab_size': None, 'vocab_file': None, 'merge_file': None, 'seq_length': 4096, 'num_workers': 2, 'tokenizer_type': 'HFPreTrainedTokenizer', 'tokenizer_model': '/share/pretrained_lm/google/gemma-2-2b', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'retro_return_doc_ids': False, 'short_seq_prob': 0.1, 'vocab_extra_ids': 0, 'seed': 1234, 'use_mpi': False, 'wandb_entity': 'iwakawa-koichi-q5-tohoku-nlp6723', 'wandb_name': 'yans-sample-gemma-2-2b_train_2024-08-04-22:22:15', 'wandb_project': 'llm_tutorial', 'quantization': False, 'use_freeze_layers': False, 'freeze_layers': None, 'bf16': True, 'fp16': False, 'mixed_precision': True, 'param_dtype': None, 'load': '/work/llm_recipes/models/yans-sample-gemma-2-2b', 'save': '/work/llm_recipes/models/yans-sample-gemma-2-2b', 'base_model': '/share/pretrained_lm/google/gemma-2-2b', 'use_better_transformer': False, 'grad_clip_norm': 1.0, 'eval_interval': 200, 'save_interval': 200, 'eval_iters': 10, 'optimizer': 'anyprecision', 'lr': 2e-05, 'lr_decay_style': 'cosine', 'lr_decay_iters': 20000, 'lr_warmup_iters': 500, 'min_lr': 1e-06, 'train_iters': 20000, 'train_samples': None, 'global_batch_size': 320, 'micro_batch_size': 1, 'make_vocab_size_divisible_by': 128, 'sliding_window_size': 4096, 'skip_batch': None, 'no_save_optimizer_state': False, 'continual_pretraining': False, 'instruction_tuning': False, 'direct_preference_optimization': False, 'attention_dropout': 0.1, 'hidden_dropout': 0.1, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-06, 'hf_transformer_model_dir': None, 'instruction_train_data_path': None, 'instruction_valid_data_path': None, 'epoch': None, 'instruction_dataset_size': None, 'save_sampler_state': False, 'label_smoothing': 0.0, 'save_n_checkpoints': 10, 'hf_repo_id': 'koichi12/yans-sample-gemma-2-2b', 'create_public_hf_repo': False, 'upload_all_checkpoints_to_hf': False, 'hf_upload_retry_limit': 2, 'exit_duration_in_mins': None, 'source_key': None, 'target_key': None, 'attn_implementation': 'flash_attention_2', 'efficient_instruction_tuning': False, 'remove_padding_masking': False, 'save_start_iter': None, 'rank': 0, 'world_size': 1, 'padded_vocab_size': 256000, 'gradient_accumulation_steps': 320}
|
13 |
+
2024-08-04 22:22:26,261 INFO MainThread:12896 [wandb_init.py:init():616] starting backend
|
14 |
+
2024-08-04 22:22:26,261 INFO MainThread:12896 [wandb_init.py:init():620] setting up manager
|
15 |
+
2024-08-04 22:22:26,266 INFO MainThread:12896 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
|
16 |
+
2024-08-04 22:22:26,266 INFO MainThread:12896 [wandb_init.py:init():628] backend started and connected
|
17 |
+
2024-08-04 22:22:26,271 INFO MainThread:12896 [wandb_init.py:init():720] updated telemetry
|
18 |
+
2024-08-04 22:22:26,282 INFO MainThread:12896 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
|
19 |
+
2024-08-04 22:22:26,766 INFO MainThread:12896 [wandb_run.py:_on_init():2262] communicating current version
|
20 |
+
2024-08-04 22:22:26,847 INFO MainThread:12896 [wandb_run.py:_on_init():2271] got version response upgrade_message: "wandb version 0.17.5 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
|
21 |
+
|
22 |
+
2024-08-04 22:22:26,847 INFO MainThread:12896 [wandb_init.py:init():804] starting run threads in backend
|
23 |
+
2024-08-04 22:22:26,902 INFO MainThread:12896 [wandb_run.py:_console_start():2241] atexit reg
|
24 |
+
2024-08-04 22:22:26,902 INFO MainThread:12896 [wandb_run.py:_redirect():2096] redirect: wrap_raw
|
25 |
+
2024-08-04 22:22:26,903 INFO MainThread:12896 [wandb_run.py:_redirect():2161] Wrapping output streams.
|
26 |
+
2024-08-04 22:22:26,903 INFO MainThread:12896 [wandb_run.py:_redirect():2186] Redirects installed.
|
27 |
+
2024-08-04 22:22:26,903 INFO MainThread:12896 [wandb_init.py:init():847] run started, returning control to user process
|
28 |
+
2024-08-04 22:22:32,202 INFO MainThread:12896 [wandb_run.py:_config_callback():1343] config_cb None None {'activation_function': 'gelu_pytorch_tanh', 'hidden_size': 2304, 'model_type': 'gemma2', 'max_position_embeddings': 4096, 'num_attention_heads': 8, 'num_hidden_layers': 26, 'model_architecture': 'Gemma2ForCausalLM'}
|
29 |
+
2024-08-04 22:22:32,203 INFO MainThread:12896 [wandb_run.py:_config_callback():1343] config_cb None None {'world_size': 1}
|
30 |
+
2024-08-05 08:12:06,481 WARNING MsgRouterThr:12896 [router.py:message_loop():77] message_loop has been closed
|
wandb/run-20240812_063447-whqmtxyq/files/config.yaml
ADDED
@@ -0,0 +1,335 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
wandb_version: 1
|
2 |
+
|
3 |
+
sharding_strategy:
|
4 |
+
desc: null
|
5 |
+
value: FULL_SHARD
|
6 |
+
checkpoint_type:
|
7 |
+
desc: null
|
8 |
+
value: LOCAL_STATE_DICT
|
9 |
+
fsdp_activation_checkpointing:
|
10 |
+
desc: null
|
11 |
+
value: true
|
12 |
+
fsdp_cpu_offload:
|
13 |
+
desc: null
|
14 |
+
value: false
|
15 |
+
low_cpu_fsdp:
|
16 |
+
desc: null
|
17 |
+
value: false
|
18 |
+
no_meta_device:
|
19 |
+
desc: null
|
20 |
+
value: false
|
21 |
+
data_path:
|
22 |
+
desc: null
|
23 |
+
value: null
|
24 |
+
split:
|
25 |
+
desc: null
|
26 |
+
value: 969, 30, 1
|
27 |
+
train_data_path:
|
28 |
+
desc: null
|
29 |
+
value:
|
30 |
+
- '235289369'
|
31 |
+
- /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document
|
32 |
+
valid_data_path:
|
33 |
+
desc: null
|
34 |
+
value:
|
35 |
+
- '235289369'
|
36 |
+
- /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document
|
37 |
+
test_data_path:
|
38 |
+
desc: null
|
39 |
+
value:
|
40 |
+
- '235289369'
|
41 |
+
- /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document
|
42 |
+
data_cache_path:
|
43 |
+
desc: null
|
44 |
+
value: null
|
45 |
+
vocab_size:
|
46 |
+
desc: null
|
47 |
+
value: null
|
48 |
+
vocab_file:
|
49 |
+
desc: null
|
50 |
+
value: null
|
51 |
+
merge_file:
|
52 |
+
desc: null
|
53 |
+
value: null
|
54 |
+
seq_length:
|
55 |
+
desc: null
|
56 |
+
value: 1021
|
57 |
+
num_workers:
|
58 |
+
desc: null
|
59 |
+
value: 2
|
60 |
+
tokenizer_type:
|
61 |
+
desc: null
|
62 |
+
value: HFPreTrainedTokenizer
|
63 |
+
tokenizer_model:
|
64 |
+
desc: null
|
65 |
+
value: /share/pretrained_lm/google/gemma-2-2b
|
66 |
+
reset_position_ids:
|
67 |
+
desc: null
|
68 |
+
value: false
|
69 |
+
reset_attention_mask:
|
70 |
+
desc: null
|
71 |
+
value: false
|
72 |
+
eod_mask_loss:
|
73 |
+
desc: null
|
74 |
+
value: false
|
75 |
+
retro_return_doc_ids:
|
76 |
+
desc: null
|
77 |
+
value: false
|
78 |
+
short_seq_prob:
|
79 |
+
desc: null
|
80 |
+
value: 0.1
|
81 |
+
vocab_extra_ids:
|
82 |
+
desc: null
|
83 |
+
value: 0
|
84 |
+
seed:
|
85 |
+
desc: null
|
86 |
+
value: 1234
|
87 |
+
use_mpi:
|
88 |
+
desc: null
|
89 |
+
value: false
|
90 |
+
wandb_entity:
|
91 |
+
desc: null
|
92 |
+
value: iwakawa-koichi-q5-tohoku-nlp6723
|
93 |
+
wandb_name:
|
94 |
+
desc: null
|
95 |
+
value: yans-sample-gemma-2-2b_train_2024-08-12-06:34:36
|
96 |
+
wandb_project:
|
97 |
+
desc: null
|
98 |
+
value: llm_tutorial
|
99 |
+
quantization:
|
100 |
+
desc: null
|
101 |
+
value: false
|
102 |
+
use_freeze_layers:
|
103 |
+
desc: null
|
104 |
+
value: false
|
105 |
+
freeze_layers:
|
106 |
+
desc: null
|
107 |
+
value: null
|
108 |
+
bf16:
|
109 |
+
desc: null
|
110 |
+
value: true
|
111 |
+
fp16:
|
112 |
+
desc: null
|
113 |
+
value: false
|
114 |
+
mixed_precision:
|
115 |
+
desc: null
|
116 |
+
value: true
|
117 |
+
param_dtype:
|
118 |
+
desc: null
|
119 |
+
value: null
|
120 |
+
load:
|
121 |
+
desc: null
|
122 |
+
value: /work/llm_recipes/models/yans-sample-gemma-2-2b
|
123 |
+
save:
|
124 |
+
desc: null
|
125 |
+
value: /work/llm_recipes/models/yans-sample-gemma-2-2b
|
126 |
+
base_model:
|
127 |
+
desc: null
|
128 |
+
value: /share/pretrained_lm/google/gemma-2-2b
|
129 |
+
use_better_transformer:
|
130 |
+
desc: null
|
131 |
+
value: false
|
132 |
+
grad_clip_norm:
|
133 |
+
desc: null
|
134 |
+
value: 1.0
|
135 |
+
eval_interval:
|
136 |
+
desc: null
|
137 |
+
value: 3
|
138 |
+
save_interval:
|
139 |
+
desc: null
|
140 |
+
value: 3
|
141 |
+
eval_iters:
|
142 |
+
desc: null
|
143 |
+
value: 10
|
144 |
+
optimizer:
|
145 |
+
desc: null
|
146 |
+
value: anyprecision
|
147 |
+
lr:
|
148 |
+
desc: null
|
149 |
+
value: 2.0e-05
|
150 |
+
lr_decay_style:
|
151 |
+
desc: null
|
152 |
+
value: cosine
|
153 |
+
lr_decay_iters:
|
154 |
+
desc: null
|
155 |
+
value: 20000
|
156 |
+
lr_warmup_iters:
|
157 |
+
desc: null
|
158 |
+
value: 500
|
159 |
+
min_lr:
|
160 |
+
desc: null
|
161 |
+
value: 1.0e-06
|
162 |
+
train_iters:
|
163 |
+
desc: null
|
164 |
+
value: 20000
|
165 |
+
train_samples:
|
166 |
+
desc: null
|
167 |
+
value: null
|
168 |
+
global_batch_size:
|
169 |
+
desc: null
|
170 |
+
value: 320
|
171 |
+
micro_batch_size:
|
172 |
+
desc: null
|
173 |
+
value: 1
|
174 |
+
make_vocab_size_divisible_by:
|
175 |
+
desc: null
|
176 |
+
value: 128
|
177 |
+
sliding_window_size:
|
178 |
+
desc: null
|
179 |
+
value: 4096
|
180 |
+
skip_batch:
|
181 |
+
desc: null
|
182 |
+
value: null
|
183 |
+
no_save_optimizer_state:
|
184 |
+
desc: null
|
185 |
+
value: false
|
186 |
+
continual_pretraining:
|
187 |
+
desc: null
|
188 |
+
value: false
|
189 |
+
instruction_tuning:
|
190 |
+
desc: null
|
191 |
+
value: false
|
192 |
+
direct_preference_optimization:
|
193 |
+
desc: null
|
194 |
+
value: false
|
195 |
+
attention_dropout:
|
196 |
+
desc: null
|
197 |
+
value: 0.1
|
198 |
+
hidden_dropout:
|
199 |
+
desc: null
|
200 |
+
value: 0.1
|
201 |
+
weight_decay:
|
202 |
+
desc: null
|
203 |
+
value: 0.1
|
204 |
+
adam_beta1:
|
205 |
+
desc: null
|
206 |
+
value: 0.9
|
207 |
+
adam_beta2:
|
208 |
+
desc: null
|
209 |
+
value: 0.95
|
210 |
+
adam_eps:
|
211 |
+
desc: null
|
212 |
+
value: 1.0e-06
|
213 |
+
hf_transformer_model_dir:
|
214 |
+
desc: null
|
215 |
+
value: null
|
216 |
+
instruction_train_data_path:
|
217 |
+
desc: null
|
218 |
+
value: null
|
219 |
+
instruction_valid_data_path:
|
220 |
+
desc: null
|
221 |
+
value: null
|
222 |
+
epoch:
|
223 |
+
desc: null
|
224 |
+
value: null
|
225 |
+
instruction_dataset_size:
|
226 |
+
desc: null
|
227 |
+
value: null
|
228 |
+
save_sampler_state:
|
229 |
+
desc: null
|
230 |
+
value: false
|
231 |
+
label_smoothing:
|
232 |
+
desc: null
|
233 |
+
value: 0.0
|
234 |
+
save_n_checkpoints:
|
235 |
+
desc: null
|
236 |
+
value: 10
|
237 |
+
hf_repo_id:
|
238 |
+
desc: null
|
239 |
+
value: koichi12/yans-sample-gemma-2-2b
|
240 |
+
create_public_hf_repo:
|
241 |
+
desc: null
|
242 |
+
value: false
|
243 |
+
upload_all_checkpoints_to_hf:
|
244 |
+
desc: null
|
245 |
+
value: false
|
246 |
+
hf_upload_retry_limit:
|
247 |
+
desc: null
|
248 |
+
value: 2
|
249 |
+
exit_duration_in_mins:
|
250 |
+
desc: null
|
251 |
+
value: null
|
252 |
+
source_key:
|
253 |
+
desc: null
|
254 |
+
value: null
|
255 |
+
target_key:
|
256 |
+
desc: null
|
257 |
+
value: null
|
258 |
+
attn_implementation:
|
259 |
+
desc: null
|
260 |
+
value: flash_attention_2
|
261 |
+
efficient_instruction_tuning:
|
262 |
+
desc: null
|
263 |
+
value: false
|
264 |
+
remove_padding_masking:
|
265 |
+
desc: null
|
266 |
+
value: false
|
267 |
+
save_start_iter:
|
268 |
+
desc: null
|
269 |
+
value: null
|
270 |
+
rank:
|
271 |
+
desc: null
|
272 |
+
value: 0
|
273 |
+
world_size:
|
274 |
+
desc: null
|
275 |
+
value: 1
|
276 |
+
padded_vocab_size:
|
277 |
+
desc: null
|
278 |
+
value: 256000
|
279 |
+
gradient_accumulation_steps:
|
280 |
+
desc: null
|
281 |
+
value: 320
|
282 |
+
_wandb:
|
283 |
+
desc: null
|
284 |
+
value:
|
285 |
+
python_version: 3.10.12
|
286 |
+
cli_version: 0.16.3
|
287 |
+
framework: huggingface
|
288 |
+
huggingface_version: 4.43.3
|
289 |
+
is_jupyter_run: false
|
290 |
+
is_kaggle_kernel: false
|
291 |
+
start_time: 1723412087.358797
|
292 |
+
t:
|
293 |
+
1:
|
294 |
+
- 1
|
295 |
+
- 11
|
296 |
+
- 49
|
297 |
+
- 55
|
298 |
+
- 71
|
299 |
+
2:
|
300 |
+
- 1
|
301 |
+
- 11
|
302 |
+
- 49
|
303 |
+
- 55
|
304 |
+
- 71
|
305 |
+
3:
|
306 |
+
- 13
|
307 |
+
- 16
|
308 |
+
- 23
|
309 |
+
4: 3.10.12
|
310 |
+
5: 0.16.3
|
311 |
+
6: 4.43.3
|
312 |
+
8:
|
313 |
+
- 5
|
314 |
+
13: linux-x86_64
|
315 |
+
model_architecture:
|
316 |
+
desc: null
|
317 |
+
value: Gemma2ForCausalLM
|
318 |
+
activation_function:
|
319 |
+
desc: null
|
320 |
+
value: gelu_pytorch_tanh
|
321 |
+
hidden_size:
|
322 |
+
desc: null
|
323 |
+
value: 2304
|
324 |
+
model_type:
|
325 |
+
desc: null
|
326 |
+
value: gemma2
|
327 |
+
max_position_embeddings:
|
328 |
+
desc: null
|
329 |
+
value: 1021
|
330 |
+
num_attention_heads:
|
331 |
+
desc: null
|
332 |
+
value: 8
|
333 |
+
num_hidden_layers:
|
334 |
+
desc: null
|
335 |
+
value: 26
|
wandb/run-20240812_063447-whqmtxyq/files/output.log
ADDED
@@ -0,0 +1,144 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Created Hugging Face repository with ID koichi12/yans-sample-gemma-2-2b.
|
2 |
+
Clearing GPU cache for all ranks
|
3 |
+
--> Running with torch torch_distributed debug set to detail
|
4 |
+
File not found: /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
|
5 |
+
Unable to read latest iteration from /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
|
6 |
+
File not found: /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
|
7 |
+
Unable to read latest iteration from /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
|
8 |
+
You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
|
9 |
+
|
10 |
+
Loading checkpoint shards: 33%|███▎ | 1/3 [00:01<00:03, 1.92s/it]
|
11 |
+
File not found: /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
|
12 |
+
Unable to read latest iteration from /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
|
13 |
+
No checkpoint found in /work/llm_recipes/models/yans-sample-gemma-2-2b, skipping model loading
|
14 |
+
--> Model /share/pretrained_lm/google/gemma-2-2b
|
15 |
+
--> /share/pretrained_lm/google/gemma-2-2b has 2614.341888 Million params
|
16 |
+
Loading checkpoint shards: 100%|██████████| 3/3 [01:18<00:00, 26.21s/it]
|
17 |
+
/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_init_utils.py:441: UserWarning: FSDP is switching to use `NO_SHARD` instead of ShardingStrategy.FULL_SHARD since the world size is 1.
|
18 |
+
warnings.warn(
|
19 |
+
Let split = None
|
20 |
+
--> applying fsdp activation checkpointing...
|
21 |
+
> datasets target sizes (minimum size):
|
22 |
+
train: 6400000
|
23 |
+
validation: 21334400
|
24 |
+
test: 3200
|
25 |
+
> building train, validation, and test datasets for GPT ...
|
26 |
+
> finished creating GPT datasets ...
|
27 |
+
File not found: /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
|
28 |
+
Unable to read latest iteration from /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
|
29 |
+
No checkpoint found in /work/llm_recipes/models/yans-sample-gemma-2-2b, skipping optimizer loading
|
30 |
+
File not found: /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
|
31 |
+
Unable to read latest iteration from /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
|
32 |
+
model info: FullyShardedDataParallel(
|
33 |
+
(_fsdp_wrapped_module): Gemma2ForCausalLM(
|
34 |
+
(model): Gemma2Model(
|
35 |
+
(embed_tokens): Embedding(256000, 2304, padding_idx=0)
|
36 |
+
(layers): ModuleList(
|
37 |
+
(0-25): 26 x FullyShardedDataParallel(
|
38 |
+
(_fsdp_wrapped_module): CheckpointWrapper(
|
39 |
+
(_checkpoint_wrapped_module): Gemma2DecoderLayer(
|
40 |
+
(self_attn): Gemma2FlashAttention2(
|
41 |
+
(q_proj): Linear(in_features=2304, out_features=2048, bias=False)
|
42 |
+
(k_proj): Linear(in_features=2304, out_features=1024, bias=False)
|
43 |
+
(v_proj): Linear(in_features=2304, out_features=1024, bias=False)
|
44 |
+
(o_proj): Linear(in_features=2048, out_features=2304, bias=False)
|
45 |
+
(rotary_emb): Gemma2RotaryEmbedding()
|
46 |
+
)
|
47 |
+
(mlp): Gemma2MLP(
|
48 |
+
(gate_proj): Linear(in_features=2304, out_features=9216, bias=False)
|
49 |
+
(up_proj): Linear(in_features=2304, out_features=9216, bias=False)
|
50 |
+
(down_proj): Linear(in_features=9216, out_features=2304, bias=False)
|
51 |
+
(act_fn): PytorchGELUTanh()
|
52 |
+
)
|
53 |
+
(input_layernorm): Gemma2RMSNorm()
|
54 |
+
(post_attention_layernorm): Gemma2RMSNorm()
|
55 |
+
(pre_feedforward_layernorm): Gemma2RMSNorm()
|
56 |
+
(post_feedforward_layernorm): Gemma2RMSNorm()
|
57 |
+
)
|
58 |
+
)
|
59 |
+
)
|
60 |
+
)
|
61 |
+
(norm): Gemma2RMSNorm()
|
62 |
+
)
|
63 |
+
(lm_head): Linear(in_features=2304, out_features=256000, bias=False)
|
64 |
+
)
|
65 |
+
)
|
66 |
+
model config: Gemma2Config {
|
67 |
+
"_name_or_path": "/share/pretrained_lm/google/gemma-2-2b",
|
68 |
+
"architectures": [
|
69 |
+
"Gemma2ForCausalLM"
|
70 |
+
],
|
71 |
+
"attention_bias": false,
|
72 |
+
"attention_dropout": 0.0,
|
73 |
+
"attn_logit_softcapping": 50.0,
|
74 |
+
"bos_token_id": 2,
|
75 |
+
"cache_implementation": "hybrid",
|
76 |
+
"eos_token_id": 1,
|
77 |
+
"final_logit_softcapping": 30.0,
|
78 |
+
"head_dim": 256,
|
79 |
+
"hidden_act": "gelu_pytorch_tanh",
|
80 |
+
"hidden_activation": "gelu_pytorch_tanh",
|
81 |
+
"hidden_size": 2304,
|
82 |
+
"initializer_range": 0.02,
|
83 |
+
"intermediate_size": 9216,
|
84 |
+
"label_smoothing": 0.0,
|
85 |
+
"max_position_embeddings": 1021,
|
86 |
+
"model_type": "gemma2",
|
87 |
+
Building a BlendedDataset for a single MegatronDataset
|
88 |
+
Unable to save the indexes because path_to_cache is None
|
89 |
+
Building a BlendedDataset for a single MegatronDataset
|
90 |
+
Unable to save the indexes because path_to_cache is None
|
91 |
+
Building a BlendedDataset for a single MegatronDataset
|
92 |
+
Unable to save the indexes because path_to_cache is None
|
93 |
+
It is strongly recommended to train Gemma2 models with the `eager` attention implementation instead of `flash_attention_2`. Use `eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`.
|
94 |
+
"num_attention_heads": 8,
|
95 |
+
"num_hidden_layers": 26,
|
96 |
+
"num_key_value_heads": 4,
|
97 |
+
"pad_token_id": 0,
|
98 |
+
"query_pre_attn_scalar": 256,
|
99 |
+
"rms_norm_eps": 1e-06,
|
100 |
+
"rope_theta": 10000.0,
|
101 |
+
"sliding_window": 4096,
|
102 |
+
"torch_dtype": "float32",
|
103 |
+
"transformers_version": "4.43.3",
|
104 |
+
"use_cache": false,
|
105 |
+
"vocab_size": 256000
|
106 |
+
}
|
107 |
+
------------------------------------------------------------------
|
108 |
+
iteration: 1 , TFLOPS: 52.56331460229552, Tokens per sec: 3927.6626762354495, Loss: 16.080825805664062
|
109 |
+
------------------------------------------------------------------
|
110 |
+
------------------------------------------------------------------
|
111 |
+
iteration: 2 , TFLOPS: 52.356892101499724, Tokens per sec: 3912.238269345489, Loss: 15.729490280151367
|
112 |
+
------------------------------------------------------------------
|
113 |
+
------------------------------------------------------------------
|
114 |
+
iteration: 3 , TFLOPS: 52.39645244456057, Tokens per sec: 3915.194317381553, Loss: 15.54540729522705
|
115 |
+
------------------------------------------------------------------
|
116 |
+
eval ppl=4948606.5, eval loss=15.414616584777832
|
117 |
+
Saving checkpoint to /work/llm_recipes/models/yans-sample-gemma-2-2b/iter_0000003
|
118 |
+
Saving model state dict to /work/llm_recipes/models/yans-sample-gemma-2-2b/iter_0000003/model.pt
|
119 |
+
/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_state_dict_utils.py:773: UserWarning: When using ``NO_SHARD`` for ``ShardingStrategy``, full_state_dict willbe returned.
|
120 |
+
warnings.warn(
|
121 |
+
/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_state_dict_utils.py:716: UserWarning: When using ``NO_SHARD`` for ``ShardingStrategy``, full_state_dict willbe returned.
|
122 |
+
warnings.warn(
|
123 |
+
Saved model state dict to /work/llm_recipes/models/yans-sample-gemma-2-2b/iter_0000003/model.pt
|
124 |
+
Saving optimizer state dict to /work/llm_recipes/models/yans-sample-gemma-2-2b/iter_0000003/optimizer.pt
|
125 |
+
[rank0]:[2024-08-12 06:40:35,335] torch.distributed.fsdp._debug_utils: [WARNING] FSDP _optim_state_dict() profiling: defaultdict(<class 'float'>, {'preprocessing': 0.008401250000133587, 'preprocessing_with_comm': 0.0009138020004684222, 'state_converting': 5.079375774000255, <Type.ALL: 'all'>: 5.090390497000044})
|
126 |
+
Saved optimizer state dict to /work/llm_recipes/models/yans-sample-gemma-2-2b/iter_0000003/optimizer.pt
|
127 |
+
Traceback (most recent call last):
|
128 |
+
File "/project/examples/finetuning.py", line 13, in <module>
|
129 |
+
main()
|
130 |
+
File "/project/src/llama_recipes/finetuning.py", line 281, in main
|
131 |
+
train(
|
132 |
+
File "/project/src/llama_recipes/utils/train_utils.py", line 175, in train
|
133 |
+
save_checkpoint(
|
134 |
+
File "/project/src/llama_recipes/utils/checkpoint.py", line 168, in save_checkpoint
|
135 |
+
tokenizer.tokenizer.save_pretrained(tokenizer_path)
|
136 |
+
File "/project/lib/transformers/src/transformers/tokenization_utils_base.py", line 2622, in save_pretrained
|
137 |
+
if os.path.isfile(save_directory):
|
138 |
+
File "/usr/lib/python3.10/genericpath.py", line 30, in isfile
|
139 |
+
st = os.stat(path)
|
140 |
+
TypeError: stat: path should be string, bytes, os.PathLike or integer, not NoneType
|
141 |
+
Saving scheduler state dict to /work/llm_recipes/models/yans-sample-gemma-2-2b/iter_0000003/scheduler.pt
|
142 |
+
Saved scheduler state dict to /work/llm_recipes/models/yans-sample-gemma-2-2b/iter_0000003/scheduler.pt
|
143 |
+
Saving RNG states to /work/llm_recipes/models/yans-sample-gemma-2-2b/iter_0000003/rng.pt
|
144 |
+
Saved RNG states to /work/llm_recipes/models/yans-sample-gemma-2-2b/iter_0000003/rng.pt
|
wandb/run-20240812_063447-whqmtxyq/files/requirements.txt
ADDED
@@ -0,0 +1,271 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
absl-py==2.1.0
|
2 |
+
accelerate==0.33.0
|
3 |
+
aiohttp==3.9.1
|
4 |
+
aiosignal==1.3.1
|
5 |
+
annotated-types==0.6.0
|
6 |
+
apex==0.1
|
7 |
+
appdirs==1.4.4
|
8 |
+
argon2-cffi-bindings==21.2.0
|
9 |
+
argon2-cffi==23.1.0
|
10 |
+
asttokens==2.4.1
|
11 |
+
astunparse==1.6.3
|
12 |
+
async-timeout==4.0.3
|
13 |
+
attrs==23.2.0
|
14 |
+
audioread==3.0.1
|
15 |
+
beautifulsoup4==4.12.3
|
16 |
+
bleach==6.1.0
|
17 |
+
blis==0.7.11
|
18 |
+
cachetools==5.3.2
|
19 |
+
catalogue==2.0.10
|
20 |
+
certifi==2024.2.2
|
21 |
+
cffi==1.16.0
|
22 |
+
charset-normalizer==3.3.2
|
23 |
+
click==8.1.7
|
24 |
+
cloudpathlib==0.16.0
|
25 |
+
cloudpickle==3.0.0
|
26 |
+
cmake==3.28.1
|
27 |
+
colorama==0.4.6
|
28 |
+
comm==0.2.1
|
29 |
+
confection==0.1.4
|
30 |
+
contourpy==1.2.0
|
31 |
+
cubinlinker==0.3.0+2.g405ac64
|
32 |
+
cuda-python==12.3.0rc4+9.gdb8c48a.dirty
|
33 |
+
cudf==23.12.0
|
34 |
+
cugraph-dgl==23.12.0
|
35 |
+
cugraph-service-client==23.12.0
|
36 |
+
cugraph-service-server==23.12.0
|
37 |
+
cugraph==23.12.0
|
38 |
+
cuml==23.12.0
|
39 |
+
cupy-cuda12x==12.3.0
|
40 |
+
cycler==0.12.1
|
41 |
+
cymem==2.0.8
|
42 |
+
cython==3.0.8
|
43 |
+
dask-cuda==23.12.0
|
44 |
+
dask-cudf==23.12.0
|
45 |
+
dask==2023.11.0
|
46 |
+
debugpy==1.8.1
|
47 |
+
decorator==5.1.1
|
48 |
+
defusedxml==0.7.1
|
49 |
+
distributed==2023.11.0
|
50 |
+
dm-tree==0.1.8
|
51 |
+
docker-pycreds==0.4.0
|
52 |
+
einops==0.7.0
|
53 |
+
exceptiongroup==1.2.0
|
54 |
+
execnet==2.0.2
|
55 |
+
executing==2.0.1
|
56 |
+
expecttest==0.1.3
|
57 |
+
fastjsonschema==2.19.1
|
58 |
+
fastrlock==0.8.2
|
59 |
+
filelock==3.13.1
|
60 |
+
flash-attn==2.4.2
|
61 |
+
fonttools==4.48.1
|
62 |
+
frozenlist==1.4.1
|
63 |
+
fsspec==2023.12.2
|
64 |
+
gast==0.5.4
|
65 |
+
gitdb==4.0.11
|
66 |
+
gitpython==3.1.43
|
67 |
+
google-auth-oauthlib==0.4.6
|
68 |
+
google-auth==2.27.0
|
69 |
+
graphsurgeon==0.4.6
|
70 |
+
grpcio==1.60.1
|
71 |
+
huggingface-hub==0.24.5
|
72 |
+
hypothesis==5.35.1
|
73 |
+
idna==3.6
|
74 |
+
importlib-metadata==7.0.1
|
75 |
+
iniconfig==2.0.0
|
76 |
+
intel-openmp==2021.4.0
|
77 |
+
ipadic==1.0.0
|
78 |
+
ipykernel==6.29.2
|
79 |
+
ipython-genutils==0.2.0
|
80 |
+
ipython==8.21.0
|
81 |
+
jedi==0.19.1
|
82 |
+
jinja2==3.1.3
|
83 |
+
joblib==1.3.2
|
84 |
+
json5==0.9.14
|
85 |
+
jsonnet==0.19.1
|
86 |
+
jsonschema-specifications==2023.12.1
|
87 |
+
jsonschema==4.21.1
|
88 |
+
jupyter-client==8.6.0
|
89 |
+
jupyter-core==5.7.1
|
90 |
+
jupyter-tensorboard==0.2.0
|
91 |
+
jupyterlab-pygments==0.3.0
|
92 |
+
jupyterlab-server==1.2.0
|
93 |
+
jupyterlab==2.3.2
|
94 |
+
jupytext==1.16.1
|
95 |
+
kiwisolver==1.4.5
|
96 |
+
langcodes==3.3.0
|
97 |
+
lazy-loader==0.3
|
98 |
+
librosa==0.10.1
|
99 |
+
llvmlite==0.40.1
|
100 |
+
locket==1.0.0
|
101 |
+
logzero==1.7.0
|
102 |
+
lxml==5.2.2
|
103 |
+
markdown-it-py==3.0.0
|
104 |
+
markdown==3.5.2
|
105 |
+
markupsafe==2.1.4
|
106 |
+
matplotlib-inline==0.1.6
|
107 |
+
matplotlib==3.8.2
|
108 |
+
mdit-py-plugins==0.4.0
|
109 |
+
mdurl==0.1.2
|
110 |
+
mecab-python3==1.0.6
|
111 |
+
mistune==3.0.2
|
112 |
+
mkl-devel==2021.1.1
|
113 |
+
mkl-include==2021.1.1
|
114 |
+
mkl==2021.1.1
|
115 |
+
mock==5.1.0
|
116 |
+
more-itertools==9.1.0
|
117 |
+
mpmath==1.3.0
|
118 |
+
msgpack==1.0.7
|
119 |
+
multidict==6.0.4
|
120 |
+
murmurhash==1.0.10
|
121 |
+
nbclient==0.9.0
|
122 |
+
nbconvert==7.16.0
|
123 |
+
nbformat==5.9.2
|
124 |
+
nest-asyncio==1.6.0
|
125 |
+
networkx==2.6.3
|
126 |
+
ninja==1.11.1.1
|
127 |
+
nltk==3.8.1
|
128 |
+
notebook==6.4.10
|
129 |
+
numba==0.57.1+1.g1ff679645
|
130 |
+
numpy==1.24.4
|
131 |
+
nvfuser==0.1.4a0+d0bb811
|
132 |
+
nvidia-dali-cuda120==1.34.0
|
133 |
+
nvidia-pyindex==1.0.9
|
134 |
+
nvtx==0.2.5
|
135 |
+
oauthlib==3.2.2
|
136 |
+
onnx==1.15.0rc2
|
137 |
+
opencv==4.7.0
|
138 |
+
optree==0.10.0
|
139 |
+
packaging==23.2
|
140 |
+
pandas==1.5.3
|
141 |
+
pandocfilters==1.5.1
|
142 |
+
parso==0.8.3
|
143 |
+
partd==1.4.1
|
144 |
+
peft==0.11.1
|
145 |
+
pexpect==4.9.0
|
146 |
+
pillow==10.2.0
|
147 |
+
pip==24.0
|
148 |
+
platformdirs==4.2.0
|
149 |
+
pluggy==1.4.0
|
150 |
+
ply==3.11
|
151 |
+
polygraphy==0.49.4
|
152 |
+
pooch==1.8.0
|
153 |
+
portalocker==2.10.1
|
154 |
+
preshed==3.0.9
|
155 |
+
prettytable==3.9.0
|
156 |
+
prometheus-client==0.19.0
|
157 |
+
prompt-toolkit==3.0.43
|
158 |
+
protobuf==4.24.4
|
159 |
+
psutil==5.9.4
|
160 |
+
ptxcompiler==0.8.1+2.g0d406d6
|
161 |
+
ptyprocess==0.7.0
|
162 |
+
pure-eval==0.2.2
|
163 |
+
pyarrow==14.0.1.dev0+gba5374836.d20240125
|
164 |
+
pyasn1-modules==0.3.0
|
165 |
+
pyasn1==0.5.1
|
166 |
+
pybind11-global==2.11.1
|
167 |
+
pybind11==2.11.1
|
168 |
+
pycocotools==2.0+nv0.8.0
|
169 |
+
pycparser==2.21
|
170 |
+
pydantic-core==2.16.2
|
171 |
+
pydantic==2.6.1
|
172 |
+
pygments==2.17.2
|
173 |
+
pylibcugraph==23.12.0
|
174 |
+
pylibcugraphops==23.12.0
|
175 |
+
pylibraft==23.12.0
|
176 |
+
pynvml==11.4.1
|
177 |
+
pyparsing==3.1.1
|
178 |
+
pytest-flakefinder==1.1.0
|
179 |
+
pytest-rerunfailures==13.0
|
180 |
+
pytest-shard==0.1.2
|
181 |
+
pytest-xdist==3.5.0
|
182 |
+
pytest==8.0.0
|
183 |
+
python-dateutil==2.8.2
|
184 |
+
python-dotenv==1.0.0
|
185 |
+
python-hostlist==1.23.0
|
186 |
+
pytorch-quantization==2.1.2
|
187 |
+
pytz==2023.3.post1
|
188 |
+
pyyaml==6.0.1
|
189 |
+
pyzmq==25.1.2
|
190 |
+
raft-dask==23.12.0
|
191 |
+
rapids-dask-dependency==23.12.1
|
192 |
+
referencing==0.33.0
|
193 |
+
regex==2023.12.25
|
194 |
+
requests-oauthlib==1.3.1
|
195 |
+
requests==2.31.0
|
196 |
+
rich==13.7.0
|
197 |
+
rmm==23.12.0
|
198 |
+
rpds-py==0.17.1
|
199 |
+
rsa==4.9
|
200 |
+
sacrebleu==2.4.0
|
201 |
+
safetensors==0.4.3
|
202 |
+
scikit-learn==1.2.0
|
203 |
+
scipy==1.12.0
|
204 |
+
send2trash==1.8.2
|
205 |
+
sentencepiece==0.1.99
|
206 |
+
sentry-sdk==2.12.0
|
207 |
+
setproctitle==1.3.3
|
208 |
+
setuptools==68.2.2
|
209 |
+
six==1.16.0
|
210 |
+
smart-open==6.4.0
|
211 |
+
smmap==5.0.1
|
212 |
+
sortedcontainers==2.4.0
|
213 |
+
soundfile==0.12.1
|
214 |
+
soupsieve==2.5
|
215 |
+
soxr==0.3.7
|
216 |
+
spacy-legacy==3.0.12
|
217 |
+
spacy-loggers==1.0.5
|
218 |
+
spacy==3.7.2
|
219 |
+
sphinx-glpi-theme==0.6
|
220 |
+
srsly==2.4.8
|
221 |
+
stack-data==0.6.3
|
222 |
+
sympy==1.12
|
223 |
+
tabulate==0.9.0
|
224 |
+
tbb==2021.11.0
|
225 |
+
tblib==3.0.0
|
226 |
+
tensorboard-data-server==0.6.1
|
227 |
+
tensorboard-plugin-wit==1.8.1
|
228 |
+
tensorboard==2.9.0
|
229 |
+
tensorrt==8.6.3
|
230 |
+
terminado==0.18.0
|
231 |
+
termplotlib==0.3.9
|
232 |
+
thinc==8.2.3
|
233 |
+
threadpoolctl==3.2.0
|
234 |
+
thriftpy2==0.4.17
|
235 |
+
tinycss2==1.2.1
|
236 |
+
tokenizers==0.19.1
|
237 |
+
toml==0.10.2
|
238 |
+
tomli==2.0.1
|
239 |
+
toolz==0.12.1
|
240 |
+
torch-tensorrt==2.3.0a0
|
241 |
+
torch==2.3.0a0+ebedce2
|
242 |
+
torchdata==0.7.1a0
|
243 |
+
torchtext==0.17.0a0
|
244 |
+
torchvision==0.18.0a0
|
245 |
+
tornado==6.4
|
246 |
+
tqdm==4.66.1
|
247 |
+
traitlets==5.9.0
|
248 |
+
transformer-engine==1.3.0+5b90b7f
|
249 |
+
transformers==4.43.3
|
250 |
+
treelite-runtime==3.9.1
|
251 |
+
treelite==3.9.1
|
252 |
+
triton==2.2.0+e28a256
|
253 |
+
typer==0.9.0
|
254 |
+
types-dataclasses==0.6.6
|
255 |
+
typing-extensions==4.9.0
|
256 |
+
ucx-py==0.35.0
|
257 |
+
uff==0.6.9
|
258 |
+
ujson==5.8.0
|
259 |
+
urllib3==1.26.18
|
260 |
+
wandb==0.16.3
|
261 |
+
wasabi==1.1.2
|
262 |
+
wcwidth==0.2.13
|
263 |
+
weasel==0.3.4
|
264 |
+
webencodings==0.5.1
|
265 |
+
werkzeug==3.0.1
|
266 |
+
wheel==0.42.0
|
267 |
+
xdoctest==1.0.2
|
268 |
+
xgboost==1.7.6
|
269 |
+
yarl==1.9.4
|
270 |
+
zict==3.0.0
|
271 |
+
zipp==3.17.0
|
wandb/run-20240812_063447-whqmtxyq/files/wandb-metadata.json
ADDED
@@ -0,0 +1,215 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
|
3 |
+
"python": "3.10.12",
|
4 |
+
"heartbeatAt": "2024-08-11T21:34:47.942238",
|
5 |
+
"startedAt": "2024-08-11T21:34:47.345817",
|
6 |
+
"docker": null,
|
7 |
+
"cuda": null,
|
8 |
+
"args": [
|
9 |
+
"--seq-length",
|
10 |
+
"1021",
|
11 |
+
"--sliding-window-size",
|
12 |
+
"4096",
|
13 |
+
"--micro-batch-size",
|
14 |
+
"1",
|
15 |
+
"--global-batch-size",
|
16 |
+
"320",
|
17 |
+
"--train-iters",
|
18 |
+
"20000",
|
19 |
+
"--tokenizer-type",
|
20 |
+
"HFPreTrainedTokenizer",
|
21 |
+
"--tokenizer-model",
|
22 |
+
"/share/pretrained_lm/google/gemma-2-2b",
|
23 |
+
"--train-data-path",
|
24 |
+
"235289369",
|
25 |
+
"/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document",
|
26 |
+
"--valid-data-path",
|
27 |
+
"235289369",
|
28 |
+
"/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document",
|
29 |
+
"--test-data-path",
|
30 |
+
"235289369",
|
31 |
+
"/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document",
|
32 |
+
"--lr",
|
33 |
+
"2e-5",
|
34 |
+
"--min-lr",
|
35 |
+
"1e-6",
|
36 |
+
"--lr-decay-style",
|
37 |
+
"cosine",
|
38 |
+
"--lr-warmup-iters",
|
39 |
+
"500",
|
40 |
+
"--lr-decay-iters",
|
41 |
+
"20000",
|
42 |
+
"--weight-decay",
|
43 |
+
"0.1",
|
44 |
+
"--grad-clip-norm",
|
45 |
+
"1.0",
|
46 |
+
"--optimizer",
|
47 |
+
"anyprecision",
|
48 |
+
"--adam-beta1",
|
49 |
+
"0.9",
|
50 |
+
"--adam-beta2",
|
51 |
+
"0.95",
|
52 |
+
"--adam-eps",
|
53 |
+
"1e-6",
|
54 |
+
"--save-interval",
|
55 |
+
"3",
|
56 |
+
"--eval-interval",
|
57 |
+
"3",
|
58 |
+
"--eval-iters",
|
59 |
+
"10",
|
60 |
+
"--bf16",
|
61 |
+
"--mixed-precision",
|
62 |
+
"--base-model",
|
63 |
+
"/share/pretrained_lm/google/gemma-2-2b",
|
64 |
+
"--save",
|
65 |
+
"/work/llm_recipes/models/yans-sample-gemma-2-2b",
|
66 |
+
"--load",
|
67 |
+
"/work/llm_recipes/models/yans-sample-gemma-2-2b",
|
68 |
+
"--fsdp-activation-checkpointing",
|
69 |
+
"--sharding-strategy",
|
70 |
+
"FULL_SHARD",
|
71 |
+
"--checkpoint-type",
|
72 |
+
"LOCAL_STATE_DICT",
|
73 |
+
"--save-n-checkpoints",
|
74 |
+
"10",
|
75 |
+
"--hf-upload-retry-limit",
|
76 |
+
"2",
|
77 |
+
"--hf-repo-id",
|
78 |
+
"koichi12/yans-sample-gemma-2-2b",
|
79 |
+
"--wandb-entity",
|
80 |
+
"iwakawa-koichi-q5-tohoku-nlp6723",
|
81 |
+
"--wandb-project",
|
82 |
+
"llm_tutorial",
|
83 |
+
"--wandb-name",
|
84 |
+
"yans-sample-gemma-2-2b_train_2024-08-12-06:34:36"
|
85 |
+
],
|
86 |
+
"state": "running",
|
87 |
+
"program": "/project/examples/finetuning.py",
|
88 |
+
"codePathLocal": "examples/finetuning.py",
|
89 |
+
"codePath": "examples/finetuning.py",
|
90 |
+
"git": {
|
91 |
+
"remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
|
92 |
+
"commit": "6da01327e78c302bc0cfdb335f3ca297e2a19c8c"
|
93 |
+
},
|
94 |
+
"email": null,
|
95 |
+
"root": "/project",
|
96 |
+
"host": "gpu-koiwa-00",
|
97 |
+
"username": "koiwa",
|
98 |
+
"executable": "/usr/bin/python",
|
99 |
+
"cpu_count": 18,
|
100 |
+
"cpu_count_logical": 18,
|
101 |
+
"cpu_freq": {
|
102 |
+
"current": 2400.0429999999997,
|
103 |
+
"min": 0.0,
|
104 |
+
"max": 0.0
|
105 |
+
},
|
106 |
+
"cpu_freq_per_core": [
|
107 |
+
{
|
108 |
+
"current": 2400.043,
|
109 |
+
"min": 0.0,
|
110 |
+
"max": 0.0
|
111 |
+
},
|
112 |
+
{
|
113 |
+
"current": 2400.043,
|
114 |
+
"min": 0.0,
|
115 |
+
"max": 0.0
|
116 |
+
},
|
117 |
+
{
|
118 |
+
"current": 2400.043,
|
119 |
+
"min": 0.0,
|
120 |
+
"max": 0.0
|
121 |
+
},
|
122 |
+
{
|
123 |
+
"current": 2400.043,
|
124 |
+
"min": 0.0,
|
125 |
+
"max": 0.0
|
126 |
+
},
|
127 |
+
{
|
128 |
+
"current": 2400.043,
|
129 |
+
"min": 0.0,
|
130 |
+
"max": 0.0
|
131 |
+
},
|
132 |
+
{
|
133 |
+
"current": 2400.043,
|
134 |
+
"min": 0.0,
|
135 |
+
"max": 0.0
|
136 |
+
},
|
137 |
+
{
|
138 |
+
"current": 2400.043,
|
139 |
+
"min": 0.0,
|
140 |
+
"max": 0.0
|
141 |
+
},
|
142 |
+
{
|
143 |
+
"current": 2400.043,
|
144 |
+
"min": 0.0,
|
145 |
+
"max": 0.0
|
146 |
+
},
|
147 |
+
{
|
148 |
+
"current": 2400.043,
|
149 |
+
"min": 0.0,
|
150 |
+
"max": 0.0
|
151 |
+
},
|
152 |
+
{
|
153 |
+
"current": 2400.043,
|
154 |
+
"min": 0.0,
|
155 |
+
"max": 0.0
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"current": 2400.043,
|
159 |
+
"min": 0.0,
|
160 |
+
"max": 0.0
|
161 |
+
},
|
162 |
+
{
|
163 |
+
"current": 2400.043,
|
164 |
+
"min": 0.0,
|
165 |
+
"max": 0.0
|
166 |
+
},
|
167 |
+
{
|
168 |
+
"current": 2400.043,
|
169 |
+
"min": 0.0,
|
170 |
+
"max": 0.0
|
171 |
+
},
|
172 |
+
{
|
173 |
+
"current": 2400.043,
|
174 |
+
"min": 0.0,
|
175 |
+
"max": 0.0
|
176 |
+
},
|
177 |
+
{
|
178 |
+
"current": 2400.043,
|
179 |
+
"min": 0.0,
|
180 |
+
"max": 0.0
|
181 |
+
},
|
182 |
+
{
|
183 |
+
"current": 2400.043,
|
184 |
+
"min": 0.0,
|
185 |
+
"max": 0.0
|
186 |
+
},
|
187 |
+
{
|
188 |
+
"current": 2400.043,
|
189 |
+
"min": 0.0,
|
190 |
+
"max": 0.0
|
191 |
+
},
|
192 |
+
{
|
193 |
+
"current": 2400.043,
|
194 |
+
"min": 0.0,
|
195 |
+
"max": 0.0
|
196 |
+
}
|
197 |
+
],
|
198 |
+
"disk": {
|
199 |
+
"/": {
|
200 |
+
"total": 0.0625,
|
201 |
+
"used": 1.1444091796875e-05
|
202 |
+
}
|
203 |
+
},
|
204 |
+
"gpu": "NVIDIA A100-SXM4-40GB",
|
205 |
+
"gpu_count": 1,
|
206 |
+
"gpu_devices": [
|
207 |
+
{
|
208 |
+
"name": "NVIDIA A100-SXM4-40GB",
|
209 |
+
"memory_total": 42949672960
|
210 |
+
}
|
211 |
+
],
|
212 |
+
"memory": {
|
213 |
+
"total": 56.487823486328125
|
214 |
+
}
|
215 |
+
}
|
wandb/run-20240812_063447-whqmtxyq/files/wandb-summary.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"training/loss": 15.54540729522705, "training/perplexity": 5640071.469138662, "utils/batch_size": 1, "utils/global_batch_size": 320, "utils/seq_len": 1022, "utils/gradient_accumulation_steps": 320, "utils/iteration": 3, "optimizer/lr": 1.114e-06, "optimizer/variance_l2": 0.0003583679885385243, "optimizer/variance_sqrt_l2": 0.3777214531330342, "optimizer/momentum_l2": 0.26258589724268894, "optimizer/weight_l2": 1167.8420269882395, "optimizer/variance_l1": 0.14256858825683594, "optimizer/variance_sqrt_l1": 5085.8125, "optimizer/momentum_l1": 3147.65625, "optimizer/weight_l1": 29773824.0, "optimizer/variance_abs_max": 7.009506225585938e-05, "optimizer/variance_sqrt_abs_max": 0.00836181640625, "optimizer/momentum_abs_max": 0.005950927734375, "optimizer/weight_abs_max": 12.9375, "stats/1_iteration_time": 83.53097534600056, "stats/tokens_per_sec": 3915.194317381553, "stats/tokens_per_sec_per_gpu": 3915.194317381553, "stats/tflops": 52.39645244456057, "_timestamp": 1723412421.3049276, "_runtime": 333.9461305141449, "_step": 3, "_wandb": {"runtime": 356}, "evaluation/val_loss": 15.414616584777832, "evaluation/val_ppl": 4948606.5}
|
wandb/run-20240812_063447-whqmtxyq/logs/debug-internal.log
ADDED
@@ -0,0 +1,359 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
2024-08-12 06:34:47,360 INFO StreamThr :13101 [internal.py:wandb_internal():86] W&B internal server running at pid: 13101, started at: 2024-08-12 06:34:47.359620
|
2 |
+
2024-08-12 06:34:47,362 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status
|
3 |
+
2024-08-12 06:34:47,363 INFO WriterThread:13101 [datastore.py:open_for_write():87] open: /project/wandb/run-20240812_063447-whqmtxyq/run-whqmtxyq.wandb
|
4 |
+
2024-08-12 06:34:47,364 DEBUG SenderThread:13101 [sender.py:send():382] send: header
|
5 |
+
2024-08-12 06:34:47,378 DEBUG SenderThread:13101 [sender.py:send():382] send: run
|
6 |
+
2024-08-12 06:34:47,829 INFO SenderThread:13101 [dir_watcher.py:__init__():211] watching files in: /project/wandb/run-20240812_063447-whqmtxyq/files
|
7 |
+
2024-08-12 06:34:47,829 INFO SenderThread:13101 [sender.py:_start_run_threads():1136] run started: whqmtxyq with start time 1723412087.358797
|
8 |
+
2024-08-12 06:34:47,835 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: check_version
|
9 |
+
2024-08-12 06:34:47,835 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: check_version
|
10 |
+
2024-08-12 06:34:47,922 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: run_start
|
11 |
+
2024-08-12 06:34:47,929 DEBUG HandlerThread:13101 [system_info.py:__init__():27] System info init
|
12 |
+
2024-08-12 06:34:47,929 DEBUG HandlerThread:13101 [system_info.py:__init__():42] System info init done
|
13 |
+
2024-08-12 06:34:47,929 INFO HandlerThread:13101 [system_monitor.py:start():194] Starting system monitor
|
14 |
+
2024-08-12 06:34:47,929 INFO SystemMonitor:13101 [system_monitor.py:_start():158] Starting system asset monitoring threads
|
15 |
+
2024-08-12 06:34:47,929 INFO HandlerThread:13101 [system_monitor.py:probe():214] Collecting system info
|
16 |
+
2024-08-12 06:34:47,930 INFO SystemMonitor:13101 [interfaces.py:start():190] Started cpu monitoring
|
17 |
+
2024-08-12 06:34:47,930 INFO SystemMonitor:13101 [interfaces.py:start():190] Started disk monitoring
|
18 |
+
2024-08-12 06:34:47,930 INFO SystemMonitor:13101 [interfaces.py:start():190] Started gpu monitoring
|
19 |
+
2024-08-12 06:34:47,931 INFO SystemMonitor:13101 [interfaces.py:start():190] Started memory monitoring
|
20 |
+
2024-08-12 06:34:47,932 INFO SystemMonitor:13101 [interfaces.py:start():190] Started network monitoring
|
21 |
+
2024-08-12 06:34:47,942 DEBUG HandlerThread:13101 [system_info.py:probe():151] Probing system
|
22 |
+
2024-08-12 06:34:47,944 DEBUG HandlerThread:13101 [system_info.py:_probe_git():136] Probing git
|
23 |
+
2024-08-12 06:34:47,957 DEBUG HandlerThread:13101 [system_info.py:_probe_git():144] Probing git done
|
24 |
+
2024-08-12 06:34:47,957 DEBUG HandlerThread:13101 [system_info.py:probe():199] Probing system done
|
25 |
+
2024-08-12 06:34:47,957 DEBUG HandlerThread:13101 [system_monitor.py:probe():223] {'os': 'Linux-5.15.0-91-generic-x86_64-with-glibc2.35', 'python': '3.10.12', 'heartbeatAt': '2024-08-11T21:34:47.942238', 'startedAt': '2024-08-11T21:34:47.345817', 'docker': None, 'cuda': None, 'args': ('--seq-length', '1021', '--sliding-window-size', '4096', '--micro-batch-size', '1', '--global-batch-size', '320', '--train-iters', '20000', '--tokenizer-type', 'HFPreTrainedTokenizer', '--tokenizer-model', '/share/pretrained_lm/google/gemma-2-2b', '--train-data-path', '235289369', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document', '--valid-data-path', '235289369', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document', '--test-data-path', '235289369', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document', '--lr', '2e-5', '--min-lr', '1e-6', '--lr-decay-style', 'cosine', '--lr-warmup-iters', '500', '--lr-decay-iters', '20000', '--weight-decay', '0.1', '--grad-clip-norm', '1.0', '--optimizer', 'anyprecision', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-6', '--save-interval', '3', '--eval-interval', '3', '--eval-iters', '10', '--bf16', '--mixed-precision', '--base-model', '/share/pretrained_lm/google/gemma-2-2b', '--save', '/work/llm_recipes/models/yans-sample-gemma-2-2b', '--load', '/work/llm_recipes/models/yans-sample-gemma-2-2b', '--fsdp-activation-checkpointing', '--sharding-strategy', 'FULL_SHARD', '--checkpoint-type', 'LOCAL_STATE_DICT', '--save-n-checkpoints', '10', '--hf-upload-retry-limit', '2', '--hf-repo-id', 'koichi12/yans-sample-gemma-2-2b', '--wandb-entity', 'iwakawa-koichi-q5-tohoku-nlp6723', '--wandb-project', 'llm_tutorial', '--wandb-name', 'yans-sample-gemma-2-2b_train_2024-08-12-06:34:36'), 'state': 'running', 'program': '/project/examples/finetuning.py', 'codePathLocal': 'examples/finetuning.py', 'codePath': 'examples/finetuning.py', 'git': {'remote': 'https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git', 'commit': '6da01327e78c302bc0cfdb335f3ca297e2a19c8c'}, 'email': None, 'root': '/project', 'host': 'gpu-koiwa-00', 'username': 'koiwa', 'executable': '/usr/bin/python', 'cpu_count': 18, 'cpu_count_logical': 18, 'cpu_freq': {'current': 2400.0429999999997, 'min': 0.0, 'max': 0.0}, 'cpu_freq_per_core': [{'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}], 'disk': {'/': {'total': 0.0625, 'used': 1.1444091796875e-05}}, 'gpu': 'NVIDIA A100-SXM4-40GB', 'gpu_count': 1, 'gpu_devices': [{'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}], 'memory': {'total': 56.487823486328125}}
|
26 |
+
2024-08-12 06:34:47,957 INFO HandlerThread:13101 [system_monitor.py:probe():224] Finished collecting system info
|
27 |
+
2024-08-12 06:34:47,957 INFO HandlerThread:13101 [system_monitor.py:probe():227] Publishing system info
|
28 |
+
2024-08-12 06:34:47,958 INFO HandlerThread:13101 [system_monitor.py:probe():229] Finished publishing system info
|
29 |
+
2024-08-12 06:34:47,964 DEBUG SenderThread:13101 [sender.py:send():382] send: files
|
30 |
+
2024-08-12 06:34:47,964 INFO SenderThread:13101 [sender.py:_save_file():1403] saving file wandb-metadata.json with policy now
|
31 |
+
2024-08-12 06:34:47,974 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: python_packages
|
32 |
+
2024-08-12 06:34:47,974 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: python_packages
|
33 |
+
2024-08-12 06:34:47,975 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: stop_status
|
34 |
+
2024-08-12 06:34:47,975 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: internal_messages
|
35 |
+
2024-08-12 06:34:47,976 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: stop_status
|
36 |
+
2024-08-12 06:34:48,281 DEBUG SenderThread:13101 [sender.py:send():382] send: telemetry
|
37 |
+
2024-08-12 06:34:48,615 INFO wandb-upload_0:13101 [upload_job.py:push():131] Uploaded file /tmp/tmpxyme_qqmwandb/cck49p4b-wandb-metadata.json
|
38 |
+
2024-08-12 06:34:48,831 INFO Thread-12 :13101 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240812_063447-whqmtxyq/files/output.log
|
39 |
+
2024-08-12 06:34:48,831 INFO Thread-12 :13101 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240812_063447-whqmtxyq/files/requirements.txt
|
40 |
+
2024-08-12 06:34:48,832 INFO Thread-12 :13101 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240812_063447-whqmtxyq/files/wandb-metadata.json
|
41 |
+
2024-08-12 06:34:50,832 INFO Thread-12 :13101 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_063447-whqmtxyq/files/output.log
|
42 |
+
2024-08-12 06:34:52,543 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
|
43 |
+
2024-08-12 06:34:52,833 INFO Thread-12 :13101 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_063447-whqmtxyq/files/output.log
|
44 |
+
2024-08-12 06:34:57,543 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
|
45 |
+
2024-08-12 06:35:02,544 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
|
46 |
+
2024-08-12 06:35:02,974 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: internal_messages
|
47 |
+
2024-08-12 06:35:02,974 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: stop_status
|
48 |
+
2024-08-12 06:35:02,975 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: stop_status
|
49 |
+
2024-08-12 06:35:08,234 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
|
50 |
+
2024-08-12 06:35:13,235 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
|
51 |
+
2024-08-12 06:35:17,973 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: stop_status
|
52 |
+
2024-08-12 06:35:17,974 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: stop_status
|
53 |
+
2024-08-12 06:35:18,016 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: internal_messages
|
54 |
+
2024-08-12 06:35:18,247 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
|
55 |
+
2024-08-12 06:35:18,849 INFO Thread-12 :13101 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_063447-whqmtxyq/files/config.yaml
|
56 |
+
2024-08-12 06:35:23,452 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
|
57 |
+
2024-08-12 06:35:28,453 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
|
58 |
+
2024-08-12 06:35:32,974 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: stop_status
|
59 |
+
2024-08-12 06:35:32,974 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: stop_status
|
60 |
+
2024-08-12 06:35:33,016 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: internal_messages
|
61 |
+
2024-08-12 06:35:34,202 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
|
62 |
+
2024-08-12 06:35:39,202 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
|
63 |
+
2024-08-12 06:35:44,203 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
|
64 |
+
2024-08-12 06:35:47,932 DEBUG SystemMonitor:13101 [system_monitor.py:_start():172] Starting system metrics aggregation loop
|
65 |
+
2024-08-12 06:35:47,934 DEBUG SenderThread:13101 [sender.py:send():382] send: stats
|
66 |
+
2024-08-12 06:35:47,973 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: stop_status
|
67 |
+
2024-08-12 06:35:47,974 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: stop_status
|
68 |
+
2024-08-12 06:35:48,016 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: internal_messages
|
69 |
+
2024-08-12 06:35:49,237 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
|
70 |
+
2024-08-12 06:35:54,238 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
|
71 |
+
2024-08-12 06:35:59,239 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
|
72 |
+
2024-08-12 06:36:02,974 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: stop_status
|
73 |
+
2024-08-12 06:36:02,974 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: stop_status
|
74 |
+
2024-08-12 06:36:03,016 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: internal_messages
|
75 |
+
2024-08-12 06:36:05,234 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
|
76 |
+
2024-08-12 06:36:08,884 INFO Thread-12 :13101 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_063447-whqmtxyq/files/output.log
|
77 |
+
2024-08-12 06:36:09,915 DEBUG SenderThread:13101 [sender.py:send():382] send: config
|
78 |
+
2024-08-12 06:36:09,915 DEBUG SenderThread:13101 [sender.py:send():382] send: config
|
79 |
+
2024-08-12 06:36:10,885 INFO Thread-12 :13101 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_063447-whqmtxyq/files/output.log
|
80 |
+
2024-08-12 06:36:11,119 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
|
81 |
+
2024-08-12 06:36:16,120 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
|
82 |
+
2024-08-12 06:36:17,935 DEBUG SenderThread:13101 [sender.py:send():382] send: stats
|
83 |
+
2024-08-12 06:36:17,974 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: stop_status
|
84 |
+
2024-08-12 06:36:17,974 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: stop_status
|
85 |
+
2024-08-12 06:36:17,975 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: internal_messages
|
86 |
+
2024-08-12 06:36:21,237 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
|
87 |
+
2024-08-12 06:36:21,893 INFO Thread-12 :13101 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_063447-whqmtxyq/files/config.yaml
|
88 |
+
2024-08-12 06:36:26,451 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
|
89 |
+
2024-08-12 06:36:31,452 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
|
90 |
+
2024-08-12 06:36:32,974 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: stop_status
|
91 |
+
2024-08-12 06:36:32,974 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: stop_status
|
92 |
+
2024-08-12 06:36:33,016 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: internal_messages
|
93 |
+
2024-08-12 06:36:37,174 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
|
94 |
+
2024-08-12 06:36:42,174 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
|
95 |
+
2024-08-12 06:36:47,175 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
|
96 |
+
2024-08-12 06:36:47,936 DEBUG SenderThread:13101 [sender.py:send():382] send: stats
|
97 |
+
2024-08-12 06:36:47,975 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: stop_status
|
98 |
+
2024-08-12 06:36:47,975 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: stop_status
|
99 |
+
2024-08-12 06:36:48,016 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: internal_messages
|
100 |
+
2024-08-12 06:36:52,199 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
|
101 |
+
2024-08-12 06:36:57,199 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
|
102 |
+
2024-08-12 06:37:02,200 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
|
103 |
+
2024-08-12 06:37:02,975 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: stop_status
|
104 |
+
2024-08-12 06:37:02,975 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: stop_status
|
105 |
+
2024-08-12 06:37:03,016 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: internal_messages
|
106 |
+
2024-08-12 06:37:08,177 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
|
107 |
+
2024-08-12 06:37:13,178 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
|
108 |
+
2024-08-12 06:37:17,937 DEBUG SenderThread:13101 [sender.py:send():382] send: stats
|
109 |
+
2024-08-12 06:37:17,975 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: stop_status
|
110 |
+
2024-08-12 06:37:17,975 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: stop_status
|
111 |
+
2024-08-12 06:37:18,016 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: internal_messages
|
112 |
+
2024-08-12 06:37:18,239 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
|
113 |
+
2024-08-12 06:37:23,240 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
|
114 |
+
2024-08-12 06:37:28,240 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
|
115 |
+
2024-08-12 06:37:32,975 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: stop_status
|
116 |
+
2024-08-12 06:37:32,975 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: stop_status
|
117 |
+
2024-08-12 06:37:33,016 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: internal_messages
|
118 |
+
2024-08-12 06:37:33,471 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: partial_history
|
119 |
+
2024-08-12 06:37:33,513 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
|
120 |
+
2024-08-12 06:37:34,938 INFO Thread-12 :13101 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_063447-whqmtxyq/files/output.log
|
121 |
+
2024-08-12 06:37:38,514 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
|
122 |
+
2024-08-12 06:37:43,515 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
|
123 |
+
2024-08-12 06:37:47,938 DEBUG SenderThread:13101 [sender.py:send():382] send: stats
|
124 |
+
2024-08-12 06:37:47,975 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: stop_status
|
125 |
+
2024-08-12 06:37:47,975 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: stop_status
|
126 |
+
2024-08-12 06:37:47,977 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: internal_messages
|
127 |
+
2024-08-12 06:37:49,236 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
|
128 |
+
2024-08-12 06:37:54,236 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
|
129 |
+
2024-08-12 06:37:59,237 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
|
130 |
+
2024-08-12 06:38:02,975 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: stop_status
|
131 |
+
2024-08-12 06:38:02,975 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: stop_status
|
132 |
+
2024-08-12 06:38:03,016 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: internal_messages
|
133 |
+
2024-08-12 06:38:05,173 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
|
134 |
+
2024-08-12 06:38:10,174 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
|
135 |
+
2024-08-12 06:38:15,175 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
|
136 |
+
2024-08-12 06:38:17,940 DEBUG SenderThread:13101 [sender.py:send():382] send: stats
|
137 |
+
2024-08-12 06:38:17,975 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: stop_status
|
138 |
+
2024-08-12 06:38:17,975 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: stop_status
|
139 |
+
2024-08-12 06:38:18,016 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: internal_messages
|
140 |
+
2024-08-12 06:38:20,189 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
|
141 |
+
2024-08-12 06:38:25,189 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
|
142 |
+
2024-08-12 06:38:30,190 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
|
143 |
+
2024-08-12 06:38:32,975 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: stop_status
|
144 |
+
2024-08-12 06:38:32,976 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: stop_status
|
145 |
+
2024-08-12 06:38:33,016 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: internal_messages
|
146 |
+
2024-08-12 06:38:36,181 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
|
147 |
+
2024-08-12 06:38:41,181 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
|
148 |
+
2024-08-12 06:38:46,182 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
|
149 |
+
2024-08-12 06:38:47,941 DEBUG SenderThread:13101 [sender.py:send():382] send: stats
|
150 |
+
2024-08-12 06:38:47,975 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: stop_status
|
151 |
+
2024-08-12 06:38:47,976 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: stop_status
|
152 |
+
2024-08-12 06:38:48,016 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: internal_messages
|
153 |
+
2024-08-12 06:38:52,158 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
|
154 |
+
2024-08-12 06:38:57,068 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: partial_history
|
155 |
+
2024-08-12 06:38:57,070 DEBUG SenderThread:13101 [sender.py:send():382] send: history
|
156 |
+
2024-08-12 06:38:57,071 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: summary_record
|
157 |
+
2024-08-12 06:38:57,072 INFO SenderThread:13101 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
158 |
+
2024-08-12 06:38:57,991 INFO Thread-12 :13101 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240812_063447-whqmtxyq/files/wandb-summary.json
|
159 |
+
2024-08-12 06:38:58,109 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
|
160 |
+
2024-08-12 06:38:58,991 INFO Thread-12 :13101 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_063447-whqmtxyq/files/output.log
|
161 |
+
2024-08-12 06:39:02,976 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: stop_status
|
162 |
+
2024-08-12 06:39:02,977 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: internal_messages
|
163 |
+
2024-08-12 06:39:02,977 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: stop_status
|
164 |
+
2024-08-12 06:39:03,220 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
|
165 |
+
2024-08-12 06:39:08,220 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
|
166 |
+
2024-08-12 06:39:13,221 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
|
167 |
+
2024-08-12 06:39:17,942 DEBUG SenderThread:13101 [sender.py:send():382] send: stats
|
168 |
+
2024-08-12 06:39:17,976 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: stop_status
|
169 |
+
2024-08-12 06:39:17,977 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: stop_status
|
170 |
+
2024-08-12 06:39:18,020 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: internal_messages
|
171 |
+
2024-08-12 06:39:19,166 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
|
172 |
+
2024-08-12 06:39:24,167 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
|
173 |
+
2024-08-12 06:39:29,167 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
|
174 |
+
2024-08-12 06:39:32,976 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: stop_status
|
175 |
+
2024-08-12 06:39:32,976 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: stop_status
|
176 |
+
2024-08-12 06:39:33,016 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: internal_messages
|
177 |
+
2024-08-12 06:39:34,262 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
|
178 |
+
2024-08-12 06:39:39,263 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
|
179 |
+
2024-08-12 06:39:44,264 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
|
180 |
+
2024-08-12 06:39:47,943 DEBUG SenderThread:13101 [sender.py:send():382] send: stats
|
181 |
+
2024-08-12 06:39:47,976 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: stop_status
|
182 |
+
2024-08-12 06:39:47,976 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: stop_status
|
183 |
+
2024-08-12 06:39:48,016 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: internal_messages
|
184 |
+
2024-08-12 06:39:50,213 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
|
185 |
+
2024-08-12 06:39:55,214 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
|
186 |
+
2024-08-12 06:40:00,215 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
|
187 |
+
2024-08-12 06:40:02,976 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: stop_status
|
188 |
+
2024-08-12 06:40:02,977 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: stop_status
|
189 |
+
2024-08-12 06:40:03,016 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: internal_messages
|
190 |
+
2024-08-12 06:40:05,253 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
|
191 |
+
2024-08-12 06:40:10,254 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
|
192 |
+
2024-08-12 06:40:15,254 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
|
193 |
+
2024-08-12 06:40:17,944 DEBUG SenderThread:13101 [sender.py:send():382] send: stats
|
194 |
+
2024-08-12 06:40:17,976 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: stop_status
|
195 |
+
2024-08-12 06:40:17,977 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: stop_status
|
196 |
+
2024-08-12 06:40:18,016 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: internal_messages
|
197 |
+
2024-08-12 06:40:20,601 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: partial_history
|
198 |
+
2024-08-12 06:40:20,603 DEBUG SenderThread:13101 [sender.py:send():382] send: history
|
199 |
+
2024-08-12 06:40:20,604 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: summary_record
|
200 |
+
2024-08-12 06:40:20,604 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
|
201 |
+
2024-08-12 06:40:20,605 INFO SenderThread:13101 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
202 |
+
2024-08-12 06:40:21,044 INFO Thread-12 :13101 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_063447-whqmtxyq/files/output.log
|
203 |
+
2024-08-12 06:40:21,045 INFO Thread-12 :13101 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_063447-whqmtxyq/files/wandb-summary.json
|
204 |
+
2024-08-12 06:40:21,305 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: partial_history
|
205 |
+
2024-08-12 06:40:23,046 INFO Thread-12 :13101 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_063447-whqmtxyq/files/output.log
|
206 |
+
2024-08-12 06:40:26,337 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
|
207 |
+
2024-08-12 06:40:31,051 INFO Thread-12 :13101 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_063447-whqmtxyq/files/output.log
|
208 |
+
2024-08-12 06:40:32,226 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
|
209 |
+
2024-08-12 06:40:32,977 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: stop_status
|
210 |
+
2024-08-12 06:40:32,977 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: stop_status
|
211 |
+
2024-08-12 06:40:32,979 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: internal_messages
|
212 |
+
2024-08-12 06:40:37,055 INFO Thread-12 :13101 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_063447-whqmtxyq/files/output.log
|
213 |
+
2024-08-12 06:40:37,381 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
|
214 |
+
2024-08-12 06:40:42,382 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
|
215 |
+
2024-08-12 06:40:44,855 DEBUG SenderThread:13101 [sender.py:send():382] send: exit
|
216 |
+
2024-08-12 06:40:44,856 INFO SenderThread:13101 [sender.py:send_exit():589] handling exit code: 1
|
217 |
+
2024-08-12 06:40:44,856 INFO SenderThread:13101 [sender.py:send_exit():591] handling runtime: 356
|
218 |
+
2024-08-12 06:40:44,857 INFO SenderThread:13101 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
219 |
+
2024-08-12 06:40:44,857 INFO SenderThread:13101 [sender.py:send_exit():597] send defer
|
220 |
+
2024-08-12 06:40:44,857 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: defer
|
221 |
+
2024-08-12 06:40:44,857 INFO HandlerThread:13101 [handler.py:handle_request_defer():172] handle defer: 0
|
222 |
+
2024-08-12 06:40:44,857 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: defer
|
223 |
+
2024-08-12 06:40:44,857 INFO SenderThread:13101 [sender.py:send_request_defer():613] handle sender defer: 0
|
224 |
+
2024-08-12 06:40:44,857 INFO SenderThread:13101 [sender.py:transition_state():617] send defer: 1
|
225 |
+
2024-08-12 06:40:44,858 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: defer
|
226 |
+
2024-08-12 06:40:44,858 INFO HandlerThread:13101 [handler.py:handle_request_defer():172] handle defer: 1
|
227 |
+
2024-08-12 06:40:44,858 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: defer
|
228 |
+
2024-08-12 06:40:44,858 INFO SenderThread:13101 [sender.py:send_request_defer():613] handle sender defer: 1
|
229 |
+
2024-08-12 06:40:44,858 INFO SenderThread:13101 [sender.py:transition_state():617] send defer: 2
|
230 |
+
2024-08-12 06:40:44,858 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: defer
|
231 |
+
2024-08-12 06:40:44,858 INFO HandlerThread:13101 [handler.py:handle_request_defer():172] handle defer: 2
|
232 |
+
2024-08-12 06:40:44,858 INFO HandlerThread:13101 [system_monitor.py:finish():203] Stopping system monitor
|
233 |
+
2024-08-12 06:40:44,858 DEBUG SystemMonitor:13101 [system_monitor.py:_start():179] Finished system metrics aggregation loop
|
234 |
+
2024-08-12 06:40:44,858 INFO HandlerThread:13101 [interfaces.py:finish():202] Joined cpu monitor
|
235 |
+
2024-08-12 06:40:44,859 DEBUG SystemMonitor:13101 [system_monitor.py:_start():183] Publishing last batch of metrics
|
236 |
+
2024-08-12 06:40:44,859 INFO HandlerThread:13101 [interfaces.py:finish():202] Joined disk monitor
|
237 |
+
2024-08-12 06:40:44,893 INFO HandlerThread:13101 [interfaces.py:finish():202] Joined gpu monitor
|
238 |
+
2024-08-12 06:40:44,893 INFO HandlerThread:13101 [interfaces.py:finish():202] Joined memory monitor
|
239 |
+
2024-08-12 06:40:44,893 INFO HandlerThread:13101 [interfaces.py:finish():202] Joined network monitor
|
240 |
+
2024-08-12 06:40:44,894 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: defer
|
241 |
+
2024-08-12 06:40:44,894 INFO SenderThread:13101 [sender.py:send_request_defer():613] handle sender defer: 2
|
242 |
+
2024-08-12 06:40:44,894 INFO SenderThread:13101 [sender.py:transition_state():617] send defer: 3
|
243 |
+
2024-08-12 06:40:44,894 DEBUG SenderThread:13101 [sender.py:send():382] send: stats
|
244 |
+
2024-08-12 06:40:44,894 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: defer
|
245 |
+
2024-08-12 06:40:44,894 INFO HandlerThread:13101 [handler.py:handle_request_defer():172] handle defer: 3
|
246 |
+
2024-08-12 06:40:44,896 DEBUG SenderThread:13101 [sender.py:send():382] send: history
|
247 |
+
2024-08-12 06:40:44,896 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: summary_record
|
248 |
+
2024-08-12 06:40:44,897 INFO SenderThread:13101 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
249 |
+
2024-08-12 06:40:44,898 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: defer
|
250 |
+
2024-08-12 06:40:44,898 INFO SenderThread:13101 [sender.py:send_request_defer():613] handle sender defer: 3
|
251 |
+
2024-08-12 06:40:44,898 INFO SenderThread:13101 [sender.py:transition_state():617] send defer: 4
|
252 |
+
2024-08-12 06:40:44,898 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: defer
|
253 |
+
2024-08-12 06:40:44,898 INFO HandlerThread:13101 [handler.py:handle_request_defer():172] handle defer: 4
|
254 |
+
2024-08-12 06:40:44,898 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: defer
|
255 |
+
2024-08-12 06:40:44,898 INFO SenderThread:13101 [sender.py:send_request_defer():613] handle sender defer: 4
|
256 |
+
2024-08-12 06:40:44,898 INFO SenderThread:13101 [sender.py:transition_state():617] send defer: 5
|
257 |
+
2024-08-12 06:40:44,898 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: defer
|
258 |
+
2024-08-12 06:40:44,898 INFO HandlerThread:13101 [handler.py:handle_request_defer():172] handle defer: 5
|
259 |
+
2024-08-12 06:40:44,899 DEBUG SenderThread:13101 [sender.py:send():382] send: summary
|
260 |
+
2024-08-12 06:40:44,900 INFO SenderThread:13101 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
261 |
+
2024-08-12 06:40:44,900 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: defer
|
262 |
+
2024-08-12 06:40:44,900 INFO SenderThread:13101 [sender.py:send_request_defer():613] handle sender defer: 5
|
263 |
+
2024-08-12 06:40:44,900 INFO SenderThread:13101 [sender.py:transition_state():617] send defer: 6
|
264 |
+
2024-08-12 06:40:44,900 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: defer
|
265 |
+
2024-08-12 06:40:44,900 INFO HandlerThread:13101 [handler.py:handle_request_defer():172] handle defer: 6
|
266 |
+
2024-08-12 06:40:44,900 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: defer
|
267 |
+
2024-08-12 06:40:44,900 INFO SenderThread:13101 [sender.py:send_request_defer():613] handle sender defer: 6
|
268 |
+
2024-08-12 06:40:44,901 INFO SenderThread:13101 [sender.py:transition_state():617] send defer: 7
|
269 |
+
2024-08-12 06:40:44,901 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
|
270 |
+
2024-08-12 06:40:44,901 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: defer
|
271 |
+
2024-08-12 06:40:44,901 INFO HandlerThread:13101 [handler.py:handle_request_defer():172] handle defer: 7
|
272 |
+
2024-08-12 06:40:44,901 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: defer
|
273 |
+
2024-08-12 06:40:44,901 INFO SenderThread:13101 [sender.py:send_request_defer():613] handle sender defer: 7
|
274 |
+
2024-08-12 06:40:45,060 INFO Thread-12 :13101 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_063447-whqmtxyq/files/output.log
|
275 |
+
2024-08-12 06:40:45,061 INFO Thread-12 :13101 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_063447-whqmtxyq/files/wandb-summary.json
|
276 |
+
2024-08-12 06:40:45,855 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: poll_exit
|
277 |
+
2024-08-12 06:40:47,007 INFO SenderThread:13101 [sender.py:transition_state():617] send defer: 8
|
278 |
+
2024-08-12 06:40:47,007 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: poll_exit
|
279 |
+
2024-08-12 06:40:47,007 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: defer
|
280 |
+
2024-08-12 06:40:47,008 INFO HandlerThread:13101 [handler.py:handle_request_defer():172] handle defer: 8
|
281 |
+
2024-08-12 06:40:47,008 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: defer
|
282 |
+
2024-08-12 06:40:47,008 INFO SenderThread:13101 [sender.py:send_request_defer():613] handle sender defer: 8
|
283 |
+
2024-08-12 06:40:47,008 INFO SenderThread:13101 [job_builder.py:build():296] Attempting to build job artifact
|
284 |
+
2024-08-12 06:40:47,009 INFO SenderThread:13101 [job_builder.py:_get_source_type():426] is repo sourced job
|
285 |
+
2024-08-12 06:40:47,023 INFO SenderThread:13101 [job_builder.py:build():402] adding wandb-job metadata file
|
286 |
+
2024-08-12 06:40:47,031 INFO SenderThread:13101 [sender.py:transition_state():617] send defer: 9
|
287 |
+
2024-08-12 06:40:47,032 DEBUG SenderThread:13101 [sender.py:send():382] send: artifact
|
288 |
+
2024-08-12 06:40:47,032 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: defer
|
289 |
+
2024-08-12 06:40:47,033 INFO HandlerThread:13101 [handler.py:handle_request_defer():172] handle defer: 9
|
290 |
+
2024-08-12 06:40:47,062 INFO Thread-12 :13101 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_063447-whqmtxyq/files/output.log
|
291 |
+
2024-08-12 06:40:47,856 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: poll_exit
|
292 |
+
2024-08-12 06:40:47,912 INFO SenderThread:13101 [sender.py:send_artifact():1494] sent artifact job-https___github.com_cl-tohoku_llm-recipes-failab-m1-yans.git_examples_finetuning.py - {'id': 'QXJ0aWZhY3Q6MTE0MDA5NDY1MQ==', 'state': 'COMMITTED', 'artifactSequence': {'id': 'QXJ0aWZhY3RDb2xsZWN0aW9uOjM2MjY3MjMzNA==', 'latestArtifact': {'id': 'QXJ0aWZhY3Q6MTE0MDA5NDY1MQ==', 'versionIndex': 9}}}
|
293 |
+
2024-08-12 06:40:47,912 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: defer
|
294 |
+
2024-08-12 06:40:47,912 INFO SenderThread:13101 [sender.py:send_request_defer():613] handle sender defer: 9
|
295 |
+
2024-08-12 06:40:47,913 INFO SenderThread:13101 [dir_watcher.py:finish():358] shutting down directory watcher
|
296 |
+
2024-08-12 06:40:48,063 INFO SenderThread:13101 [dir_watcher.py:finish():388] scan: /project/wandb/run-20240812_063447-whqmtxyq/files
|
297 |
+
2024-08-12 06:40:48,063 INFO SenderThread:13101 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_063447-whqmtxyq/files/requirements.txt requirements.txt
|
298 |
+
2024-08-12 06:40:48,063 INFO SenderThread:13101 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_063447-whqmtxyq/files/config.yaml config.yaml
|
299 |
+
2024-08-12 06:40:48,065 INFO SenderThread:13101 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_063447-whqmtxyq/files/wandb-metadata.json wandb-metadata.json
|
300 |
+
2024-08-12 06:40:48,065 INFO SenderThread:13101 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_063447-whqmtxyq/files/wandb-summary.json wandb-summary.json
|
301 |
+
2024-08-12 06:40:48,067 INFO SenderThread:13101 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_063447-whqmtxyq/files/output.log output.log
|
302 |
+
2024-08-12 06:40:48,067 INFO SenderThread:13101 [sender.py:transition_state():617] send defer: 10
|
303 |
+
2024-08-12 06:40:48,068 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: poll_exit
|
304 |
+
2024-08-12 06:40:48,069 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: defer
|
305 |
+
2024-08-12 06:40:48,069 INFO HandlerThread:13101 [handler.py:handle_request_defer():172] handle defer: 10
|
306 |
+
2024-08-12 06:40:48,070 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: defer
|
307 |
+
2024-08-12 06:40:48,071 INFO SenderThread:13101 [sender.py:send_request_defer():613] handle sender defer: 10
|
308 |
+
2024-08-12 06:40:48,071 INFO SenderThread:13101 [file_pusher.py:finish():172] shutting down file pusher
|
309 |
+
2024-08-12 06:40:48,555 INFO wandb-upload_1:13101 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240812_063447-whqmtxyq/files/config.yaml
|
310 |
+
2024-08-12 06:40:48,607 INFO wandb-upload_0:13101 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240812_063447-whqmtxyq/files/requirements.txt
|
311 |
+
2024-08-12 06:40:48,857 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: poll_exit
|
312 |
+
2024-08-12 06:40:48,857 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: poll_exit
|
313 |
+
2024-08-12 06:40:49,047 INFO wandb-upload_2:13101 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240812_063447-whqmtxyq/files/wandb-summary.json
|
314 |
+
2024-08-12 06:40:49,065 INFO wandb-upload_3:13101 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240812_063447-whqmtxyq/files/output.log
|
315 |
+
2024-08-12 06:40:49,265 INFO Thread-11 (_thread_body):13101 [sender.py:transition_state():617] send defer: 11
|
316 |
+
2024-08-12 06:40:49,266 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: defer
|
317 |
+
2024-08-12 06:40:49,266 INFO HandlerThread:13101 [handler.py:handle_request_defer():172] handle defer: 11
|
318 |
+
2024-08-12 06:40:49,266 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: defer
|
319 |
+
2024-08-12 06:40:49,266 INFO SenderThread:13101 [sender.py:send_request_defer():613] handle sender defer: 11
|
320 |
+
2024-08-12 06:40:49,266 INFO SenderThread:13101 [file_pusher.py:join():178] waiting for file pusher
|
321 |
+
2024-08-12 06:40:49,266 INFO SenderThread:13101 [sender.py:transition_state():617] send defer: 12
|
322 |
+
2024-08-12 06:40:49,267 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: defer
|
323 |
+
2024-08-12 06:40:49,267 INFO HandlerThread:13101 [handler.py:handle_request_defer():172] handle defer: 12
|
324 |
+
2024-08-12 06:40:49,267 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: defer
|
325 |
+
2024-08-12 06:40:49,267 INFO SenderThread:13101 [sender.py:send_request_defer():613] handle sender defer: 12
|
326 |
+
2024-08-12 06:40:49,267 INFO SenderThread:13101 [file_stream.py:finish():595] file stream finish called
|
327 |
+
2024-08-12 06:40:49,435 INFO SenderThread:13101 [file_stream.py:finish():599] file stream finish is done
|
328 |
+
2024-08-12 06:40:49,435 INFO SenderThread:13101 [sender.py:transition_state():617] send defer: 13
|
329 |
+
2024-08-12 06:40:49,436 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: defer
|
330 |
+
2024-08-12 06:40:49,436 INFO HandlerThread:13101 [handler.py:handle_request_defer():172] handle defer: 13
|
331 |
+
2024-08-12 06:40:49,436 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: defer
|
332 |
+
2024-08-12 06:40:49,436 INFO SenderThread:13101 [sender.py:send_request_defer():613] handle sender defer: 13
|
333 |
+
2024-08-12 06:40:49,436 INFO SenderThread:13101 [sender.py:transition_state():617] send defer: 14
|
334 |
+
2024-08-12 06:40:49,436 DEBUG SenderThread:13101 [sender.py:send():382] send: final
|
335 |
+
2024-08-12 06:40:49,436 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: defer
|
336 |
+
2024-08-12 06:40:49,436 DEBUG SenderThread:13101 [sender.py:send():382] send: footer
|
337 |
+
2024-08-12 06:40:49,436 INFO HandlerThread:13101 [handler.py:handle_request_defer():172] handle defer: 14
|
338 |
+
2024-08-12 06:40:49,437 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: defer
|
339 |
+
2024-08-12 06:40:49,437 INFO SenderThread:13101 [sender.py:send_request_defer():613] handle sender defer: 14
|
340 |
+
2024-08-12 06:40:49,437 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: poll_exit
|
341 |
+
2024-08-12 06:40:49,437 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: poll_exit
|
342 |
+
2024-08-12 06:40:49,438 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: poll_exit
|
343 |
+
2024-08-12 06:40:49,438 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: poll_exit
|
344 |
+
2024-08-12 06:40:49,438 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: server_info
|
345 |
+
2024-08-12 06:40:49,438 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: server_info
|
346 |
+
2024-08-12 06:40:49,439 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: get_summary
|
347 |
+
2024-08-12 06:40:49,440 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: sampled_history
|
348 |
+
2024-08-12 06:40:49,442 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: internal_messages
|
349 |
+
2024-08-12 06:40:49,442 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: job_info
|
350 |
+
2024-08-12 06:40:49,609 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: job_info
|
351 |
+
2024-08-12 06:40:49,610 INFO MainThread:13101 [wandb_run.py:_footer_history_summary_info():3866] rendering history
|
352 |
+
2024-08-12 06:40:49,610 INFO MainThread:13101 [wandb_run.py:_footer_history_summary_info():3898] rendering summary
|
353 |
+
2024-08-12 06:40:49,611 INFO MainThread:13101 [wandb_run.py:_footer_sync_info():3825] logging synced files
|
354 |
+
2024-08-12 06:40:49,611 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: shutdown
|
355 |
+
2024-08-12 06:40:49,611 INFO HandlerThread:13101 [handler.py:finish():869] shutting down handler
|
356 |
+
2024-08-12 06:40:50,442 INFO WriterThread:13101 [datastore.py:close():296] close: /project/wandb/run-20240812_063447-whqmtxyq/run-whqmtxyq.wandb
|
357 |
+
2024-08-12 06:40:50,610 INFO SenderThread:13101 [sender.py:finish():1572] shutting down sender
|
358 |
+
2024-08-12 06:40:50,610 INFO SenderThread:13101 [file_pusher.py:finish():172] shutting down file pusher
|
359 |
+
2024-08-12 06:40:50,610 INFO SenderThread:13101 [file_pusher.py:join():178] waiting for file pusher
|
wandb/run-20240812_063447-whqmtxyq/logs/debug.log
ADDED
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
2024-08-12 06:34:47,351 INFO MainThread:13030 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
|
2 |
+
2024-08-12 06:34:47,352 INFO MainThread:13030 [wandb_setup.py:_flush():76] Configure stats pid to 13030
|
3 |
+
2024-08-12 06:34:47,352 INFO MainThread:13030 [wandb_setup.py:_flush():76] Loading settings from /singularity_home/.config/wandb/settings
|
4 |
+
2024-08-12 06:34:47,352 INFO MainThread:13030 [wandb_setup.py:_flush():76] Loading settings from /project/wandb/settings
|
5 |
+
2024-08-12 06:34:47,352 INFO MainThread:13030 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'api_key': '***REDACTED***', 'run_notes': 'Train sample'}
|
6 |
+
2024-08-12 06:34:47,352 INFO MainThread:13030 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
|
7 |
+
2024-08-12 06:34:47,352 INFO MainThread:13030 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'examples/finetuning.py', 'program_abspath': '/project/examples/finetuning.py', 'program': '/project/examples/finetuning.py'}
|
8 |
+
2024-08-12 06:34:47,352 INFO MainThread:13030 [wandb_init.py:_log_setup():526] Logging user logs to /project/wandb/run-20240812_063447-whqmtxyq/logs/debug.log
|
9 |
+
2024-08-12 06:34:47,352 INFO MainThread:13030 [wandb_init.py:_log_setup():527] Logging internal logs to /project/wandb/run-20240812_063447-whqmtxyq/logs/debug-internal.log
|
10 |
+
2024-08-12 06:34:47,352 INFO MainThread:13030 [wandb_init.py:init():566] calling init triggers
|
11 |
+
2024-08-12 06:34:47,352 INFO MainThread:13030 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
|
12 |
+
config: {'sharding_strategy': 'FULL_SHARD', 'checkpoint_type': 'LOCAL_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'low_cpu_fsdp': False, 'no_meta_device': False, 'data_path': None, 'split': '969, 30, 1', 'train_data_path': ['235289369', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document'], 'valid_data_path': ['235289369', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document'], 'test_data_path': ['235289369', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document'], 'data_cache_path': None, 'vocab_size': None, 'vocab_file': None, 'merge_file': None, 'seq_length': 1021, 'num_workers': 2, 'tokenizer_type': 'HFPreTrainedTokenizer', 'tokenizer_model': '/share/pretrained_lm/google/gemma-2-2b', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'retro_return_doc_ids': False, 'short_seq_prob': 0.1, 'vocab_extra_ids': 0, 'seed': 1234, 'use_mpi': False, 'wandb_entity': 'iwakawa-koichi-q5-tohoku-nlp6723', 'wandb_name': 'yans-sample-gemma-2-2b_train_2024-08-12-06:34:36', 'wandb_project': 'llm_tutorial', 'quantization': False, 'use_freeze_layers': False, 'freeze_layers': None, 'bf16': True, 'fp16': False, 'mixed_precision': True, 'param_dtype': None, 'load': '/work/llm_recipes/models/yans-sample-gemma-2-2b', 'save': '/work/llm_recipes/models/yans-sample-gemma-2-2b', 'base_model': '/share/pretrained_lm/google/gemma-2-2b', 'use_better_transformer': False, 'grad_clip_norm': 1.0, 'eval_interval': 3, 'save_interval': 3, 'eval_iters': 10, 'optimizer': 'anyprecision', 'lr': 2e-05, 'lr_decay_style': 'cosine', 'lr_decay_iters': 20000, 'lr_warmup_iters': 500, 'min_lr': 1e-06, 'train_iters': 20000, 'train_samples': None, 'global_batch_size': 320, 'micro_batch_size': 1, 'make_vocab_size_divisible_by': 128, 'sliding_window_size': 4096, 'skip_batch': None, 'no_save_optimizer_state': False, 'continual_pretraining': False, 'instruction_tuning': False, 'direct_preference_optimization': False, 'attention_dropout': 0.1, 'hidden_dropout': 0.1, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-06, 'hf_transformer_model_dir': None, 'instruction_train_data_path': None, 'instruction_valid_data_path': None, 'epoch': None, 'instruction_dataset_size': None, 'save_sampler_state': False, 'label_smoothing': 0.0, 'save_n_checkpoints': 10, 'hf_repo_id': 'koichi12/yans-sample-gemma-2-2b', 'create_public_hf_repo': False, 'upload_all_checkpoints_to_hf': False, 'hf_upload_retry_limit': 2, 'exit_duration_in_mins': None, 'source_key': None, 'target_key': None, 'attn_implementation': 'flash_attention_2', 'efficient_instruction_tuning': False, 'remove_padding_masking': False, 'save_start_iter': None, 'rank': 0, 'world_size': 1, 'padded_vocab_size': 256000, 'gradient_accumulation_steps': 320}
|
13 |
+
2024-08-12 06:34:47,353 INFO MainThread:13030 [wandb_init.py:init():616] starting backend
|
14 |
+
2024-08-12 06:34:47,353 INFO MainThread:13030 [wandb_init.py:init():620] setting up manager
|
15 |
+
2024-08-12 06:34:47,357 INFO MainThread:13030 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
|
16 |
+
2024-08-12 06:34:47,358 INFO MainThread:13030 [wandb_init.py:init():628] backend started and connected
|
17 |
+
2024-08-12 06:34:47,363 INFO MainThread:13030 [wandb_init.py:init():720] updated telemetry
|
18 |
+
2024-08-12 06:34:47,374 INFO MainThread:13030 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
|
19 |
+
2024-08-12 06:34:47,834 INFO MainThread:13030 [wandb_run.py:_on_init():2262] communicating current version
|
20 |
+
2024-08-12 06:34:47,915 INFO MainThread:13030 [wandb_run.py:_on_init():2271] got version response upgrade_message: "wandb version 0.17.6 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
|
21 |
+
|
22 |
+
2024-08-12 06:34:47,915 INFO MainThread:13030 [wandb_init.py:init():804] starting run threads in backend
|
23 |
+
2024-08-12 06:34:47,973 INFO MainThread:13030 [wandb_run.py:_console_start():2241] atexit reg
|
24 |
+
2024-08-12 06:34:47,973 INFO MainThread:13030 [wandb_run.py:_redirect():2096] redirect: wrap_raw
|
25 |
+
2024-08-12 06:34:47,973 INFO MainThread:13030 [wandb_run.py:_redirect():2161] Wrapping output streams.
|
26 |
+
2024-08-12 06:34:47,974 INFO MainThread:13030 [wandb_run.py:_redirect():2186] Redirects installed.
|
27 |
+
2024-08-12 06:34:47,975 INFO MainThread:13030 [wandb_init.py:init():847] run started, returning control to user process
|
28 |
+
2024-08-12 06:36:09,914 INFO MainThread:13030 [wandb_run.py:_config_callback():1343] config_cb None None {'model_architecture': 'Gemma2ForCausalLM', 'activation_function': 'gelu_pytorch_tanh', 'hidden_size': 2304, 'model_type': 'gemma2', 'max_position_embeddings': 1021, 'num_attention_heads': 8, 'num_hidden_layers': 26}
|
29 |
+
2024-08-12 06:36:09,915 INFO MainThread:13030 [wandb_run.py:_config_callback():1343] config_cb None None {'world_size': 1}
|
30 |
+
2024-08-12 06:40:50,612 WARNING MsgRouterThr:13030 [router.py:message_loop():77] message_loop has been closed
|
wandb/run-20240812_063447-whqmtxyq/run-whqmtxyq.wandb
ADDED
Binary file (42.3 kB). View file
|
|
wandb/run-20240815_031216-0szn78ph/files/config.yaml
ADDED
@@ -0,0 +1,335 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
wandb_version: 1
|
2 |
+
|
3 |
+
sharding_strategy:
|
4 |
+
desc: null
|
5 |
+
value: FULL_SHARD
|
6 |
+
checkpoint_type:
|
7 |
+
desc: null
|
8 |
+
value: LOCAL_STATE_DICT
|
9 |
+
fsdp_activation_checkpointing:
|
10 |
+
desc: null
|
11 |
+
value: true
|
12 |
+
fsdp_cpu_offload:
|
13 |
+
desc: null
|
14 |
+
value: false
|
15 |
+
low_cpu_fsdp:
|
16 |
+
desc: null
|
17 |
+
value: false
|
18 |
+
no_meta_device:
|
19 |
+
desc: null
|
20 |
+
value: false
|
21 |
+
data_path:
|
22 |
+
desc: null
|
23 |
+
value: null
|
24 |
+
split:
|
25 |
+
desc: null
|
26 |
+
value: 969, 30, 1
|
27 |
+
train_data_path:
|
28 |
+
desc: null
|
29 |
+
value:
|
30 |
+
- '304771887'
|
31 |
+
- /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document
|
32 |
+
valid_data_path:
|
33 |
+
desc: null
|
34 |
+
value:
|
35 |
+
- '304771887'
|
36 |
+
- /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document
|
37 |
+
test_data_path:
|
38 |
+
desc: null
|
39 |
+
value:
|
40 |
+
- '304771887'
|
41 |
+
- /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document
|
42 |
+
data_cache_path:
|
43 |
+
desc: null
|
44 |
+
value: null
|
45 |
+
vocab_size:
|
46 |
+
desc: null
|
47 |
+
value: null
|
48 |
+
vocab_file:
|
49 |
+
desc: null
|
50 |
+
value: null
|
51 |
+
merge_file:
|
52 |
+
desc: null
|
53 |
+
value: null
|
54 |
+
seq_length:
|
55 |
+
desc: null
|
56 |
+
value: 4096
|
57 |
+
num_workers:
|
58 |
+
desc: null
|
59 |
+
value: 2
|
60 |
+
tokenizer_type:
|
61 |
+
desc: null
|
62 |
+
value: HFPreTrainedTokenizer
|
63 |
+
tokenizer_model:
|
64 |
+
desc: null
|
65 |
+
value: /share/pretrained_lm/Qwen/Qwen2-0.5B
|
66 |
+
reset_position_ids:
|
67 |
+
desc: null
|
68 |
+
value: false
|
69 |
+
reset_attention_mask:
|
70 |
+
desc: null
|
71 |
+
value: false
|
72 |
+
eod_mask_loss:
|
73 |
+
desc: null
|
74 |
+
value: false
|
75 |
+
retro_return_doc_ids:
|
76 |
+
desc: null
|
77 |
+
value: false
|
78 |
+
short_seq_prob:
|
79 |
+
desc: null
|
80 |
+
value: 0.1
|
81 |
+
vocab_extra_ids:
|
82 |
+
desc: null
|
83 |
+
value: 0
|
84 |
+
seed:
|
85 |
+
desc: null
|
86 |
+
value: 1234
|
87 |
+
use_mpi:
|
88 |
+
desc: null
|
89 |
+
value: false
|
90 |
+
wandb_entity:
|
91 |
+
desc: null
|
92 |
+
value: iwakawa-koichi-q5-tohoku-nlp6723
|
93 |
+
wandb_name:
|
94 |
+
desc: null
|
95 |
+
value: yans-qwen2-0.5B_train_2024-08-15-03:11:59
|
96 |
+
wandb_project:
|
97 |
+
desc: null
|
98 |
+
value: llm_tutorial
|
99 |
+
quantization:
|
100 |
+
desc: null
|
101 |
+
value: false
|
102 |
+
use_freeze_layers:
|
103 |
+
desc: null
|
104 |
+
value: false
|
105 |
+
freeze_layers:
|
106 |
+
desc: null
|
107 |
+
value: null
|
108 |
+
bf16:
|
109 |
+
desc: null
|
110 |
+
value: true
|
111 |
+
fp16:
|
112 |
+
desc: null
|
113 |
+
value: false
|
114 |
+
mixed_precision:
|
115 |
+
desc: null
|
116 |
+
value: true
|
117 |
+
param_dtype:
|
118 |
+
desc: null
|
119 |
+
value: null
|
120 |
+
load:
|
121 |
+
desc: null
|
122 |
+
value: /work/llm_recipes/models/yans-qwen2-0.5B
|
123 |
+
save:
|
124 |
+
desc: null
|
125 |
+
value: /work/llm_recipes/models/yans-qwen2-0.5B
|
126 |
+
base_model:
|
127 |
+
desc: null
|
128 |
+
value: /share/pretrained_lm/Qwen/Qwen2-0.5B
|
129 |
+
use_better_transformer:
|
130 |
+
desc: null
|
131 |
+
value: false
|
132 |
+
grad_clip_norm:
|
133 |
+
desc: null
|
134 |
+
value: 1.0
|
135 |
+
eval_interval:
|
136 |
+
desc: null
|
137 |
+
value: 10
|
138 |
+
save_interval:
|
139 |
+
desc: null
|
140 |
+
value: 10
|
141 |
+
eval_iters:
|
142 |
+
desc: null
|
143 |
+
value: 10
|
144 |
+
optimizer:
|
145 |
+
desc: null
|
146 |
+
value: adam
|
147 |
+
lr:
|
148 |
+
desc: null
|
149 |
+
value: 2.0e-05
|
150 |
+
lr_decay_style:
|
151 |
+
desc: null
|
152 |
+
value: cosine
|
153 |
+
lr_decay_iters:
|
154 |
+
desc: null
|
155 |
+
value: 20000
|
156 |
+
lr_warmup_iters:
|
157 |
+
desc: null
|
158 |
+
value: 500
|
159 |
+
min_lr:
|
160 |
+
desc: null
|
161 |
+
value: 1.0e-06
|
162 |
+
train_iters:
|
163 |
+
desc: null
|
164 |
+
value: 20000
|
165 |
+
train_samples:
|
166 |
+
desc: null
|
167 |
+
value: null
|
168 |
+
global_batch_size:
|
169 |
+
desc: null
|
170 |
+
value: 320
|
171 |
+
micro_batch_size:
|
172 |
+
desc: null
|
173 |
+
value: 1
|
174 |
+
make_vocab_size_divisible_by:
|
175 |
+
desc: null
|
176 |
+
value: 128
|
177 |
+
sliding_window_size:
|
178 |
+
desc: null
|
179 |
+
value: 4096
|
180 |
+
skip_batch:
|
181 |
+
desc: null
|
182 |
+
value: null
|
183 |
+
no_save_optimizer_state:
|
184 |
+
desc: null
|
185 |
+
value: false
|
186 |
+
continual_pretraining:
|
187 |
+
desc: null
|
188 |
+
value: false
|
189 |
+
instruction_tuning:
|
190 |
+
desc: null
|
191 |
+
value: false
|
192 |
+
direct_preference_optimization:
|
193 |
+
desc: null
|
194 |
+
value: false
|
195 |
+
attention_dropout:
|
196 |
+
desc: null
|
197 |
+
value: 0.1
|
198 |
+
hidden_dropout:
|
199 |
+
desc: null
|
200 |
+
value: 0.1
|
201 |
+
weight_decay:
|
202 |
+
desc: null
|
203 |
+
value: 0.1
|
204 |
+
adam_beta1:
|
205 |
+
desc: null
|
206 |
+
value: 0.9
|
207 |
+
adam_beta2:
|
208 |
+
desc: null
|
209 |
+
value: 0.95
|
210 |
+
adam_eps:
|
211 |
+
desc: null
|
212 |
+
value: 1.0e-06
|
213 |
+
hf_transformer_model_dir:
|
214 |
+
desc: null
|
215 |
+
value: null
|
216 |
+
instruction_train_data_path:
|
217 |
+
desc: null
|
218 |
+
value: null
|
219 |
+
instruction_valid_data_path:
|
220 |
+
desc: null
|
221 |
+
value: null
|
222 |
+
epoch:
|
223 |
+
desc: null
|
224 |
+
value: null
|
225 |
+
instruction_dataset_size:
|
226 |
+
desc: null
|
227 |
+
value: null
|
228 |
+
save_sampler_state:
|
229 |
+
desc: null
|
230 |
+
value: false
|
231 |
+
label_smoothing:
|
232 |
+
desc: null
|
233 |
+
value: 0.0
|
234 |
+
save_n_checkpoints:
|
235 |
+
desc: null
|
236 |
+
value: 10
|
237 |
+
hf_repo_id:
|
238 |
+
desc: null
|
239 |
+
value: koichi12/yans-qwen2-0.5B
|
240 |
+
create_public_hf_repo:
|
241 |
+
desc: null
|
242 |
+
value: false
|
243 |
+
upload_all_checkpoints_to_hf:
|
244 |
+
desc: null
|
245 |
+
value: false
|
246 |
+
hf_upload_retry_limit:
|
247 |
+
desc: null
|
248 |
+
value: 2
|
249 |
+
exit_duration_in_mins:
|
250 |
+
desc: null
|
251 |
+
value: null
|
252 |
+
source_key:
|
253 |
+
desc: null
|
254 |
+
value: null
|
255 |
+
target_key:
|
256 |
+
desc: null
|
257 |
+
value: null
|
258 |
+
attn_implementation:
|
259 |
+
desc: null
|
260 |
+
value: flash_attention_2
|
261 |
+
efficient_instruction_tuning:
|
262 |
+
desc: null
|
263 |
+
value: false
|
264 |
+
remove_padding_masking:
|
265 |
+
desc: null
|
266 |
+
value: false
|
267 |
+
save_start_iter:
|
268 |
+
desc: null
|
269 |
+
value: null
|
270 |
+
rank:
|
271 |
+
desc: null
|
272 |
+
value: 0
|
273 |
+
world_size:
|
274 |
+
desc: null
|
275 |
+
value: 1
|
276 |
+
padded_vocab_size:
|
277 |
+
desc: null
|
278 |
+
value: 151680
|
279 |
+
gradient_accumulation_steps:
|
280 |
+
desc: null
|
281 |
+
value: 320
|
282 |
+
_wandb:
|
283 |
+
desc: null
|
284 |
+
value:
|
285 |
+
python_version: 3.10.12
|
286 |
+
cli_version: 0.16.3
|
287 |
+
framework: huggingface
|
288 |
+
huggingface_version: 4.43.3
|
289 |
+
is_jupyter_run: false
|
290 |
+
is_kaggle_kernel: false
|
291 |
+
start_time: 1723659136.24386
|
292 |
+
t:
|
293 |
+
1:
|
294 |
+
- 1
|
295 |
+
- 11
|
296 |
+
- 49
|
297 |
+
- 55
|
298 |
+
- 71
|
299 |
+
2:
|
300 |
+
- 1
|
301 |
+
- 11
|
302 |
+
- 49
|
303 |
+
- 55
|
304 |
+
- 71
|
305 |
+
3:
|
306 |
+
- 13
|
307 |
+
- 16
|
308 |
+
- 23
|
309 |
+
4: 3.10.12
|
310 |
+
5: 0.16.3
|
311 |
+
6: 4.43.3
|
312 |
+
8:
|
313 |
+
- 5
|
314 |
+
13: linux-x86_64
|
315 |
+
model_architecture:
|
316 |
+
desc: null
|
317 |
+
value: Qwen2ForCausalLM
|
318 |
+
activation_function:
|
319 |
+
desc: null
|
320 |
+
value: silu
|
321 |
+
hidden_size:
|
322 |
+
desc: null
|
323 |
+
value: 896
|
324 |
+
model_type:
|
325 |
+
desc: null
|
326 |
+
value: qwen2
|
327 |
+
max_position_embeddings:
|
328 |
+
desc: null
|
329 |
+
value: 4096
|
330 |
+
num_attention_heads:
|
331 |
+
desc: null
|
332 |
+
value: 14
|
333 |
+
num_hidden_layers:
|
334 |
+
desc: null
|
335 |
+
value: 24
|
wandb/run-20240815_031216-0szn78ph/files/output.log
ADDED
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Created Hugging Face repository with ID koichi12/yans-qwen2-0.5B.
|
2 |
+
Clearing GPU cache for all ranks
|
3 |
+
--> Running with torch torch_distributed debug set to detail
|
4 |
+
Loading model state dict from /work/llm_recipes/models/yans-qwen2-0.5B/iter_0001160/model.pt
|
5 |
+
You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
|
6 |
+
Loaded model state dict from /work/llm_recipes/models/yans-qwen2-0.5B/iter_0001160/model.pt
|
7 |
+
--> Model /share/pretrained_lm/Qwen/Qwen2-0.5B
|
8 |
+
--> /share/pretrained_lm/Qwen/Qwen2-0.5B has 494.032768 Million params
|
9 |
+
BFloat16 enabled for mixed precision - using bfSixteen policy
|
10 |
+
/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_init_utils.py:441: UserWarning: FSDP is switching to use `NO_SHARD` instead of ShardingStrategy.FULL_SHARD since the world size is 1.
|
11 |
+
warnings.warn(
|
12 |
+
--> applying fsdp activation checkpointing...
|
13 |
+
> datasets target sizes (minimum size):
|
14 |
+
train: 6400000
|
15 |
+
validation: 6403200
|
16 |
+
test: 3200
|
17 |
+
> building train, validation, and test datasets for GPT ...
|
18 |
+
> finished creating GPT datasets ...
|
19 |
+
Loading optimizer state dict from /work/llm_recipes/models/yans-qwen2-0.5B/iter_0001160/optimizer.pt
|
20 |
+
Let split = None
|
21 |
+
Building a BlendedDataset for a single MegatronDataset
|
22 |
+
Unable to save the indexes because path_to_cache is None
|
23 |
+
Building a BlendedDataset for a single MegatronDataset
|
24 |
+
Unable to save the indexes because path_to_cache is None
|
25 |
+
Building a BlendedDataset for a single MegatronDataset
|
26 |
+
Unable to save the indexes because path_to_cache is None
|
27 |
+
Loaded optimizer state dict from /work/llm_recipes/models/yans-qwen2-0.5B/iter_0001160/optimizer.pt
|
28 |
+
model info: FullyShardedDataParallel(
|
29 |
+
(_fsdp_wrapped_module): Qwen2ForCausalLM(
|
30 |
+
(model): Qwen2Model(
|
31 |
+
(embed_tokens): Embedding(151936, 896)
|
32 |
+
(layers): ModuleList(
|
33 |
+
(0-23): 24 x FullyShardedDataParallel(
|
34 |
+
(_fsdp_wrapped_module): CheckpointWrapper(
|
35 |
+
(_checkpoint_wrapped_module): Qwen2DecoderLayer(
|
36 |
+
(self_attn): Qwen2FlashAttention2(
|
37 |
+
(q_proj): Linear(in_features=896, out_features=896, bias=True)
|
38 |
+
(k_proj): Linear(in_features=896, out_features=128, bias=True)
|
39 |
+
(v_proj): Linear(in_features=896, out_features=128, bias=True)
|
40 |
+
(o_proj): Linear(in_features=896, out_features=896, bias=False)
|
41 |
+
(rotary_emb): Qwen2RotaryEmbedding()
|
42 |
+
)
|
43 |
+
(mlp): Qwen2MLP(
|
44 |
+
(gate_proj): Linear(in_features=896, out_features=4864, bias=False)
|
45 |
+
(up_proj): Linear(in_features=896, out_features=4864, bias=False)
|
46 |
+
(down_proj): Linear(in_features=4864, out_features=896, bias=False)
|
47 |
+
(act_fn): SiLU()
|
48 |
+
)
|
49 |
+
(input_layernorm): Qwen2RMSNorm()
|
50 |
+
(post_attention_layernorm): Qwen2RMSNorm()
|
51 |
+
)
|
52 |
+
)
|
53 |
+
)
|
54 |
+
)
|
55 |
+
(norm): Qwen2RMSNorm()
|
56 |
+
)
|
57 |
+
(lm_head): Linear(in_features=896, out_features=151936, bias=False)
|
58 |
+
)
|
59 |
+
)
|
60 |
+
model config: Qwen2Config {
|
61 |
+
"_name_or_path": "/share/pretrained_lm/Qwen/Qwen2-0.5B",
|
62 |
+
"architectures": [
|
63 |
+
"Qwen2ForCausalLM"
|
64 |
+
],
|
65 |
+
"attention_dropout": 0.0,
|
66 |
+
"bos_token_id": 151643,
|
67 |
+
"eos_token_id": 151643,
|
68 |
+
"hidden_act": "silu",
|
69 |
+
"hidden_size": 896,
|
70 |
+
"initializer_range": 0.02,
|
71 |
+
"intermediate_size": 4864,
|
72 |
+
"label_smoothing": 0.0,
|
73 |
+
"max_position_embeddings": 4096,
|
74 |
+
"max_window_layers": 24,
|
75 |
+
"model_type": "qwen2",
|
76 |
+
"num_attention_heads": 14,
|
77 |
+
"num_hidden_layers": 24,
|
78 |
+
"num_key_value_heads": 2,
|
79 |
+
"rms_norm_eps": 1e-06,
|
80 |
+
"rope_theta": 1000000.0,
|
81 |
+
"sliding_window": null,
|
82 |
+
"tie_word_embeddings": true,
|
83 |
+
"torch_dtype": "bfloat16",
|
84 |
+
"transformers_version": "4.43.3",
|
85 |
+
"use_cache": false,
|
86 |
+
"use_sliding_window": false,
|
87 |
+
"vocab_size": 151936
|
88 |
+
}
|
89 |
+
[rank0]:[2024-08-15 03:12:42,940] torch.distributed.fsdp._debug_utils: [WARNING] FSDP _flatten_optim_state_dict() profiling: defaultdict(<class 'float'>, {})
|
90 |
+
------------------------------------------------------------------
|
91 |
+
iteration: 1161 , TFLOPS: 67.46644597716896, Tokens per sec: 16778.56616965974, Loss: 2.442603349685669
|
92 |
+
------------------------------------------------------------------
|
wandb/run-20240815_031216-0szn78ph/files/requirements.txt
ADDED
@@ -0,0 +1,293 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
absl-py==2.1.0
|
2 |
+
accelerate==0.33.0
|
3 |
+
aiohttp==3.9.1
|
4 |
+
aiosignal==1.3.1
|
5 |
+
annotated-types==0.6.0
|
6 |
+
apex==0.1
|
7 |
+
appdirs==1.4.4
|
8 |
+
argon2-cffi-bindings==21.2.0
|
9 |
+
argon2-cffi==23.1.0
|
10 |
+
asttokens==2.4.1
|
11 |
+
astunparse==1.6.3
|
12 |
+
async-timeout==4.0.3
|
13 |
+
attrs==23.2.0
|
14 |
+
audioread==3.0.1
|
15 |
+
beautifulsoup4==4.12.3
|
16 |
+
bleach==6.1.0
|
17 |
+
blis==0.7.11
|
18 |
+
cachetools==5.3.2
|
19 |
+
catalogue==2.0.10
|
20 |
+
certifi==2024.2.2
|
21 |
+
cffi==1.16.0
|
22 |
+
chardet==5.2.0
|
23 |
+
charset-normalizer==3.3.2
|
24 |
+
click==8.1.7
|
25 |
+
cloudpathlib==0.16.0
|
26 |
+
cloudpickle==3.0.0
|
27 |
+
cmake==3.28.1
|
28 |
+
colorama==0.4.6
|
29 |
+
comm==0.2.1
|
30 |
+
confection==0.1.4
|
31 |
+
contourpy==1.2.0
|
32 |
+
cubinlinker==0.3.0+2.g405ac64
|
33 |
+
cuda-python==12.3.0rc4+9.gdb8c48a.dirty
|
34 |
+
cudf==23.12.0
|
35 |
+
cugraph-dgl==23.12.0
|
36 |
+
cugraph-service-client==23.12.0
|
37 |
+
cugraph-service-server==23.12.0
|
38 |
+
cugraph==23.12.0
|
39 |
+
cuml==23.12.0
|
40 |
+
cupy-cuda12x==12.3.0
|
41 |
+
cycler==0.12.1
|
42 |
+
cymem==2.0.8
|
43 |
+
cython==3.0.8
|
44 |
+
dask-cuda==23.12.0
|
45 |
+
dask-cudf==23.12.0
|
46 |
+
dask==2023.11.0
|
47 |
+
dataproperty==1.0.1
|
48 |
+
datasets==2.20.0
|
49 |
+
debugpy==1.8.1
|
50 |
+
decorator==5.1.1
|
51 |
+
defusedxml==0.7.1
|
52 |
+
dill==0.3.8
|
53 |
+
distributed==2023.11.0
|
54 |
+
dm-tree==0.1.8
|
55 |
+
docker-pycreds==0.4.0
|
56 |
+
einops==0.7.0
|
57 |
+
evaluate==0.4.2
|
58 |
+
exceptiongroup==1.2.0
|
59 |
+
execnet==2.0.2
|
60 |
+
executing==2.0.1
|
61 |
+
expecttest==0.1.3
|
62 |
+
fastjsonschema==2.19.1
|
63 |
+
fastrlock==0.8.2
|
64 |
+
filelock==3.13.1
|
65 |
+
flash-attn==2.4.2
|
66 |
+
fonttools==4.48.1
|
67 |
+
frozenlist==1.4.1
|
68 |
+
fsspec==2023.12.2
|
69 |
+
gast==0.5.4
|
70 |
+
gitdb==4.0.11
|
71 |
+
gitpython==3.1.43
|
72 |
+
google-auth-oauthlib==0.4.6
|
73 |
+
google-auth==2.27.0
|
74 |
+
graphsurgeon==0.4.6
|
75 |
+
grpcio==1.60.1
|
76 |
+
huggingface-hub==0.24.5
|
77 |
+
hypothesis==5.35.1
|
78 |
+
idna==3.6
|
79 |
+
importlib-metadata==7.0.1
|
80 |
+
iniconfig==2.0.0
|
81 |
+
intel-openmp==2021.4.0
|
82 |
+
ipadic==1.0.0
|
83 |
+
ipykernel==6.29.2
|
84 |
+
ipython-genutils==0.2.0
|
85 |
+
ipython==8.21.0
|
86 |
+
jedi==0.19.1
|
87 |
+
jinja2==3.1.3
|
88 |
+
joblib==1.3.2
|
89 |
+
json5==0.9.14
|
90 |
+
jsonlines==4.0.0
|
91 |
+
jsonnet==0.19.1
|
92 |
+
jsonschema-specifications==2023.12.1
|
93 |
+
jsonschema==4.21.1
|
94 |
+
jupyter-client==8.6.0
|
95 |
+
jupyter-core==5.7.1
|
96 |
+
jupyter-tensorboard==0.2.0
|
97 |
+
jupyterlab-pygments==0.3.0
|
98 |
+
jupyterlab-server==1.2.0
|
99 |
+
jupyterlab==2.3.2
|
100 |
+
jupytext==1.16.1
|
101 |
+
kiwisolver==1.4.5
|
102 |
+
langcodes==3.3.0
|
103 |
+
lazy-loader==0.3
|
104 |
+
librosa==0.10.1
|
105 |
+
llvmlite==0.40.1
|
106 |
+
lm-eval==0.4.3
|
107 |
+
locket==1.0.0
|
108 |
+
logzero==1.7.0
|
109 |
+
lxml==5.2.2
|
110 |
+
markdown-it-py==3.0.0
|
111 |
+
markdown==3.5.2
|
112 |
+
markupsafe==2.1.4
|
113 |
+
matplotlib-inline==0.1.6
|
114 |
+
matplotlib==3.8.2
|
115 |
+
mbstrdecoder==1.1.3
|
116 |
+
mdit-py-plugins==0.4.0
|
117 |
+
mdurl==0.1.2
|
118 |
+
mecab-python3==1.0.6
|
119 |
+
mistune==3.0.2
|
120 |
+
mkl-devel==2021.1.1
|
121 |
+
mkl-include==2021.1.1
|
122 |
+
mkl==2021.1.1
|
123 |
+
mock==5.1.0
|
124 |
+
more-itertools==9.1.0
|
125 |
+
mpmath==1.3.0
|
126 |
+
msgpack==1.0.7
|
127 |
+
multidict==6.0.4
|
128 |
+
multiprocess==0.70.16
|
129 |
+
murmurhash==1.0.10
|
130 |
+
nbclient==0.9.0
|
131 |
+
nbconvert==7.16.0
|
132 |
+
nbformat==5.9.2
|
133 |
+
nest-asyncio==1.6.0
|
134 |
+
networkx==2.6.3
|
135 |
+
ninja==1.11.1.1
|
136 |
+
nltk==3.8.1
|
137 |
+
notebook==6.4.10
|
138 |
+
numba==0.57.1+1.g1ff679645
|
139 |
+
numexpr==2.10.1
|
140 |
+
numpy==1.24.4
|
141 |
+
nvfuser==0.1.4a0+d0bb811
|
142 |
+
nvidia-dali-cuda120==1.34.0
|
143 |
+
nvidia-pyindex==1.0.9
|
144 |
+
nvtx==0.2.5
|
145 |
+
oauthlib==3.2.2
|
146 |
+
onnx==1.15.0rc2
|
147 |
+
opencv==4.7.0
|
148 |
+
optree==0.10.0
|
149 |
+
packaging==23.2
|
150 |
+
pandas==1.5.3
|
151 |
+
pandocfilters==1.5.1
|
152 |
+
parso==0.8.3
|
153 |
+
partd==1.4.1
|
154 |
+
pathvalidate==3.2.0
|
155 |
+
peft==0.11.1
|
156 |
+
pexpect==4.9.0
|
157 |
+
pillow==10.2.0
|
158 |
+
pip==24.0
|
159 |
+
platformdirs==4.2.0
|
160 |
+
pluggy==1.4.0
|
161 |
+
ply==3.11
|
162 |
+
polygraphy==0.49.4
|
163 |
+
pooch==1.8.0
|
164 |
+
portalocker==2.10.1
|
165 |
+
preshed==3.0.9
|
166 |
+
prettytable==3.9.0
|
167 |
+
prometheus-client==0.19.0
|
168 |
+
prompt-toolkit==3.0.43
|
169 |
+
protobuf==4.24.4
|
170 |
+
psutil==5.9.4
|
171 |
+
ptxcompiler==0.8.1+2.g0d406d6
|
172 |
+
ptyprocess==0.7.0
|
173 |
+
pure-eval==0.2.2
|
174 |
+
pyarrow-hotfix==0.6
|
175 |
+
pyarrow==17.0.0
|
176 |
+
pyasn1-modules==0.3.0
|
177 |
+
pyasn1==0.5.1
|
178 |
+
pybind11-global==2.11.1
|
179 |
+
pybind11==2.11.1
|
180 |
+
pycocotools==2.0+nv0.8.0
|
181 |
+
pycparser==2.21
|
182 |
+
pydantic-core==2.16.2
|
183 |
+
pydantic==2.6.1
|
184 |
+
pygments==2.17.2
|
185 |
+
pylibcugraph==23.12.0
|
186 |
+
pylibcugraphops==23.12.0
|
187 |
+
pylibraft==23.12.0
|
188 |
+
pynvml==11.4.1
|
189 |
+
pyparsing==3.1.1
|
190 |
+
pytablewriter==1.2.0
|
191 |
+
pytest-flakefinder==1.1.0
|
192 |
+
pytest-rerunfailures==13.0
|
193 |
+
pytest-shard==0.1.2
|
194 |
+
pytest-xdist==3.5.0
|
195 |
+
pytest==8.0.0
|
196 |
+
python-dateutil==2.8.2
|
197 |
+
python-dotenv==1.0.0
|
198 |
+
python-hostlist==1.23.0
|
199 |
+
pytorch-quantization==2.1.2
|
200 |
+
pytz==2023.3.post1
|
201 |
+
pyyaml==6.0.1
|
202 |
+
pyzmq==25.1.2
|
203 |
+
raft-dask==23.12.0
|
204 |
+
rapids-dask-dependency==23.12.1
|
205 |
+
referencing==0.33.0
|
206 |
+
regex==2023.12.25
|
207 |
+
requests-oauthlib==1.3.1
|
208 |
+
requests==2.32.3
|
209 |
+
rich==13.7.0
|
210 |
+
rmm==23.12.0
|
211 |
+
rouge-score==0.1.2
|
212 |
+
rpds-py==0.17.1
|
213 |
+
rsa==4.9
|
214 |
+
sacrebleu==2.4.0
|
215 |
+
safetensors==0.4.3
|
216 |
+
scikit-learn==1.2.0
|
217 |
+
scipy==1.12.0
|
218 |
+
send2trash==1.8.2
|
219 |
+
sentencepiece==0.1.99
|
220 |
+
sentry-sdk==2.12.0
|
221 |
+
setproctitle==1.3.3
|
222 |
+
setuptools==68.2.2
|
223 |
+
six==1.16.0
|
224 |
+
smart-open==6.4.0
|
225 |
+
smmap==5.0.1
|
226 |
+
sortedcontainers==2.4.0
|
227 |
+
soundfile==0.12.1
|
228 |
+
soupsieve==2.5
|
229 |
+
soxr==0.3.7
|
230 |
+
spacy-legacy==3.0.12
|
231 |
+
spacy-loggers==1.0.5
|
232 |
+
spacy==3.7.2
|
233 |
+
sphinx-glpi-theme==0.6
|
234 |
+
sqlitedict==2.1.0
|
235 |
+
srsly==2.4.8
|
236 |
+
stack-data==0.6.3
|
237 |
+
sympy==1.12
|
238 |
+
tabledata==1.3.3
|
239 |
+
tabulate==0.9.0
|
240 |
+
tbb==2021.11.0
|
241 |
+
tblib==3.0.0
|
242 |
+
tcolorpy==0.1.6
|
243 |
+
tensorboard-data-server==0.6.1
|
244 |
+
tensorboard-plugin-wit==1.8.1
|
245 |
+
tensorboard==2.9.0
|
246 |
+
tensorrt==8.6.3
|
247 |
+
terminado==0.18.0
|
248 |
+
termplotlib==0.3.9
|
249 |
+
thinc==8.2.3
|
250 |
+
threadpoolctl==3.2.0
|
251 |
+
thriftpy2==0.4.17
|
252 |
+
tinycss2==1.2.1
|
253 |
+
tokenizers==0.19.1
|
254 |
+
toml==0.10.2
|
255 |
+
tomli==2.0.1
|
256 |
+
toolz==0.12.1
|
257 |
+
torch-tensorrt==2.3.0a0
|
258 |
+
torch==2.3.0a0+ebedce2
|
259 |
+
torchdata==0.7.1a0
|
260 |
+
torchtext==0.17.0a0
|
261 |
+
torchvision==0.18.0a0
|
262 |
+
tornado==6.4
|
263 |
+
tqdm-multiprocess==0.0.11
|
264 |
+
tqdm==4.66.5
|
265 |
+
traitlets==5.9.0
|
266 |
+
transformer-engine==1.3.0+5b90b7f
|
267 |
+
transformers==4.43.3
|
268 |
+
treelite-runtime==3.9.1
|
269 |
+
treelite==3.9.1
|
270 |
+
triton==2.2.0+e28a256
|
271 |
+
typepy==1.3.2
|
272 |
+
typer==0.9.0
|
273 |
+
types-dataclasses==0.6.6
|
274 |
+
typing-extensions==4.9.0
|
275 |
+
ucx-py==0.35.0
|
276 |
+
uff==0.6.9
|
277 |
+
ujson==5.8.0
|
278 |
+
urllib3==1.26.18
|
279 |
+
wandb==0.16.3
|
280 |
+
wasabi==1.1.2
|
281 |
+
wcwidth==0.2.13
|
282 |
+
weasel==0.3.4
|
283 |
+
webencodings==0.5.1
|
284 |
+
werkzeug==3.0.1
|
285 |
+
wheel==0.42.0
|
286 |
+
word2number==1.1
|
287 |
+
xdoctest==1.0.2
|
288 |
+
xgboost==1.7.6
|
289 |
+
xxhash==3.4.1
|
290 |
+
yarl==1.9.4
|
291 |
+
zict==3.0.0
|
292 |
+
zipp==3.17.0
|
293 |
+
zstandard==0.23.0
|
wandb/run-20240815_031216-0szn78ph/files/wandb-metadata.json
ADDED
@@ -0,0 +1,215 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
|
3 |
+
"python": "3.10.12",
|
4 |
+
"heartbeatAt": "2024-08-14T18:12:16.980997",
|
5 |
+
"startedAt": "2024-08-14T18:12:16.230100",
|
6 |
+
"docker": null,
|
7 |
+
"cuda": null,
|
8 |
+
"args": [
|
9 |
+
"--seq-length",
|
10 |
+
"4096",
|
11 |
+
"--sliding-window-size",
|
12 |
+
"4096",
|
13 |
+
"--micro-batch-size",
|
14 |
+
"1",
|
15 |
+
"--global-batch-size",
|
16 |
+
"320",
|
17 |
+
"--train-iters",
|
18 |
+
"20000",
|
19 |
+
"--tokenizer-type",
|
20 |
+
"HFPreTrainedTokenizer",
|
21 |
+
"--tokenizer-model",
|
22 |
+
"/share/pretrained_lm/Qwen/Qwen2-0.5B",
|
23 |
+
"--train-data-path",
|
24 |
+
"304771887",
|
25 |
+
"/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document",
|
26 |
+
"--valid-data-path",
|
27 |
+
"304771887",
|
28 |
+
"/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document",
|
29 |
+
"--test-data-path",
|
30 |
+
"304771887",
|
31 |
+
"/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document",
|
32 |
+
"--lr",
|
33 |
+
"2e-5",
|
34 |
+
"--min-lr",
|
35 |
+
"1e-6",
|
36 |
+
"--lr-decay-style",
|
37 |
+
"cosine",
|
38 |
+
"--lr-warmup-iters",
|
39 |
+
"500",
|
40 |
+
"--lr-decay-iters",
|
41 |
+
"20000",
|
42 |
+
"--weight-decay",
|
43 |
+
"0.1",
|
44 |
+
"--grad-clip-norm",
|
45 |
+
"1.0",
|
46 |
+
"--optimizer",
|
47 |
+
"adam",
|
48 |
+
"--adam-beta1",
|
49 |
+
"0.9",
|
50 |
+
"--adam-beta2",
|
51 |
+
"0.95",
|
52 |
+
"--adam-eps",
|
53 |
+
"1e-6",
|
54 |
+
"--save-interval",
|
55 |
+
"10",
|
56 |
+
"--eval-interval",
|
57 |
+
"10",
|
58 |
+
"--eval-iters",
|
59 |
+
"10",
|
60 |
+
"--bf16",
|
61 |
+
"--mixed-precision",
|
62 |
+
"--base-model",
|
63 |
+
"/share/pretrained_lm/Qwen/Qwen2-0.5B",
|
64 |
+
"--save",
|
65 |
+
"/work/llm_recipes/models/yans-qwen2-0.5B",
|
66 |
+
"--load",
|
67 |
+
"/work/llm_recipes/models/yans-qwen2-0.5B",
|
68 |
+
"--fsdp-activation-checkpointing",
|
69 |
+
"--sharding-strategy",
|
70 |
+
"FULL_SHARD",
|
71 |
+
"--checkpoint-type",
|
72 |
+
"LOCAL_STATE_DICT",
|
73 |
+
"--save-n-checkpoints",
|
74 |
+
"10",
|
75 |
+
"--hf-upload-retry-limit",
|
76 |
+
"2",
|
77 |
+
"--hf-repo-id",
|
78 |
+
"koichi12/yans-qwen2-0.5B",
|
79 |
+
"--wandb-entity",
|
80 |
+
"iwakawa-koichi-q5-tohoku-nlp6723",
|
81 |
+
"--wandb-project",
|
82 |
+
"llm_tutorial",
|
83 |
+
"--wandb-name",
|
84 |
+
"yans-qwen2-0.5B_train_2024-08-15-03:11:59"
|
85 |
+
],
|
86 |
+
"state": "running",
|
87 |
+
"program": "/project/examples/finetuning.py",
|
88 |
+
"codePathLocal": "examples/finetuning.py",
|
89 |
+
"codePath": "examples/finetuning.py",
|
90 |
+
"git": {
|
91 |
+
"remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
|
92 |
+
"commit": "6da01327e78c302bc0cfdb335f3ca297e2a19c8c"
|
93 |
+
},
|
94 |
+
"email": null,
|
95 |
+
"root": "/project",
|
96 |
+
"host": "gpu-koiwa-00",
|
97 |
+
"username": "koiwa",
|
98 |
+
"executable": "/usr/bin/python",
|
99 |
+
"cpu_count": 18,
|
100 |
+
"cpu_count_logical": 18,
|
101 |
+
"cpu_freq": {
|
102 |
+
"current": 2400.0389999999993,
|
103 |
+
"min": 0.0,
|
104 |
+
"max": 0.0
|
105 |
+
},
|
106 |
+
"cpu_freq_per_core": [
|
107 |
+
{
|
108 |
+
"current": 2400.039,
|
109 |
+
"min": 0.0,
|
110 |
+
"max": 0.0
|
111 |
+
},
|
112 |
+
{
|
113 |
+
"current": 2400.039,
|
114 |
+
"min": 0.0,
|
115 |
+
"max": 0.0
|
116 |
+
},
|
117 |
+
{
|
118 |
+
"current": 2400.039,
|
119 |
+
"min": 0.0,
|
120 |
+
"max": 0.0
|
121 |
+
},
|
122 |
+
{
|
123 |
+
"current": 2400.039,
|
124 |
+
"min": 0.0,
|
125 |
+
"max": 0.0
|
126 |
+
},
|
127 |
+
{
|
128 |
+
"current": 2400.039,
|
129 |
+
"min": 0.0,
|
130 |
+
"max": 0.0
|
131 |
+
},
|
132 |
+
{
|
133 |
+
"current": 2400.039,
|
134 |
+
"min": 0.0,
|
135 |
+
"max": 0.0
|
136 |
+
},
|
137 |
+
{
|
138 |
+
"current": 2400.039,
|
139 |
+
"min": 0.0,
|
140 |
+
"max": 0.0
|
141 |
+
},
|
142 |
+
{
|
143 |
+
"current": 2400.039,
|
144 |
+
"min": 0.0,
|
145 |
+
"max": 0.0
|
146 |
+
},
|
147 |
+
{
|
148 |
+
"current": 2400.039,
|
149 |
+
"min": 0.0,
|
150 |
+
"max": 0.0
|
151 |
+
},
|
152 |
+
{
|
153 |
+
"current": 2400.039,
|
154 |
+
"min": 0.0,
|
155 |
+
"max": 0.0
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"current": 2400.039,
|
159 |
+
"min": 0.0,
|
160 |
+
"max": 0.0
|
161 |
+
},
|
162 |
+
{
|
163 |
+
"current": 2400.039,
|
164 |
+
"min": 0.0,
|
165 |
+
"max": 0.0
|
166 |
+
},
|
167 |
+
{
|
168 |
+
"current": 2400.039,
|
169 |
+
"min": 0.0,
|
170 |
+
"max": 0.0
|
171 |
+
},
|
172 |
+
{
|
173 |
+
"current": 2400.039,
|
174 |
+
"min": 0.0,
|
175 |
+
"max": 0.0
|
176 |
+
},
|
177 |
+
{
|
178 |
+
"current": 2400.039,
|
179 |
+
"min": 0.0,
|
180 |
+
"max": 0.0
|
181 |
+
},
|
182 |
+
{
|
183 |
+
"current": 2400.039,
|
184 |
+
"min": 0.0,
|
185 |
+
"max": 0.0
|
186 |
+
},
|
187 |
+
{
|
188 |
+
"current": 2400.039,
|
189 |
+
"min": 0.0,
|
190 |
+
"max": 0.0
|
191 |
+
},
|
192 |
+
{
|
193 |
+
"current": 2400.039,
|
194 |
+
"min": 0.0,
|
195 |
+
"max": 0.0
|
196 |
+
}
|
197 |
+
],
|
198 |
+
"disk": {
|
199 |
+
"/": {
|
200 |
+
"total": 0.0625,
|
201 |
+
"used": 1.1444091796875e-05
|
202 |
+
}
|
203 |
+
},
|
204 |
+
"gpu": "NVIDIA A100-SXM4-40GB",
|
205 |
+
"gpu_count": 1,
|
206 |
+
"gpu_devices": [
|
207 |
+
{
|
208 |
+
"name": "NVIDIA A100-SXM4-40GB",
|
209 |
+
"memory_total": 42949672960
|
210 |
+
}
|
211 |
+
],
|
212 |
+
"memory": {
|
213 |
+
"total": 56.48782730102539
|
214 |
+
}
|
215 |
+
}
|
wandb/run-20240815_031216-0szn78ph/files/wandb-summary.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"_wandb": {"runtime": 168}, "training/loss": 2.442603349685669, "training/perplexity": 11.502947992429535, "utils/batch_size": 1, "utils/global_batch_size": 320, "utils/seq_len": 4097, "utils/gradient_accumulation_steps": 320, "utils/iteration": 1161, "optimizer/lr": 1.9946184158325198e-05, "optimizer/variance_l2": 0.004682497095771901, "optimizer/variance_sqrt_l2": 0.5343142380105511, "optimizer/momentum_l2": 0.12459250428605805, "optimizer/weight_l2": 825.0639369164065, "optimizer/variance_l1": 0.2849578857421875, "optimizer/variance_sqrt_l1": 4625.0, "optimizer/momentum_l1": 977.875, "optimizer/weight_l1": 6918144.0, "optimizer/variance_abs_max": 0.0030059814453125, "optimizer/variance_sqrt_abs_max": 0.054931640625, "optimizer/momentum_abs_max": 0.0108642578125, "optimizer/weight_abs_max": 175.0, "stats/1_iteration_time": 78.13778523999986, "stats/tokens_per_sec": 16778.56616965974, "stats/tokens_per_sec_per_gpu": 16778.56616965974, "stats/tflops": 67.46644597716896, "_timestamp": 1723659241.8232834, "_runtime": 105.57942342758179, "_step": 1161}
|
wandb/run-20240815_031216-0szn78ph/logs/debug-internal.log
ADDED
@@ -0,0 +1,260 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
2024-08-15 03:12:16,244 INFO StreamThr :10026 [internal.py:wandb_internal():86] W&B internal server running at pid: 10026, started at: 2024-08-15 03:12:16.243481
|
2 |
+
2024-08-15 03:12:16,245 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status
|
3 |
+
2024-08-15 03:12:16,248 INFO WriterThread:10026 [datastore.py:open_for_write():87] open: /project/wandb/run-20240815_031216-0szn78ph/run-0szn78ph.wandb
|
4 |
+
2024-08-15 03:12:16,249 DEBUG SenderThread:10026 [sender.py:send():382] send: header
|
5 |
+
2024-08-15 03:12:16,409 DEBUG SenderThread:10026 [sender.py:send():382] send: run
|
6 |
+
2024-08-15 03:12:16,887 INFO SenderThread:10026 [dir_watcher.py:__init__():211] watching files in: /project/wandb/run-20240815_031216-0szn78ph/files
|
7 |
+
2024-08-15 03:12:16,887 INFO SenderThread:10026 [sender.py:_start_run_threads():1136] run started: 0szn78ph with start time 1723659136.24386
|
8 |
+
2024-08-15 03:12:16,892 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: check_version
|
9 |
+
2024-08-15 03:12:16,892 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: check_version
|
10 |
+
2024-08-15 03:12:16,962 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: run_start
|
11 |
+
2024-08-15 03:12:16,969 DEBUG HandlerThread:10026 [system_info.py:__init__():27] System info init
|
12 |
+
2024-08-15 03:12:16,969 DEBUG HandlerThread:10026 [system_info.py:__init__():42] System info init done
|
13 |
+
2024-08-15 03:12:16,969 INFO HandlerThread:10026 [system_monitor.py:start():194] Starting system monitor
|
14 |
+
2024-08-15 03:12:16,969 INFO SystemMonitor:10026 [system_monitor.py:_start():158] Starting system asset monitoring threads
|
15 |
+
2024-08-15 03:12:16,969 INFO HandlerThread:10026 [system_monitor.py:probe():214] Collecting system info
|
16 |
+
2024-08-15 03:12:16,969 INFO SystemMonitor:10026 [interfaces.py:start():190] Started cpu monitoring
|
17 |
+
2024-08-15 03:12:16,970 INFO SystemMonitor:10026 [interfaces.py:start():190] Started disk monitoring
|
18 |
+
2024-08-15 03:12:16,971 INFO SystemMonitor:10026 [interfaces.py:start():190] Started gpu monitoring
|
19 |
+
2024-08-15 03:12:16,972 INFO SystemMonitor:10026 [interfaces.py:start():190] Started memory monitoring
|
20 |
+
2024-08-15 03:12:16,972 INFO SystemMonitor:10026 [interfaces.py:start():190] Started network monitoring
|
21 |
+
2024-08-15 03:12:16,980 DEBUG HandlerThread:10026 [system_info.py:probe():151] Probing system
|
22 |
+
2024-08-15 03:12:16,983 DEBUG HandlerThread:10026 [system_info.py:_probe_git():136] Probing git
|
23 |
+
2024-08-15 03:12:16,995 DEBUG HandlerThread:10026 [system_info.py:_probe_git():144] Probing git done
|
24 |
+
2024-08-15 03:12:16,995 DEBUG HandlerThread:10026 [system_info.py:probe():199] Probing system done
|
25 |
+
2024-08-15 03:12:16,995 DEBUG HandlerThread:10026 [system_monitor.py:probe():223] {'os': 'Linux-5.15.0-91-generic-x86_64-with-glibc2.35', 'python': '3.10.12', 'heartbeatAt': '2024-08-14T18:12:16.980997', 'startedAt': '2024-08-14T18:12:16.230100', 'docker': None, 'cuda': None, 'args': ('--seq-length', '4096', '--sliding-window-size', '4096', '--micro-batch-size', '1', '--global-batch-size', '320', '--train-iters', '20000', '--tokenizer-type', 'HFPreTrainedTokenizer', '--tokenizer-model', '/share/pretrained_lm/Qwen/Qwen2-0.5B', '--train-data-path', '304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document', '--valid-data-path', '304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document', '--test-data-path', '304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document', '--lr', '2e-5', '--min-lr', '1e-6', '--lr-decay-style', 'cosine', '--lr-warmup-iters', '500', '--lr-decay-iters', '20000', '--weight-decay', '0.1', '--grad-clip-norm', '1.0', '--optimizer', 'adam', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-6', '--save-interval', '10', '--eval-interval', '10', '--eval-iters', '10', '--bf16', '--mixed-precision', '--base-model', '/share/pretrained_lm/Qwen/Qwen2-0.5B', '--save', '/work/llm_recipes/models/yans-qwen2-0.5B', '--load', '/work/llm_recipes/models/yans-qwen2-0.5B', '--fsdp-activation-checkpointing', '--sharding-strategy', 'FULL_SHARD', '--checkpoint-type', 'LOCAL_STATE_DICT', '--save-n-checkpoints', '10', '--hf-upload-retry-limit', '2', '--hf-repo-id', 'koichi12/yans-qwen2-0.5B', '--wandb-entity', 'iwakawa-koichi-q5-tohoku-nlp6723', '--wandb-project', 'llm_tutorial', '--wandb-name', 'yans-qwen2-0.5B_train_2024-08-15-03:11:59'), 'state': 'running', 'program': '/project/examples/finetuning.py', 'codePathLocal': 'examples/finetuning.py', 'codePath': 'examples/finetuning.py', 'git': {'remote': 'https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git', 'commit': '6da01327e78c302bc0cfdb335f3ca297e2a19c8c'}, 'email': None, 'root': '/project', 'host': 'gpu-koiwa-00', 'username': 'koiwa', 'executable': '/usr/bin/python', 'cpu_count': 18, 'cpu_count_logical': 18, 'cpu_freq': {'current': 2400.0389999999993, 'min': 0.0, 'max': 0.0}, 'cpu_freq_per_core': [{'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}], 'disk': {'/': {'total': 0.0625, 'used': 1.1444091796875e-05}}, 'gpu': 'NVIDIA A100-SXM4-40GB', 'gpu_count': 1, 'gpu_devices': [{'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}], 'memory': {'total': 56.48782730102539}}
|
26 |
+
2024-08-15 03:12:16,995 INFO HandlerThread:10026 [system_monitor.py:probe():224] Finished collecting system info
|
27 |
+
2024-08-15 03:12:16,995 INFO HandlerThread:10026 [system_monitor.py:probe():227] Publishing system info
|
28 |
+
2024-08-15 03:12:16,997 INFO HandlerThread:10026 [system_monitor.py:probe():229] Finished publishing system info
|
29 |
+
2024-08-15 03:12:17,023 DEBUG SenderThread:10026 [sender.py:send():382] send: files
|
30 |
+
2024-08-15 03:12:17,024 INFO SenderThread:10026 [sender.py:_save_file():1403] saving file wandb-metadata.json with policy now
|
31 |
+
2024-08-15 03:12:17,033 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: python_packages
|
32 |
+
2024-08-15 03:12:17,034 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: internal_messages
|
33 |
+
2024-08-15 03:12:17,034 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: python_packages
|
34 |
+
2024-08-15 03:12:17,034 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: stop_status
|
35 |
+
2024-08-15 03:12:17,036 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: stop_status
|
36 |
+
2024-08-15 03:12:17,320 DEBUG SenderThread:10026 [sender.py:send():382] send: telemetry
|
37 |
+
2024-08-15 03:12:17,786 INFO wandb-upload_0:10026 [upload_job.py:push():131] Uploaded file /tmp/tmp2lpzau9swandb/2fbn8bzg-wandb-metadata.json
|
38 |
+
2024-08-15 03:12:17,889 INFO Thread-12 :10026 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240815_031216-0szn78ph/files/output.log
|
39 |
+
2024-08-15 03:12:17,889 INFO Thread-12 :10026 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240815_031216-0szn78ph/files/wandb-metadata.json
|
40 |
+
2024-08-15 03:12:17,889 INFO Thread-12 :10026 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240815_031216-0szn78ph/files/requirements.txt
|
41 |
+
2024-08-15 03:12:19,889 INFO Thread-12 :10026 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240815_031216-0szn78ph/files/output.log
|
42 |
+
2024-08-15 03:12:21,867 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
|
43 |
+
2024-08-15 03:12:21,890 INFO Thread-12 :10026 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240815_031216-0szn78ph/files/output.log
|
44 |
+
2024-08-15 03:12:22,891 INFO Thread-12 :10026 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240815_031216-0szn78ph/files/output.log
|
45 |
+
2024-08-15 03:12:26,867 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
|
46 |
+
2024-08-15 03:12:31,868 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
|
47 |
+
2024-08-15 03:12:32,032 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: stop_status
|
48 |
+
2024-08-15 03:12:32,033 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: stop_status
|
49 |
+
2024-08-15 03:12:32,033 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: internal_messages
|
50 |
+
2024-08-15 03:12:37,282 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
|
51 |
+
2024-08-15 03:12:37,900 INFO Thread-12 :10026 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240815_031216-0szn78ph/files/output.log
|
52 |
+
2024-08-15 03:12:38,901 INFO Thread-12 :10026 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240815_031216-0szn78ph/files/output.log
|
53 |
+
2024-08-15 03:12:39,901 INFO Thread-12 :10026 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240815_031216-0szn78ph/files/output.log
|
54 |
+
2024-08-15 03:12:40,902 INFO Thread-12 :10026 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240815_031216-0szn78ph/files/output.log
|
55 |
+
2024-08-15 03:12:42,647 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
|
56 |
+
2024-08-15 03:12:43,260 DEBUG SenderThread:10026 [sender.py:send():382] send: config
|
57 |
+
2024-08-15 03:12:43,261 DEBUG SenderThread:10026 [sender.py:send():382] send: config
|
58 |
+
2024-08-15 03:12:43,904 INFO Thread-12 :10026 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240815_031216-0szn78ph/files/output.log
|
59 |
+
2024-08-15 03:12:44,904 INFO Thread-12 :10026 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240815_031216-0szn78ph/files/output.log
|
60 |
+
2024-08-15 03:12:47,034 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: internal_messages
|
61 |
+
2024-08-15 03:12:47,034 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: stop_status
|
62 |
+
2024-08-15 03:12:47,035 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: stop_status
|
63 |
+
2024-08-15 03:12:48,218 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
|
64 |
+
2024-08-15 03:12:48,907 INFO Thread-12 :10026 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240815_031216-0szn78ph/files/config.yaml
|
65 |
+
2024-08-15 03:12:53,411 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
|
66 |
+
2024-08-15 03:12:58,411 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
|
67 |
+
2024-08-15 03:13:02,035 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: internal_messages
|
68 |
+
2024-08-15 03:13:02,035 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: stop_status
|
69 |
+
2024-08-15 03:13:02,035 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: stop_status
|
70 |
+
2024-08-15 03:13:04,284 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
|
71 |
+
2024-08-15 03:13:09,285 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
|
72 |
+
2024-08-15 03:13:14,285 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
|
73 |
+
2024-08-15 03:13:16,973 DEBUG SystemMonitor:10026 [system_monitor.py:_start():172] Starting system metrics aggregation loop
|
74 |
+
2024-08-15 03:13:16,974 DEBUG SenderThread:10026 [sender.py:send():382] send: stats
|
75 |
+
2024-08-15 03:13:17,034 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: stop_status
|
76 |
+
2024-08-15 03:13:17,034 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: stop_status
|
77 |
+
2024-08-15 03:13:17,078 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: internal_messages
|
78 |
+
2024-08-15 03:13:19,286 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
|
79 |
+
2024-08-15 03:13:24,287 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
|
80 |
+
2024-08-15 03:13:29,288 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
|
81 |
+
2024-08-15 03:13:32,034 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: stop_status
|
82 |
+
2024-08-15 03:13:32,034 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: stop_status
|
83 |
+
2024-08-15 03:13:32,078 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: internal_messages
|
84 |
+
2024-08-15 03:13:35,214 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
|
85 |
+
2024-08-15 03:13:40,215 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
|
86 |
+
2024-08-15 03:13:45,216 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
|
87 |
+
2024-08-15 03:13:46,975 DEBUG SenderThread:10026 [sender.py:send():382] send: stats
|
88 |
+
2024-08-15 03:13:47,034 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: internal_messages
|
89 |
+
2024-08-15 03:13:47,034 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: stop_status
|
90 |
+
2024-08-15 03:13:47,035 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: stop_status
|
91 |
+
2024-08-15 03:13:50,291 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
|
92 |
+
2024-08-15 03:13:55,292 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
|
93 |
+
2024-08-15 03:14:00,292 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
|
94 |
+
2024-08-15 03:14:01,824 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: partial_history
|
95 |
+
2024-08-15 03:14:01,949 INFO Thread-12 :10026 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240815_031216-0szn78ph/files/output.log
|
96 |
+
2024-08-15 03:14:02,034 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: internal_messages
|
97 |
+
2024-08-15 03:14:02,034 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: stop_status
|
98 |
+
2024-08-15 03:14:02,035 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: stop_status
|
99 |
+
2024-08-15 03:14:06,235 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
|
100 |
+
2024-08-15 03:14:11,236 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
|
101 |
+
2024-08-15 03:14:16,236 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
|
102 |
+
2024-08-15 03:14:16,976 DEBUG SenderThread:10026 [sender.py:send():382] send: stats
|
103 |
+
2024-08-15 03:14:17,034 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: internal_messages
|
104 |
+
2024-08-15 03:14:17,034 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: stop_status
|
105 |
+
2024-08-15 03:14:17,035 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: stop_status
|
106 |
+
2024-08-15 03:14:21,267 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
|
107 |
+
2024-08-15 03:14:26,267 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
|
108 |
+
2024-08-15 03:14:31,268 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
|
109 |
+
2024-08-15 03:14:32,034 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: internal_messages
|
110 |
+
2024-08-15 03:14:32,035 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: stop_status
|
111 |
+
2024-08-15 03:14:32,035 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: stop_status
|
112 |
+
2024-08-15 03:14:37,220 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
|
113 |
+
2024-08-15 03:14:42,221 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
|
114 |
+
2024-08-15 03:14:46,977 DEBUG SenderThread:10026 [sender.py:send():382] send: stats
|
115 |
+
2024-08-15 03:14:47,034 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: internal_messages
|
116 |
+
2024-08-15 03:14:47,035 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: stop_status
|
117 |
+
2024-08-15 03:14:47,035 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: stop_status
|
118 |
+
2024-08-15 03:14:48,220 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
|
119 |
+
2024-08-15 03:14:53,220 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
|
120 |
+
2024-08-15 03:14:58,221 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
|
121 |
+
2024-08-15 03:15:02,034 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: internal_messages
|
122 |
+
2024-08-15 03:15:02,035 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: stop_status
|
123 |
+
2024-08-15 03:15:02,035 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: stop_status
|
124 |
+
2024-08-15 03:15:03,261 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
|
125 |
+
2024-08-15 03:15:05,634 DEBUG SenderThread:10026 [sender.py:send():382] send: exit
|
126 |
+
2024-08-15 03:15:05,634 INFO SenderThread:10026 [sender.py:send_exit():589] handling exit code: 255
|
127 |
+
2024-08-15 03:15:05,634 INFO SenderThread:10026 [sender.py:send_exit():591] handling runtime: 168
|
128 |
+
2024-08-15 03:15:05,636 INFO SenderThread:10026 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
129 |
+
2024-08-15 03:15:05,636 INFO SenderThread:10026 [sender.py:send_exit():597] send defer
|
130 |
+
2024-08-15 03:15:05,636 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: defer
|
131 |
+
2024-08-15 03:15:05,636 INFO HandlerThread:10026 [handler.py:handle_request_defer():172] handle defer: 0
|
132 |
+
2024-08-15 03:15:05,636 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: defer
|
133 |
+
2024-08-15 03:15:05,636 INFO SenderThread:10026 [sender.py:send_request_defer():613] handle sender defer: 0
|
134 |
+
2024-08-15 03:15:05,636 INFO SenderThread:10026 [sender.py:transition_state():617] send defer: 1
|
135 |
+
2024-08-15 03:15:05,636 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: defer
|
136 |
+
2024-08-15 03:15:05,637 INFO HandlerThread:10026 [handler.py:handle_request_defer():172] handle defer: 1
|
137 |
+
2024-08-15 03:15:05,637 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: defer
|
138 |
+
2024-08-15 03:15:05,637 INFO SenderThread:10026 [sender.py:send_request_defer():613] handle sender defer: 1
|
139 |
+
2024-08-15 03:15:05,637 INFO SenderThread:10026 [sender.py:transition_state():617] send defer: 2
|
140 |
+
2024-08-15 03:15:05,637 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: defer
|
141 |
+
2024-08-15 03:15:05,637 INFO HandlerThread:10026 [handler.py:handle_request_defer():172] handle defer: 2
|
142 |
+
2024-08-15 03:15:05,637 INFO HandlerThread:10026 [system_monitor.py:finish():203] Stopping system monitor
|
143 |
+
2024-08-15 03:15:05,637 DEBUG SystemMonitor:10026 [system_monitor.py:_start():179] Finished system metrics aggregation loop
|
144 |
+
2024-08-15 03:15:05,637 DEBUG SystemMonitor:10026 [system_monitor.py:_start():183] Publishing last batch of metrics
|
145 |
+
2024-08-15 03:15:05,637 INFO HandlerThread:10026 [interfaces.py:finish():202] Joined cpu monitor
|
146 |
+
2024-08-15 03:15:05,639 INFO HandlerThread:10026 [interfaces.py:finish():202] Joined disk monitor
|
147 |
+
2024-08-15 03:15:05,671 INFO HandlerThread:10026 [interfaces.py:finish():202] Joined gpu monitor
|
148 |
+
2024-08-15 03:15:05,672 INFO HandlerThread:10026 [interfaces.py:finish():202] Joined memory monitor
|
149 |
+
2024-08-15 03:15:05,672 INFO HandlerThread:10026 [interfaces.py:finish():202] Joined network monitor
|
150 |
+
2024-08-15 03:15:05,672 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: defer
|
151 |
+
2024-08-15 03:15:05,672 INFO SenderThread:10026 [sender.py:send_request_defer():613] handle sender defer: 2
|
152 |
+
2024-08-15 03:15:05,672 INFO SenderThread:10026 [sender.py:transition_state():617] send defer: 3
|
153 |
+
2024-08-15 03:15:05,672 DEBUG SenderThread:10026 [sender.py:send():382] send: stats
|
154 |
+
2024-08-15 03:15:05,673 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: defer
|
155 |
+
2024-08-15 03:15:05,673 INFO HandlerThread:10026 [handler.py:handle_request_defer():172] handle defer: 3
|
156 |
+
2024-08-15 03:15:05,676 DEBUG SenderThread:10026 [sender.py:send():382] send: history
|
157 |
+
2024-08-15 03:15:05,676 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: summary_record
|
158 |
+
2024-08-15 03:15:05,677 INFO SenderThread:10026 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
159 |
+
2024-08-15 03:15:05,677 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: defer
|
160 |
+
2024-08-15 03:15:05,677 INFO SenderThread:10026 [sender.py:send_request_defer():613] handle sender defer: 3
|
161 |
+
2024-08-15 03:15:05,677 INFO SenderThread:10026 [sender.py:transition_state():617] send defer: 4
|
162 |
+
2024-08-15 03:15:05,677 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: defer
|
163 |
+
2024-08-15 03:15:05,677 INFO HandlerThread:10026 [handler.py:handle_request_defer():172] handle defer: 4
|
164 |
+
2024-08-15 03:15:05,677 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: defer
|
165 |
+
2024-08-15 03:15:05,677 INFO SenderThread:10026 [sender.py:send_request_defer():613] handle sender defer: 4
|
166 |
+
2024-08-15 03:15:05,677 INFO SenderThread:10026 [sender.py:transition_state():617] send defer: 5
|
167 |
+
2024-08-15 03:15:05,677 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: defer
|
168 |
+
2024-08-15 03:15:05,677 INFO HandlerThread:10026 [handler.py:handle_request_defer():172] handle defer: 5
|
169 |
+
2024-08-15 03:15:05,678 DEBUG SenderThread:10026 [sender.py:send():382] send: summary
|
170 |
+
2024-08-15 03:15:05,679 INFO SenderThread:10026 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
171 |
+
2024-08-15 03:15:05,679 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: defer
|
172 |
+
2024-08-15 03:15:05,679 INFO SenderThread:10026 [sender.py:send_request_defer():613] handle sender defer: 5
|
173 |
+
2024-08-15 03:15:05,679 INFO SenderThread:10026 [sender.py:transition_state():617] send defer: 6
|
174 |
+
2024-08-15 03:15:05,679 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: defer
|
175 |
+
2024-08-15 03:15:05,679 INFO HandlerThread:10026 [handler.py:handle_request_defer():172] handle defer: 6
|
176 |
+
2024-08-15 03:15:05,680 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: defer
|
177 |
+
2024-08-15 03:15:05,680 INFO SenderThread:10026 [sender.py:send_request_defer():613] handle sender defer: 6
|
178 |
+
2024-08-15 03:15:05,680 INFO SenderThread:10026 [sender.py:transition_state():617] send defer: 7
|
179 |
+
2024-08-15 03:15:05,680 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
|
180 |
+
2024-08-15 03:15:05,680 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: defer
|
181 |
+
2024-08-15 03:15:05,680 INFO HandlerThread:10026 [handler.py:handle_request_defer():172] handle defer: 7
|
182 |
+
2024-08-15 03:15:05,680 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: defer
|
183 |
+
2024-08-15 03:15:05,680 INFO SenderThread:10026 [sender.py:send_request_defer():613] handle sender defer: 7
|
184 |
+
2024-08-15 03:15:05,984 INFO Thread-12 :10026 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240815_031216-0szn78ph/files/wandb-summary.json
|
185 |
+
2024-08-15 03:15:06,481 INFO SenderThread:10026 [sender.py:transition_state():617] send defer: 8
|
186 |
+
2024-08-15 03:15:06,481 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: defer
|
187 |
+
2024-08-15 03:15:06,481 INFO HandlerThread:10026 [handler.py:handle_request_defer():172] handle defer: 8
|
188 |
+
2024-08-15 03:15:06,481 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: defer
|
189 |
+
2024-08-15 03:15:06,481 INFO SenderThread:10026 [sender.py:send_request_defer():613] handle sender defer: 8
|
190 |
+
2024-08-15 03:15:06,481 INFO SenderThread:10026 [job_builder.py:build():296] Attempting to build job artifact
|
191 |
+
2024-08-15 03:15:06,482 INFO SenderThread:10026 [job_builder.py:_get_source_type():426] is repo sourced job
|
192 |
+
2024-08-15 03:15:06,507 INFO SenderThread:10026 [job_builder.py:build():402] adding wandb-job metadata file
|
193 |
+
2024-08-15 03:15:06,516 INFO SenderThread:10026 [sender.py:transition_state():617] send defer: 9
|
194 |
+
2024-08-15 03:15:06,517 DEBUG SenderThread:10026 [sender.py:send():382] send: artifact
|
195 |
+
2024-08-15 03:15:06,517 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: defer
|
196 |
+
2024-08-15 03:15:06,518 INFO HandlerThread:10026 [handler.py:handle_request_defer():172] handle defer: 9
|
197 |
+
2024-08-15 03:15:06,633 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: poll_exit
|
198 |
+
2024-08-15 03:15:06,985 INFO Thread-12 :10026 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240815_031216-0szn78ph/files/output.log
|
199 |
+
2024-08-15 03:15:08,040 INFO wandb-upload_0:10026 [upload_job.py:push():89] Uploaded file /singularity_home/.local/share/wandb/artifacts/staging/tmpb932s___
|
200 |
+
2024-08-15 03:15:08,047 INFO wandb-upload_1:10026 [upload_job.py:push():89] Uploaded file /singularity_home/.local/share/wandb/artifacts/staging/tmpl85vnluw
|
201 |
+
2024-08-15 03:15:09,160 INFO SenderThread:10026 [sender.py:send_artifact():1494] sent artifact job-https___github.com_cl-tohoku_llm-recipes-failab-m1-yans.git_examples_finetuning.py - {'id': 'QXJ0aWZhY3Q6MTE1MDEyMDEwMQ==', 'state': 'PENDING', 'artifactSequence': {'id': 'QXJ0aWZhY3RDb2xsZWN0aW9uOjM2MjY3MjMzNA==', 'latestArtifact': {'id': 'QXJ0aWZhY3Q6MTE0MDA5NDY1MQ==', 'versionIndex': 9}}}
|
202 |
+
2024-08-15 03:15:09,160 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: defer
|
203 |
+
2024-08-15 03:15:09,160 INFO SenderThread:10026 [sender.py:send_request_defer():613] handle sender defer: 9
|
204 |
+
2024-08-15 03:15:09,160 INFO SenderThread:10026 [dir_watcher.py:finish():358] shutting down directory watcher
|
205 |
+
2024-08-15 03:15:09,986 INFO SenderThread:10026 [dir_watcher.py:finish():388] scan: /project/wandb/run-20240815_031216-0szn78ph/files
|
206 |
+
2024-08-15 03:15:09,987 INFO SenderThread:10026 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240815_031216-0szn78ph/files/requirements.txt requirements.txt
|
207 |
+
2024-08-15 03:15:09,987 INFO SenderThread:10026 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240815_031216-0szn78ph/files/config.yaml config.yaml
|
208 |
+
2024-08-15 03:15:09,988 INFO SenderThread:10026 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240815_031216-0szn78ph/files/wandb-metadata.json wandb-metadata.json
|
209 |
+
2024-08-15 03:15:09,989 INFO SenderThread:10026 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240815_031216-0szn78ph/files/wandb-summary.json wandb-summary.json
|
210 |
+
2024-08-15 03:15:09,990 INFO SenderThread:10026 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240815_031216-0szn78ph/files/output.log output.log
|
211 |
+
2024-08-15 03:15:09,992 INFO SenderThread:10026 [sender.py:transition_state():617] send defer: 10
|
212 |
+
2024-08-15 03:15:09,992 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: poll_exit
|
213 |
+
2024-08-15 03:15:09,992 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: defer
|
214 |
+
2024-08-15 03:15:09,993 INFO HandlerThread:10026 [handler.py:handle_request_defer():172] handle defer: 10
|
215 |
+
2024-08-15 03:15:09,994 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: defer
|
216 |
+
2024-08-15 03:15:09,994 INFO SenderThread:10026 [sender.py:send_request_defer():613] handle sender defer: 10
|
217 |
+
2024-08-15 03:15:09,994 INFO SenderThread:10026 [file_pusher.py:finish():172] shutting down file pusher
|
218 |
+
2024-08-15 03:15:10,399 INFO wandb-upload_1:10026 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240815_031216-0szn78ph/files/config.yaml
|
219 |
+
2024-08-15 03:15:10,439 INFO wandb-upload_0:10026 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240815_031216-0szn78ph/files/requirements.txt
|
220 |
+
2024-08-15 03:15:10,453 INFO wandb-upload_2:10026 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240815_031216-0szn78ph/files/wandb-summary.json
|
221 |
+
2024-08-15 03:15:10,537 INFO wandb-upload_3:10026 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240815_031216-0szn78ph/files/output.log
|
222 |
+
2024-08-15 03:15:10,635 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: poll_exit
|
223 |
+
2024-08-15 03:15:10,635 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: poll_exit
|
224 |
+
2024-08-15 03:15:10,737 INFO Thread-11 (_thread_body):10026 [sender.py:transition_state():617] send defer: 11
|
225 |
+
2024-08-15 03:15:10,738 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: defer
|
226 |
+
2024-08-15 03:15:10,738 INFO HandlerThread:10026 [handler.py:handle_request_defer():172] handle defer: 11
|
227 |
+
2024-08-15 03:15:10,738 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: defer
|
228 |
+
2024-08-15 03:15:10,738 INFO SenderThread:10026 [sender.py:send_request_defer():613] handle sender defer: 11
|
229 |
+
2024-08-15 03:15:10,738 INFO SenderThread:10026 [file_pusher.py:join():178] waiting for file pusher
|
230 |
+
2024-08-15 03:15:10,738 INFO SenderThread:10026 [sender.py:transition_state():617] send defer: 12
|
231 |
+
2024-08-15 03:15:10,738 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: defer
|
232 |
+
2024-08-15 03:15:10,738 INFO HandlerThread:10026 [handler.py:handle_request_defer():172] handle defer: 12
|
233 |
+
2024-08-15 03:15:10,738 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: defer
|
234 |
+
2024-08-15 03:15:10,738 INFO SenderThread:10026 [sender.py:send_request_defer():613] handle sender defer: 12
|
235 |
+
2024-08-15 03:15:10,738 INFO SenderThread:10026 [file_stream.py:finish():595] file stream finish called
|
236 |
+
2024-08-15 03:15:11,367 INFO SenderThread:10026 [file_stream.py:finish():599] file stream finish is done
|
237 |
+
2024-08-15 03:15:11,368 INFO SenderThread:10026 [sender.py:transition_state():617] send defer: 13
|
238 |
+
2024-08-15 03:15:11,368 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: defer
|
239 |
+
2024-08-15 03:15:11,368 INFO HandlerThread:10026 [handler.py:handle_request_defer():172] handle defer: 13
|
240 |
+
2024-08-15 03:15:11,368 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: defer
|
241 |
+
2024-08-15 03:15:11,368 INFO SenderThread:10026 [sender.py:send_request_defer():613] handle sender defer: 13
|
242 |
+
2024-08-15 03:15:11,368 INFO SenderThread:10026 [sender.py:transition_state():617] send defer: 14
|
243 |
+
2024-08-15 03:15:11,369 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: defer
|
244 |
+
2024-08-15 03:15:11,369 DEBUG SenderThread:10026 [sender.py:send():382] send: final
|
245 |
+
2024-08-15 03:15:11,369 INFO HandlerThread:10026 [handler.py:handle_request_defer():172] handle defer: 14
|
246 |
+
2024-08-15 03:15:11,369 DEBUG SenderThread:10026 [sender.py:send():382] send: footer
|
247 |
+
2024-08-15 03:15:11,369 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: defer
|
248 |
+
2024-08-15 03:15:11,369 INFO SenderThread:10026 [sender.py:send_request_defer():613] handle sender defer: 14
|
249 |
+
2024-08-15 03:15:14,370 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
|
250 |
+
2024-08-15 03:15:19,370 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
|
251 |
+
2024-08-15 03:15:24,371 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
|
252 |
+
2024-08-15 03:15:29,371 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
|
253 |
+
2024-08-15 03:15:34,372 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
|
254 |
+
2024-08-15 03:15:37,452 WARNING StreamThr :10026 [internal.py:is_dead():414] Internal process exiting, parent pid 9957 disappeared
|
255 |
+
2024-08-15 03:15:37,452 ERROR StreamThr :10026 [internal.py:wandb_internal():152] Internal process shutdown.
|
256 |
+
2024-08-15 03:15:38,372 INFO SenderThread:10026 [sender.py:finish():1572] shutting down sender
|
257 |
+
2024-08-15 03:15:38,372 INFO SenderThread:10026 [file_pusher.py:finish():172] shutting down file pusher
|
258 |
+
2024-08-15 03:15:38,372 INFO SenderThread:10026 [file_pusher.py:join():178] waiting for file pusher
|
259 |
+
2024-08-15 03:15:38,372 INFO WriterThread:10026 [datastore.py:close():296] close: /project/wandb/run-20240815_031216-0szn78ph/run-0szn78ph.wandb
|
260 |
+
2024-08-15 03:15:38,373 INFO HandlerThread:10026 [handler.py:finish():869] shutting down handler
|
wandb/run-20240815_031216-0szn78ph/logs/debug.log
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
2024-08-15 03:12:16,236 INFO MainThread:9957 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
|
2 |
+
2024-08-15 03:12:16,236 INFO MainThread:9957 [wandb_setup.py:_flush():76] Configure stats pid to 9957
|
3 |
+
2024-08-15 03:12:16,236 INFO MainThread:9957 [wandb_setup.py:_flush():76] Loading settings from /singularity_home/.config/wandb/settings
|
4 |
+
2024-08-15 03:12:16,236 INFO MainThread:9957 [wandb_setup.py:_flush():76] Loading settings from /project/wandb/settings
|
5 |
+
2024-08-15 03:12:16,236 INFO MainThread:9957 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'api_key': '***REDACTED***', 'run_notes': 'Train Qwen2'}
|
6 |
+
2024-08-15 03:12:16,236 INFO MainThread:9957 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
|
7 |
+
2024-08-15 03:12:16,236 INFO MainThread:9957 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'examples/finetuning.py', 'program_abspath': '/project/examples/finetuning.py', 'program': '/project/examples/finetuning.py'}
|
8 |
+
2024-08-15 03:12:16,236 INFO MainThread:9957 [wandb_init.py:_log_setup():526] Logging user logs to /project/wandb/run-20240815_031216-0szn78ph/logs/debug.log
|
9 |
+
2024-08-15 03:12:16,236 INFO MainThread:9957 [wandb_init.py:_log_setup():527] Logging internal logs to /project/wandb/run-20240815_031216-0szn78ph/logs/debug-internal.log
|
10 |
+
2024-08-15 03:12:16,236 INFO MainThread:9957 [wandb_init.py:init():566] calling init triggers
|
11 |
+
2024-08-15 03:12:16,237 INFO MainThread:9957 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
|
12 |
+
config: {'sharding_strategy': 'FULL_SHARD', 'checkpoint_type': 'LOCAL_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'low_cpu_fsdp': False, 'no_meta_device': False, 'data_path': None, 'split': '969, 30, 1', 'train_data_path': ['304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document'], 'valid_data_path': ['304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document'], 'test_data_path': ['304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document'], 'data_cache_path': None, 'vocab_size': None, 'vocab_file': None, 'merge_file': None, 'seq_length': 4096, 'num_workers': 2, 'tokenizer_type': 'HFPreTrainedTokenizer', 'tokenizer_model': '/share/pretrained_lm/Qwen/Qwen2-0.5B', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'retro_return_doc_ids': False, 'short_seq_prob': 0.1, 'vocab_extra_ids': 0, 'seed': 1234, 'use_mpi': False, 'wandb_entity': 'iwakawa-koichi-q5-tohoku-nlp6723', 'wandb_name': 'yans-qwen2-0.5B_train_2024-08-15-03:11:59', 'wandb_project': 'llm_tutorial', 'quantization': False, 'use_freeze_layers': False, 'freeze_layers': None, 'bf16': True, 'fp16': False, 'mixed_precision': True, 'param_dtype': None, 'load': '/work/llm_recipes/models/yans-qwen2-0.5B', 'save': '/work/llm_recipes/models/yans-qwen2-0.5B', 'base_model': '/share/pretrained_lm/Qwen/Qwen2-0.5B', 'use_better_transformer': False, 'grad_clip_norm': 1.0, 'eval_interval': 10, 'save_interval': 10, 'eval_iters': 10, 'optimizer': 'adam', 'lr': 2e-05, 'lr_decay_style': 'cosine', 'lr_decay_iters': 20000, 'lr_warmup_iters': 500, 'min_lr': 1e-06, 'train_iters': 20000, 'train_samples': None, 'global_batch_size': 320, 'micro_batch_size': 1, 'make_vocab_size_divisible_by': 128, 'sliding_window_size': 4096, 'skip_batch': None, 'no_save_optimizer_state': False, 'continual_pretraining': False, 'instruction_tuning': False, 'direct_preference_optimization': False, 'attention_dropout': 0.1, 'hidden_dropout': 0.1, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-06, 'hf_transformer_model_dir': None, 'instruction_train_data_path': None, 'instruction_valid_data_path': None, 'epoch': None, 'instruction_dataset_size': None, 'save_sampler_state': False, 'label_smoothing': 0.0, 'save_n_checkpoints': 10, 'hf_repo_id': 'koichi12/yans-qwen2-0.5B', 'create_public_hf_repo': False, 'upload_all_checkpoints_to_hf': False, 'hf_upload_retry_limit': 2, 'exit_duration_in_mins': None, 'source_key': None, 'target_key': None, 'attn_implementation': 'flash_attention_2', 'efficient_instruction_tuning': False, 'remove_padding_masking': False, 'save_start_iter': None, 'rank': 0, 'world_size': 1, 'padded_vocab_size': 151680, 'gradient_accumulation_steps': 320}
|
13 |
+
2024-08-15 03:12:16,237 INFO MainThread:9957 [wandb_init.py:init():616] starting backend
|
14 |
+
2024-08-15 03:12:16,237 INFO MainThread:9957 [wandb_init.py:init():620] setting up manager
|
15 |
+
2024-08-15 03:12:16,241 INFO MainThread:9957 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
|
16 |
+
2024-08-15 03:12:16,243 INFO MainThread:9957 [wandb_init.py:init():628] backend started and connected
|
17 |
+
2024-08-15 03:12:16,248 INFO MainThread:9957 [wandb_init.py:init():720] updated telemetry
|
18 |
+
2024-08-15 03:12:16,405 INFO MainThread:9957 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
|
19 |
+
2024-08-15 03:12:16,892 INFO MainThread:9957 [wandb_run.py:_on_init():2262] communicating current version
|
20 |
+
2024-08-15 03:12:16,915 INFO MainThread:9957 [wandb_run.py:_on_init():2271] got version response upgrade_message: "wandb version 0.17.6 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
|
21 |
+
|
22 |
+
2024-08-15 03:12:16,915 INFO MainThread:9957 [wandb_init.py:init():804] starting run threads in backend
|
23 |
+
2024-08-15 03:12:17,032 INFO MainThread:9957 [wandb_run.py:_console_start():2241] atexit reg
|
24 |
+
2024-08-15 03:12:17,033 INFO MainThread:9957 [wandb_run.py:_redirect():2096] redirect: wrap_raw
|
25 |
+
2024-08-15 03:12:17,033 INFO MainThread:9957 [wandb_run.py:_redirect():2161] Wrapping output streams.
|
26 |
+
2024-08-15 03:12:17,033 INFO MainThread:9957 [wandb_run.py:_redirect():2186] Redirects installed.
|
27 |
+
2024-08-15 03:12:17,034 INFO MainThread:9957 [wandb_init.py:init():847] run started, returning control to user process
|
28 |
+
2024-08-15 03:12:43,259 INFO MainThread:9957 [wandb_run.py:_config_callback():1343] config_cb None None {'model_architecture': 'Qwen2ForCausalLM', 'activation_function': 'silu', 'hidden_size': 896, 'model_type': 'qwen2', 'max_position_embeddings': 4096, 'num_attention_heads': 14, 'num_hidden_layers': 24}
|
29 |
+
2024-08-15 03:12:43,260 INFO MainThread:9957 [wandb_run.py:_config_callback():1343] config_cb None None {'world_size': 1}
|
wandb/run-20240815_031216-0szn78ph/run-0szn78ph.wandb
ADDED
Binary file (21.9 kB). View file
|
|
wandb/run-20240823_162543-eroprw00/files/config.yaml
ADDED
@@ -0,0 +1,342 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
wandb_version: 1
|
2 |
+
|
3 |
+
sharding_strategy:
|
4 |
+
desc: null
|
5 |
+
value: FULL_SHARD
|
6 |
+
checkpoint_type:
|
7 |
+
desc: null
|
8 |
+
value: LOCAL_STATE_DICT
|
9 |
+
fsdp_activation_checkpointing:
|
10 |
+
desc: null
|
11 |
+
value: true
|
12 |
+
fsdp_cpu_offload:
|
13 |
+
desc: null
|
14 |
+
value: false
|
15 |
+
low_cpu_fsdp:
|
16 |
+
desc: null
|
17 |
+
value: false
|
18 |
+
no_meta_device:
|
19 |
+
desc: null
|
20 |
+
value: false
|
21 |
+
data_path:
|
22 |
+
desc: null
|
23 |
+
value: null
|
24 |
+
split:
|
25 |
+
desc: null
|
26 |
+
value: 969, 30, 1
|
27 |
+
train_data_path:
|
28 |
+
desc: null
|
29 |
+
value:
|
30 |
+
- '1754785366'
|
31 |
+
- /project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document
|
32 |
+
- '28623823675'
|
33 |
+
- /project/datas/llm-jp-corpus-v2/ja-cc/level0/data_text_document
|
34 |
+
valid_data_path:
|
35 |
+
desc: null
|
36 |
+
value:
|
37 |
+
- '1754785366'
|
38 |
+
- /project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document
|
39 |
+
test_data_path:
|
40 |
+
desc: null
|
41 |
+
value:
|
42 |
+
- '1754785366'
|
43 |
+
- /project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document
|
44 |
+
data_cache_path:
|
45 |
+
desc: null
|
46 |
+
value: null
|
47 |
+
vocab_size:
|
48 |
+
desc: null
|
49 |
+
value: null
|
50 |
+
vocab_file:
|
51 |
+
desc: null
|
52 |
+
value: null
|
53 |
+
merge_file:
|
54 |
+
desc: null
|
55 |
+
value: null
|
56 |
+
seq_length:
|
57 |
+
desc: null
|
58 |
+
value: 4096
|
59 |
+
num_workers:
|
60 |
+
desc: null
|
61 |
+
value: 2
|
62 |
+
tokenizer_type:
|
63 |
+
desc: null
|
64 |
+
value: HFPreTrainedTokenizer
|
65 |
+
tokenizer_model:
|
66 |
+
desc: null
|
67 |
+
value: /share/pretrained_lm/Qwen/Qwen2-0.5B
|
68 |
+
reset_position_ids:
|
69 |
+
desc: null
|
70 |
+
value: false
|
71 |
+
reset_attention_mask:
|
72 |
+
desc: null
|
73 |
+
value: false
|
74 |
+
eod_mask_loss:
|
75 |
+
desc: null
|
76 |
+
value: false
|
77 |
+
retro_return_doc_ids:
|
78 |
+
desc: null
|
79 |
+
value: false
|
80 |
+
short_seq_prob:
|
81 |
+
desc: null
|
82 |
+
value: 0.1
|
83 |
+
vocab_extra_ids:
|
84 |
+
desc: null
|
85 |
+
value: 0
|
86 |
+
seed:
|
87 |
+
desc: null
|
88 |
+
value: 1234
|
89 |
+
use_mpi:
|
90 |
+
desc: null
|
91 |
+
value: false
|
92 |
+
wandb_entity:
|
93 |
+
desc: null
|
94 |
+
value: iwakawa-koichi-q5-tohoku-nlp6723
|
95 |
+
wandb_name:
|
96 |
+
desc: null
|
97 |
+
value: Qwen2-0.5b-0.2_train_2024-08-23-16:25:30
|
98 |
+
wandb_project:
|
99 |
+
desc: null
|
100 |
+
value: llm_tutorial-0.2
|
101 |
+
quantization:
|
102 |
+
desc: null
|
103 |
+
value: false
|
104 |
+
use_freeze_layers:
|
105 |
+
desc: null
|
106 |
+
value: false
|
107 |
+
freeze_layers:
|
108 |
+
desc: null
|
109 |
+
value: null
|
110 |
+
bf16:
|
111 |
+
desc: null
|
112 |
+
value: true
|
113 |
+
fp16:
|
114 |
+
desc: null
|
115 |
+
value: false
|
116 |
+
mixed_precision:
|
117 |
+
desc: null
|
118 |
+
value: true
|
119 |
+
param_dtype:
|
120 |
+
desc: null
|
121 |
+
value: null
|
122 |
+
load:
|
123 |
+
desc: null
|
124 |
+
value: /work/llm_recipes/models/Qwen2-0.5b-0.2
|
125 |
+
save:
|
126 |
+
desc: null
|
127 |
+
value: /work/llm_recipes/models/Qwen2-0.5b-0.2
|
128 |
+
base_model:
|
129 |
+
desc: null
|
130 |
+
value: /share/pretrained_lm/Qwen/Qwen2-0.5B
|
131 |
+
use_better_transformer:
|
132 |
+
desc: null
|
133 |
+
value: false
|
134 |
+
grad_clip_norm:
|
135 |
+
desc: null
|
136 |
+
value: 1.0
|
137 |
+
eval_interval:
|
138 |
+
desc: null
|
139 |
+
value: 10
|
140 |
+
save_interval:
|
141 |
+
desc: null
|
142 |
+
value: 10
|
143 |
+
eval_iters:
|
144 |
+
desc: null
|
145 |
+
value: 10
|
146 |
+
optimizer:
|
147 |
+
desc: null
|
148 |
+
value: anyprecision
|
149 |
+
lr:
|
150 |
+
desc: null
|
151 |
+
value: 2.0e-05
|
152 |
+
lr_decay_style:
|
153 |
+
desc: null
|
154 |
+
value: cosine
|
155 |
+
lr_decay_iters:
|
156 |
+
desc: null
|
157 |
+
value: 7500
|
158 |
+
lr_warmup_iters:
|
159 |
+
desc: null
|
160 |
+
value: 500
|
161 |
+
min_lr:
|
162 |
+
desc: null
|
163 |
+
value: 1.0e-06
|
164 |
+
train_iters:
|
165 |
+
desc: null
|
166 |
+
value: 7500
|
167 |
+
train_samples:
|
168 |
+
desc: null
|
169 |
+
value: null
|
170 |
+
global_batch_size:
|
171 |
+
desc: null
|
172 |
+
value: 640
|
173 |
+
micro_batch_size:
|
174 |
+
desc: null
|
175 |
+
value: 5
|
176 |
+
make_vocab_size_divisible_by:
|
177 |
+
desc: null
|
178 |
+
value: 128
|
179 |
+
sliding_window_size:
|
180 |
+
desc: null
|
181 |
+
value: 131072
|
182 |
+
skip_batch:
|
183 |
+
desc: null
|
184 |
+
value: null
|
185 |
+
no_save_optimizer_state:
|
186 |
+
desc: null
|
187 |
+
value: false
|
188 |
+
continual_pretraining:
|
189 |
+
desc: null
|
190 |
+
value: false
|
191 |
+
instruction_tuning:
|
192 |
+
desc: null
|
193 |
+
value: false
|
194 |
+
direct_preference_optimization:
|
195 |
+
desc: null
|
196 |
+
value: false
|
197 |
+
attention_dropout:
|
198 |
+
desc: null
|
199 |
+
value: 0.1
|
200 |
+
hidden_dropout:
|
201 |
+
desc: null
|
202 |
+
value: 0.1
|
203 |
+
weight_decay:
|
204 |
+
desc: null
|
205 |
+
value: 0.1
|
206 |
+
adam_beta1:
|
207 |
+
desc: null
|
208 |
+
value: 0.9
|
209 |
+
adam_beta2:
|
210 |
+
desc: null
|
211 |
+
value: 0.95
|
212 |
+
adam_eps:
|
213 |
+
desc: null
|
214 |
+
value: 1.0e-06
|
215 |
+
hf_transformer_model_dir:
|
216 |
+
desc: null
|
217 |
+
value: null
|
218 |
+
instruction_train_data_path:
|
219 |
+
desc: null
|
220 |
+
value: null
|
221 |
+
instruction_valid_data_path:
|
222 |
+
desc: null
|
223 |
+
value: null
|
224 |
+
epoch:
|
225 |
+
desc: null
|
226 |
+
value: null
|
227 |
+
instruction_dataset_size:
|
228 |
+
desc: null
|
229 |
+
value: null
|
230 |
+
save_sampler_state:
|
231 |
+
desc: null
|
232 |
+
value: false
|
233 |
+
label_smoothing:
|
234 |
+
desc: null
|
235 |
+
value: 0.0
|
236 |
+
save_n_checkpoints:
|
237 |
+
desc: null
|
238 |
+
value: 10
|
239 |
+
hf_repo_id:
|
240 |
+
desc: null
|
241 |
+
value: koichi12/Qwen2-0.5b-0.2
|
242 |
+
create_public_hf_repo:
|
243 |
+
desc: null
|
244 |
+
value: false
|
245 |
+
upload_all_checkpoints_to_hf:
|
246 |
+
desc: null
|
247 |
+
value: true
|
248 |
+
hf_upload_retry_limit:
|
249 |
+
desc: null
|
250 |
+
value: 2
|
251 |
+
exit_duration_in_mins:
|
252 |
+
desc: null
|
253 |
+
value: null
|
254 |
+
source_key:
|
255 |
+
desc: null
|
256 |
+
value: null
|
257 |
+
target_key:
|
258 |
+
desc: null
|
259 |
+
value: null
|
260 |
+
attn_implementation:
|
261 |
+
desc: null
|
262 |
+
value: flash_attention_2
|
263 |
+
efficient_instruction_tuning:
|
264 |
+
desc: null
|
265 |
+
value: false
|
266 |
+
remove_padding_masking:
|
267 |
+
desc: null
|
268 |
+
value: false
|
269 |
+
save_start_iter:
|
270 |
+
desc: null
|
271 |
+
value: null
|
272 |
+
valid_micro_batch_size:
|
273 |
+
desc: null
|
274 |
+
value: 1
|
275 |
+
rank:
|
276 |
+
desc: null
|
277 |
+
value: 0
|
278 |
+
world_size:
|
279 |
+
desc: null
|
280 |
+
value: 1
|
281 |
+
padded_vocab_size:
|
282 |
+
desc: null
|
283 |
+
value: 151680
|
284 |
+
gradient_accumulation_steps:
|
285 |
+
desc: null
|
286 |
+
value: 128
|
287 |
+
_wandb:
|
288 |
+
desc: null
|
289 |
+
value:
|
290 |
+
python_version: 3.10.12
|
291 |
+
cli_version: 0.16.3
|
292 |
+
framework: huggingface
|
293 |
+
huggingface_version: 4.43.3
|
294 |
+
is_jupyter_run: false
|
295 |
+
is_kaggle_kernel: false
|
296 |
+
start_time: 1724397943.202675
|
297 |
+
t:
|
298 |
+
1:
|
299 |
+
- 1
|
300 |
+
- 11
|
301 |
+
- 49
|
302 |
+
- 55
|
303 |
+
- 71
|
304 |
+
- 105
|
305 |
+
2:
|
306 |
+
- 1
|
307 |
+
- 11
|
308 |
+
- 49
|
309 |
+
- 55
|
310 |
+
- 71
|
311 |
+
- 105
|
312 |
+
3:
|
313 |
+
- 13
|
314 |
+
- 16
|
315 |
+
- 23
|
316 |
+
4: 3.10.12
|
317 |
+
5: 0.16.3
|
318 |
+
6: 4.43.3
|
319 |
+
8:
|
320 |
+
- 5
|
321 |
+
13: linux-x86_64
|
322 |
+
model_architecture:
|
323 |
+
desc: null
|
324 |
+
value: Qwen2ForCausalLM
|
325 |
+
activation_function:
|
326 |
+
desc: null
|
327 |
+
value: silu
|
328 |
+
hidden_size:
|
329 |
+
desc: null
|
330 |
+
value: 896
|
331 |
+
model_type:
|
332 |
+
desc: null
|
333 |
+
value: qwen2
|
334 |
+
max_position_embeddings:
|
335 |
+
desc: null
|
336 |
+
value: 4096
|
337 |
+
num_attention_heads:
|
338 |
+
desc: null
|
339 |
+
value: 14
|
340 |
+
num_hidden_layers:
|
341 |
+
desc: null
|
342 |
+
value: 24
|
wandb/run-20240823_162543-eroprw00/files/output.log
ADDED
@@ -0,0 +1,116 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Created Hugging Face repository with ID koichi12/Qwen2-0.5b-0.2.
|
2 |
+
Clearing GPU cache for all ranks
|
3 |
+
--> Running with torch torch_distributed debug set to detail
|
4 |
+
Loading model state dict from /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000040/model.pt
|
5 |
+
Loaded model state dict from /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000040/model.pt
|
6 |
+
--> Model /share/pretrained_lm/Qwen/Qwen2-0.5B
|
7 |
+
--> /share/pretrained_lm/Qwen/Qwen2-0.5B has 494.032768 Million params
|
8 |
+
BFloat16 enabled for mixed precision - using bfSixteen policy
|
9 |
+
You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
|
10 |
+
/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_init_utils.py:441: UserWarning: FSDP is switching to use `NO_SHARD` instead of ShardingStrategy.FULL_SHARD since the world size is 1.
|
11 |
+
warnings.warn(
|
12 |
+
Let split = None
|
13 |
+
--> applying fsdp activation checkpointing...
|
14 |
+
> datasets target sizes (minimum size):
|
15 |
+
train: 4800000
|
16 |
+
validation: 4806400
|
17 |
+
test: 6400
|
18 |
+
> building train, validation, and test datasets for GPT ...
|
19 |
+
Unable to save the indexes because path_to_cache is None
|
20 |
+
> finished creating GPT datasets ...
|
21 |
+
Loading optimizer state dict from /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000040/optimizer.pt
|
22 |
+
Building a BlendedDataset for a single MegatronDataset
|
23 |
+
Unable to save the indexes because path_to_cache is None
|
24 |
+
Building a BlendedDataset for a single MegatronDataset
|
25 |
+
Unable to save the indexes because path_to_cache is None
|
26 |
+
Loaded optimizer state dict from /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000040/optimizer.pt
|
27 |
+
model info: FullyShardedDataParallel(
|
28 |
+
(_fsdp_wrapped_module): Qwen2ForCausalLM(
|
29 |
+
(model): Qwen2Model(
|
30 |
+
(embed_tokens): Embedding(151936, 896)
|
31 |
+
(layers): ModuleList(
|
32 |
+
(0-23): 24 x FullyShardedDataParallel(
|
33 |
+
(_fsdp_wrapped_module): CheckpointWrapper(
|
34 |
+
(_checkpoint_wrapped_module): Qwen2DecoderLayer(
|
35 |
+
(self_attn): Qwen2FlashAttention2(
|
36 |
+
(q_proj): Linear(in_features=896, out_features=896, bias=True)
|
37 |
+
(k_proj): Linear(in_features=896, out_features=128, bias=True)
|
38 |
+
(v_proj): Linear(in_features=896, out_features=128, bias=True)
|
39 |
+
(o_proj): Linear(in_features=896, out_features=896, bias=False)
|
40 |
+
(rotary_emb): Qwen2RotaryEmbedding()
|
41 |
+
)
|
42 |
+
(mlp): Qwen2MLP(
|
43 |
+
(gate_proj): Linear(in_features=896, out_features=4864, bias=False)
|
44 |
+
(up_proj): Linear(in_features=896, out_features=4864, bias=False)
|
45 |
+
(down_proj): Linear(in_features=4864, out_features=896, bias=False)
|
46 |
+
(act_fn): SiLU()
|
47 |
+
)
|
48 |
+
(input_layernorm): Qwen2RMSNorm()
|
49 |
+
(post_attention_layernorm): Qwen2RMSNorm()
|
50 |
+
)
|
51 |
+
)
|
52 |
+
)
|
53 |
+
)
|
54 |
+
(norm): Qwen2RMSNorm()
|
55 |
+
)
|
56 |
+
(lm_head): Linear(in_features=896, out_features=151936, bias=False)
|
57 |
+
)
|
58 |
+
)
|
59 |
+
model config: Qwen2Config {
|
60 |
+
"_name_or_path": "/share/pretrained_lm/Qwen/Qwen2-0.5B",
|
61 |
+
"architectures": [
|
62 |
+
"Qwen2ForCausalLM"
|
63 |
+
],
|
64 |
+
"attention_dropout": 0.0,
|
65 |
+
"bos_token_id": 151643,
|
66 |
+
"eos_token_id": 151643,
|
67 |
+
"hidden_act": "silu",
|
68 |
+
"hidden_size": 896,
|
69 |
+
"initializer_range": 0.02,
|
70 |
+
"intermediate_size": 4864,
|
71 |
+
"label_smoothing": 0.0,
|
72 |
+
"max_position_embeddings": 4096,
|
73 |
+
"max_window_layers": 24,
|
74 |
+
"model_type": "qwen2",
|
75 |
+
"num_attention_heads": 14,
|
76 |
+
"num_hidden_layers": 24,
|
77 |
+
"num_key_value_heads": 2,
|
78 |
+
"rms_norm_eps": 1e-06,
|
79 |
+
"rope_theta": 1000000.0,
|
80 |
+
"sliding_window": 131072,
|
81 |
+
"tie_word_embeddings": true,
|
82 |
+
"torch_dtype": "bfloat16",
|
83 |
+
"transformers_version": "4.43.3",
|
84 |
+
"use_cache": false,
|
85 |
+
"use_sliding_window": false,
|
86 |
+
"vocab_size": 151936
|
87 |
+
}
|
88 |
+
[rank0]:[2024-08-23 16:25:50,866] torch.distributed.fsdp._debug_utils: [WARNING] FSDP _flatten_optim_state_dict() profiling: defaultdict(<class 'float'>, {})
|
89 |
+
Traceback (most recent call last):
|
90 |
+
File "/project/examples/finetuning.py", line 13, in <module>
|
91 |
+
main()
|
92 |
+
File "/project/src/llama_recipes/finetuning.py", line 282, in main
|
93 |
+
train(
|
94 |
+
File "/project/src/llama_recipes/utils/train_utils.py", line 110, in train
|
95 |
+
loss: torch.Tensor = model(**batch).loss
|
96 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
|
97 |
+
return self._call_impl(*args, **kwargs)
|
98 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1520, in _call_impl
|
99 |
+
return forward_call(*args, **kwargs)
|
100 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py", line 849, in forward
|
101 |
+
output = self._fsdp_wrapped_module(*args, **kwargs)
|
102 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
|
103 |
+
return self._call_impl(*args, **kwargs)
|
104 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1520, in _call_impl
|
105 |
+
return forward_call(*args, **kwargs)
|
106 |
+
File "/project/lib/transformers/src/transformers/models/qwen2/modeling_qwen2.py", line 1082, in forward
|
107 |
+
loss = loss_fct(shift_logits, shift_labels)
|
108 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
|
109 |
+
return self._call_impl(*args, **kwargs)
|
110 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1520, in _call_impl
|
111 |
+
return forward_call(*args, **kwargs)
|
112 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/loss.py", line 1179, in forward
|
113 |
+
return F.cross_entropy(input, target, weight=self.weight,
|
114 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/nn/functional.py", line 3086, in cross_entropy
|
115 |
+
return torch._C._nn.cross_entropy_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index, label_smoothing)
|
116 |
+
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 11.59 GiB. GPU 0 has a total capacity of 39.39 GiB of which 11.28 GiB is free. Including non-PyTorch memory, this process has 28.11 GiB memory in use. Of the allocated memory 26.94 GiB is allocated by PyTorch, and 363.22 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
|
wandb/run-20240823_162543-eroprw00/files/requirements.txt
ADDED
@@ -0,0 +1,375 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
absl-py==2.1.0
|
2 |
+
accelerate==0.23.0
|
3 |
+
aiohttp==3.9.1
|
4 |
+
aiosignal==1.3.1
|
5 |
+
annotated-types==0.6.0
|
6 |
+
antlr4-python3-runtime==4.9.3
|
7 |
+
anyio==4.4.0
|
8 |
+
apex==0.1
|
9 |
+
appdirs==1.4.4
|
10 |
+
argon2-cffi-bindings==21.2.0
|
11 |
+
argon2-cffi==23.1.0
|
12 |
+
astroid==3.2.4
|
13 |
+
asttokens==2.4.1
|
14 |
+
astunparse==1.6.3
|
15 |
+
async-timeout==4.0.3
|
16 |
+
attrs==23.2.0
|
17 |
+
audioread==3.0.1
|
18 |
+
beautifulsoup4==4.12.3
|
19 |
+
bert-score==0.3.13
|
20 |
+
bleach==6.1.0
|
21 |
+
blis==0.7.11
|
22 |
+
build==1.2.1
|
23 |
+
cachecontrol==0.14.0
|
24 |
+
cachetools==5.3.2
|
25 |
+
catalogue==2.0.10
|
26 |
+
certifi==2024.2.2
|
27 |
+
cffi==1.16.0
|
28 |
+
chardet==5.2.0
|
29 |
+
charset-normalizer==3.3.2
|
30 |
+
cleo==2.1.0
|
31 |
+
click==8.1.7
|
32 |
+
cloudpathlib==0.16.0
|
33 |
+
cloudpickle==3.0.0
|
34 |
+
cmake==3.28.1
|
35 |
+
colorama==0.4.6
|
36 |
+
comm==0.2.1
|
37 |
+
confection==0.1.4
|
38 |
+
contourpy==1.2.0
|
39 |
+
cramjam==2.8.3
|
40 |
+
crashtest==0.4.1
|
41 |
+
cryptography==43.0.0
|
42 |
+
cubinlinker==0.3.0+2.g405ac64
|
43 |
+
cuda-python==12.3.0rc4+9.gdb8c48a.dirty
|
44 |
+
cudf==23.12.0
|
45 |
+
cugraph-dgl==23.12.0
|
46 |
+
cugraph-service-client==23.12.0
|
47 |
+
cugraph-service-server==23.12.0
|
48 |
+
cugraph==23.12.0
|
49 |
+
cuml==23.12.0
|
50 |
+
cupy-cuda12x==12.3.0
|
51 |
+
cycler==0.12.1
|
52 |
+
cymem==2.0.8
|
53 |
+
cython==3.0.8
|
54 |
+
dask-cuda==23.12.0
|
55 |
+
dask-cudf==23.12.0
|
56 |
+
dask==2023.11.0
|
57 |
+
dataclasses-json==0.6.7
|
58 |
+
dataproperty==1.0.1
|
59 |
+
datasets==2.20.0
|
60 |
+
debugpy==1.8.1
|
61 |
+
decorator==5.1.1
|
62 |
+
defusedxml==0.7.1
|
63 |
+
dill==0.3.8
|
64 |
+
distlib==0.3.8
|
65 |
+
distributed==2023.11.0
|
66 |
+
distro==1.9.0
|
67 |
+
dm-tree==0.1.8
|
68 |
+
docker-pycreds==0.4.0
|
69 |
+
dulwich==0.21.7
|
70 |
+
einops==0.7.0
|
71 |
+
emoji==2.12.1
|
72 |
+
entmax==1.3
|
73 |
+
evaluate==0.4.2
|
74 |
+
exceptiongroup==1.2.0
|
75 |
+
execnet==2.0.2
|
76 |
+
executing==2.0.1
|
77 |
+
expecttest==0.1.3
|
78 |
+
fastjsonschema==2.19.1
|
79 |
+
fastparquet==2023.10.1
|
80 |
+
fastrlock==0.8.2
|
81 |
+
filelock==3.13.1
|
82 |
+
flash-attn==2.4.2
|
83 |
+
fonttools==4.48.1
|
84 |
+
frozenlist==1.4.1
|
85 |
+
fsspec==2023.12.2
|
86 |
+
fugashi==1.3.2
|
87 |
+
fuzzywuzzy==0.18.0
|
88 |
+
gast==0.5.4
|
89 |
+
gitdb==4.0.11
|
90 |
+
gitpython==3.1.43
|
91 |
+
google-auth-oauthlib==0.4.6
|
92 |
+
google-auth==2.27.0
|
93 |
+
graphsurgeon==0.4.6
|
94 |
+
greenlet==3.0.3
|
95 |
+
grpcio==1.60.1
|
96 |
+
h11==0.14.0
|
97 |
+
httpcore==1.0.5
|
98 |
+
httpx==0.27.0
|
99 |
+
huggingface-hub==0.24.5
|
100 |
+
hydra-core==1.3.2
|
101 |
+
hypothesis==5.35.1
|
102 |
+
idna==3.6
|
103 |
+
importlib-metadata==7.0.1
|
104 |
+
iniconfig==2.0.0
|
105 |
+
installer==0.7.0
|
106 |
+
intel-openmp==2021.4.0
|
107 |
+
ipadic==1.0.0
|
108 |
+
ipykernel==6.29.2
|
109 |
+
ipython-genutils==0.2.0
|
110 |
+
ipython==8.21.0
|
111 |
+
isort==5.13.2
|
112 |
+
jaraco.classes==3.4.0
|
113 |
+
jedi==0.19.1
|
114 |
+
jeepney==0.8.0
|
115 |
+
jinja2==3.1.3
|
116 |
+
jiter==0.5.0
|
117 |
+
joblib==1.3.2
|
118 |
+
json5==0.9.14
|
119 |
+
jsonargparse==3.13.1
|
120 |
+
jsonlines==4.0.0
|
121 |
+
jsonnet==0.19.1
|
122 |
+
jsonpatch==1.33
|
123 |
+
jsonpointer==3.0.0
|
124 |
+
jsonschema-specifications==2023.12.1
|
125 |
+
jsonschema==4.21.1
|
126 |
+
jupyter-client==8.6.0
|
127 |
+
jupyter-core==5.7.1
|
128 |
+
jupyter-tensorboard==0.2.0
|
129 |
+
jupyterlab-pygments==0.3.0
|
130 |
+
jupyterlab-server==1.2.0
|
131 |
+
jupyterlab==2.3.2
|
132 |
+
jupytext==1.16.1
|
133 |
+
keyring==24.3.1
|
134 |
+
kiwisolver==1.4.5
|
135 |
+
langchain-community==0.2.12
|
136 |
+
langchain-core==0.2.31
|
137 |
+
langchain-huggingface==0.0.2
|
138 |
+
langchain-openai==0.1.21
|
139 |
+
langchain-text-splitters==0.2.2
|
140 |
+
langchain==0.2.13
|
141 |
+
langcodes==3.3.0
|
142 |
+
langsmith==0.1.99
|
143 |
+
lazy-loader==0.3
|
144 |
+
levenshtein==0.25.1
|
145 |
+
librosa==0.10.1
|
146 |
+
lightning-utilities==0.11.6
|
147 |
+
llm-jp-eval==1.4.0
|
148 |
+
llvmlite==0.40.1
|
149 |
+
lm-eval==0.3.0
|
150 |
+
locket==1.0.0
|
151 |
+
logzero==1.7.0
|
152 |
+
lxml==5.2.2
|
153 |
+
markdown-it-py==3.0.0
|
154 |
+
markdown==3.5.2
|
155 |
+
markupsafe==2.1.4
|
156 |
+
marshmallow==3.21.3
|
157 |
+
matplotlib-inline==0.1.6
|
158 |
+
matplotlib==3.8.2
|
159 |
+
mbstrdecoder==1.1.3
|
160 |
+
mccabe==0.7.0
|
161 |
+
mdit-py-plugins==0.4.0
|
162 |
+
mdurl==0.1.2
|
163 |
+
mecab-python3==1.0.6
|
164 |
+
mistune==3.0.2
|
165 |
+
mkl-devel==2021.1.1
|
166 |
+
mkl-include==2021.1.1
|
167 |
+
mkl==2021.1.1
|
168 |
+
mock==5.1.0
|
169 |
+
mojimoji==0.0.13
|
170 |
+
more-itertools==9.1.0
|
171 |
+
mpmath==1.3.0
|
172 |
+
msgpack==1.0.7
|
173 |
+
multidict==6.0.4
|
174 |
+
multiprocess==0.70.16
|
175 |
+
murmurhash==1.0.10
|
176 |
+
mypy-extensions==1.0.0
|
177 |
+
nbclient==0.9.0
|
178 |
+
nbconvert==7.16.0
|
179 |
+
nbformat==5.9.2
|
180 |
+
neologdn==0.5.3
|
181 |
+
nest-asyncio==1.6.0
|
182 |
+
networkx==2.6.3
|
183 |
+
ninja==1.11.1.1
|
184 |
+
nltk==3.8.1
|
185 |
+
notebook==6.4.10
|
186 |
+
numba==0.57.1+1.g1ff679645
|
187 |
+
numexpr==2.10.1
|
188 |
+
numpy==1.24.4
|
189 |
+
nvfuser==0.1.4a0+d0bb811
|
190 |
+
nvidia-dali-cuda120==1.34.0
|
191 |
+
nvidia-pyindex==1.0.9
|
192 |
+
nvtx==0.2.5
|
193 |
+
oauthlib==3.2.2
|
194 |
+
omegaconf==2.3.0
|
195 |
+
onnx==1.15.0rc2
|
196 |
+
openai==1.40.6
|
197 |
+
opencv==4.7.0
|
198 |
+
optree==0.10.0
|
199 |
+
orjson==3.10.7
|
200 |
+
packaging==23.2
|
201 |
+
pandas==2.2.2
|
202 |
+
pandocfilters==1.5.1
|
203 |
+
parso==0.8.3
|
204 |
+
partd==1.4.1
|
205 |
+
pathvalidate==3.2.0
|
206 |
+
peft==0.5.0
|
207 |
+
pexpect==4.9.0
|
208 |
+
pillow==10.2.0
|
209 |
+
pip==24.0
|
210 |
+
pkginfo==1.11.1
|
211 |
+
plac==1.4.3
|
212 |
+
platformdirs==4.2.0
|
213 |
+
pluggy==1.4.0
|
214 |
+
ply==3.11
|
215 |
+
poetry-core==1.9.0
|
216 |
+
poetry-plugin-export==1.8.0
|
217 |
+
poetry==1.8.3
|
218 |
+
polygraphy==0.49.4
|
219 |
+
pooch==1.8.0
|
220 |
+
portalocker==2.10.1
|
221 |
+
preshed==3.0.9
|
222 |
+
prettytable==3.9.0
|
223 |
+
prometheus-client==0.19.0
|
224 |
+
prompt-toolkit==3.0.43
|
225 |
+
protobuf==4.24.4
|
226 |
+
psutil==5.9.4
|
227 |
+
ptxcompiler==0.8.1+2.g0d406d6
|
228 |
+
ptyprocess==0.7.0
|
229 |
+
pure-eval==0.2.2
|
230 |
+
pyarrow-hotfix==0.6
|
231 |
+
pyarrow==15.0.2
|
232 |
+
pyasn1-modules==0.3.0
|
233 |
+
pyasn1==0.5.1
|
234 |
+
pybind11-global==2.11.1
|
235 |
+
pybind11==2.11.1
|
236 |
+
pycocotools==2.0+nv0.8.0
|
237 |
+
pycountry==24.6.1
|
238 |
+
pycparser==2.21
|
239 |
+
pydantic-core==2.16.2
|
240 |
+
pydantic==2.6.1
|
241 |
+
pygments==2.17.2
|
242 |
+
pylibcugraph==23.12.0
|
243 |
+
pylibcugraphops==23.12.0
|
244 |
+
pylibraft==23.12.0
|
245 |
+
pylint==3.2.6
|
246 |
+
pynvml==11.4.1
|
247 |
+
pyparsing==3.1.1
|
248 |
+
pyproject-hooks==1.1.0
|
249 |
+
pytablewriter==1.2.0
|
250 |
+
pytest-flakefinder==1.1.0
|
251 |
+
pytest-rerunfailures==13.0
|
252 |
+
pytest-shard==0.1.2
|
253 |
+
pytest-xdist==3.5.0
|
254 |
+
pytest==8.0.0
|
255 |
+
python-dateutil==2.8.2
|
256 |
+
python-dotenv==1.0.0
|
257 |
+
python-hostlist==1.23.0
|
258 |
+
python-levenshtein==0.25.1
|
259 |
+
pytorch-lightning==2.4.0
|
260 |
+
pytorch-quantization==2.1.2
|
261 |
+
pytz==2023.3.post1
|
262 |
+
pyyaml==6.0.1
|
263 |
+
pyzmq==25.1.2
|
264 |
+
raft-dask==23.12.0
|
265 |
+
rapidfuzz==3.9.6
|
266 |
+
rapids-dask-dependency==23.12.1
|
267 |
+
referencing==0.33.0
|
268 |
+
regex==2023.12.25
|
269 |
+
requests-oauthlib==1.3.1
|
270 |
+
requests-toolbelt==1.0.0
|
271 |
+
requests==2.32.3
|
272 |
+
rhoknp==1.7.0
|
273 |
+
rich==13.7.0
|
274 |
+
rmm==23.12.0
|
275 |
+
rouge-score==0.1.2
|
276 |
+
rpds-py==0.17.1
|
277 |
+
rsa==4.9
|
278 |
+
sacrebleu==2.4.2
|
279 |
+
safetensors==0.4.3
|
280 |
+
scikit-learn==1.5.1
|
281 |
+
scipy==1.12.0
|
282 |
+
secretstorage==3.3.3
|
283 |
+
send2trash==1.8.2
|
284 |
+
sentence-transformers==3.0.1
|
285 |
+
sentencepiece==0.1.99
|
286 |
+
sentry-sdk==2.12.0
|
287 |
+
setproctitle==1.3.3
|
288 |
+
setuptools==68.2.2
|
289 |
+
shellingham==1.5.4
|
290 |
+
six==1.16.0
|
291 |
+
smart-open==6.4.0
|
292 |
+
smmap==5.0.1
|
293 |
+
sniffio==1.3.1
|
294 |
+
sortedcontainers==2.4.0
|
295 |
+
soundfile==0.12.1
|
296 |
+
soupsieve==2.5
|
297 |
+
soxr==0.3.7
|
298 |
+
spacy-legacy==3.0.12
|
299 |
+
spacy-loggers==1.0.5
|
300 |
+
spacy==3.7.2
|
301 |
+
sphinx-glpi-theme==0.6
|
302 |
+
sqlalchemy==2.0.32
|
303 |
+
sqlitedict==2.1.0
|
304 |
+
srsly==2.4.8
|
305 |
+
stack-data==0.6.3
|
306 |
+
sumeval==0.2.2
|
307 |
+
sympy==1.12
|
308 |
+
tabledata==1.3.3
|
309 |
+
tabulate==0.9.0
|
310 |
+
tbb==2021.11.0
|
311 |
+
tblib==3.0.0
|
312 |
+
tcolorpy==0.1.6
|
313 |
+
tenacity==8.5.0
|
314 |
+
tensorboard-data-server==0.6.1
|
315 |
+
tensorboard-plugin-wit==1.8.1
|
316 |
+
tensorboard==2.9.0
|
317 |
+
tensorrt==8.6.3
|
318 |
+
terminado==0.18.0
|
319 |
+
termplotlib==0.3.9
|
320 |
+
text-generation==0.7.0
|
321 |
+
thinc==8.2.3
|
322 |
+
threadpoolctl==3.2.0
|
323 |
+
thriftpy2==0.4.17
|
324 |
+
tiktoken==0.7.0
|
325 |
+
tinycss2==1.2.1
|
326 |
+
tokenizers==0.19.1
|
327 |
+
toml==0.10.2
|
328 |
+
tomli==2.0.1
|
329 |
+
tomlkit==0.13.2
|
330 |
+
toolz==0.12.1
|
331 |
+
torch-tensorrt==2.3.0a0
|
332 |
+
torch==2.3.0a0+ebedce2
|
333 |
+
torchdata==0.7.1a0
|
334 |
+
torchmetrics==0.10.3
|
335 |
+
torchtext==0.17.0a0
|
336 |
+
torchvision==0.18.0a0
|
337 |
+
tornado==6.4
|
338 |
+
tqdm-multiprocess==0.0.11
|
339 |
+
tqdm==4.66.5
|
340 |
+
traitlets==5.9.0
|
341 |
+
transformer-engine==1.3.0+5b90b7f
|
342 |
+
transformers==4.43.3
|
343 |
+
treelite-runtime==3.9.1
|
344 |
+
treelite==3.9.1
|
345 |
+
triton==2.2.0+e28a256
|
346 |
+
trove-classifiers==2024.7.2
|
347 |
+
typepy==1.3.2
|
348 |
+
typer==0.9.0
|
349 |
+
types-dataclasses==0.6.6
|
350 |
+
typing-extensions==4.12.2
|
351 |
+
typing-inspect==0.9.0
|
352 |
+
tzdata==2024.1
|
353 |
+
ucx-py==0.35.0
|
354 |
+
uff==0.6.9
|
355 |
+
ujson==5.8.0
|
356 |
+
unbabel-comet==2.2.2
|
357 |
+
unidic-lite==1.0.8
|
358 |
+
urllib3==1.26.18
|
359 |
+
virtualenv==20.26.3
|
360 |
+
wandb==0.16.3
|
361 |
+
wasabi==1.1.2
|
362 |
+
wcwidth==0.2.13
|
363 |
+
weasel==0.3.4
|
364 |
+
webencodings==0.5.1
|
365 |
+
werkzeug==3.0.1
|
366 |
+
wheel==0.42.0
|
367 |
+
word2number==1.1
|
368 |
+
xdoctest==1.0.2
|
369 |
+
xgboost==1.7.6
|
370 |
+
xmltodict==0.13.0
|
371 |
+
xxhash==3.4.1
|
372 |
+
yarl==1.9.4
|
373 |
+
zict==3.0.0
|
374 |
+
zipp==3.17.0
|
375 |
+
zstandard==0.23.0
|
wandb/run-20240823_162543-eroprw00/files/wandb-metadata.json
ADDED
@@ -0,0 +1,220 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
|
3 |
+
"python": "3.10.12",
|
4 |
+
"heartbeatAt": "2024-08-23T07:25:43.758914",
|
5 |
+
"startedAt": "2024-08-23T07:25:43.187250",
|
6 |
+
"docker": null,
|
7 |
+
"cuda": null,
|
8 |
+
"args": [
|
9 |
+
"--seq-length",
|
10 |
+
"4096",
|
11 |
+
"--sliding-window-size",
|
12 |
+
"131072",
|
13 |
+
"--micro-batch-size",
|
14 |
+
"5",
|
15 |
+
"--valid_micro_batch_size",
|
16 |
+
"1",
|
17 |
+
"--global-batch-size",
|
18 |
+
"640",
|
19 |
+
"--train-iters",
|
20 |
+
"7500",
|
21 |
+
"--tokenizer-type",
|
22 |
+
"HFPreTrainedTokenizer",
|
23 |
+
"--tokenizer-model",
|
24 |
+
"/share/pretrained_lm/Qwen/Qwen2-0.5B",
|
25 |
+
"--train-data-path",
|
26 |
+
"1754785366",
|
27 |
+
"/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document",
|
28 |
+
"28623823675",
|
29 |
+
"/project/datas/llm-jp-corpus-v2/ja-cc/level0/data_text_document",
|
30 |
+
"--valid-data-path",
|
31 |
+
"1754785366",
|
32 |
+
"/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document",
|
33 |
+
"--test-data-path",
|
34 |
+
"1754785366",
|
35 |
+
"/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document",
|
36 |
+
"--lr",
|
37 |
+
"2e-5",
|
38 |
+
"--min-lr",
|
39 |
+
"1e-6",
|
40 |
+
"--lr-decay-style",
|
41 |
+
"cosine",
|
42 |
+
"--lr-warmup-iters",
|
43 |
+
"500",
|
44 |
+
"--lr-decay-iters",
|
45 |
+
"7500",
|
46 |
+
"--weight-decay",
|
47 |
+
"0.1",
|
48 |
+
"--grad-clip-norm",
|
49 |
+
"1.0",
|
50 |
+
"--optimizer",
|
51 |
+
"anyprecision",
|
52 |
+
"--adam-beta1",
|
53 |
+
"0.9",
|
54 |
+
"--adam-beta2",
|
55 |
+
"0.95",
|
56 |
+
"--adam-eps",
|
57 |
+
"1e-6",
|
58 |
+
"--save-interval",
|
59 |
+
"10",
|
60 |
+
"--eval-interval",
|
61 |
+
"10",
|
62 |
+
"--eval-iters",
|
63 |
+
"10",
|
64 |
+
"--bf16",
|
65 |
+
"--mixed-precision",
|
66 |
+
"--base-model",
|
67 |
+
"/share/pretrained_lm/Qwen/Qwen2-0.5B",
|
68 |
+
"--save",
|
69 |
+
"/work/llm_recipes/models/Qwen2-0.5b-0.2",
|
70 |
+
"--load",
|
71 |
+
"/work/llm_recipes/models/Qwen2-0.5b-0.2",
|
72 |
+
"--fsdp-activation-checkpointing",
|
73 |
+
"--sharding-strategy",
|
74 |
+
"FULL_SHARD",
|
75 |
+
"--checkpoint-type",
|
76 |
+
"LOCAL_STATE_DICT",
|
77 |
+
"--save-n-checkpoints",
|
78 |
+
"10",
|
79 |
+
"--upload-all-checkpoints-to-hf",
|
80 |
+
"--hf-upload-retry-limit",
|
81 |
+
"2",
|
82 |
+
"--hf-repo-id",
|
83 |
+
"koichi12/Qwen2-0.5b-0.2",
|
84 |
+
"--wandb-entity",
|
85 |
+
"iwakawa-koichi-q5-tohoku-nlp6723",
|
86 |
+
"--wandb-project",
|
87 |
+
"llm_tutorial-0.2",
|
88 |
+
"--wandb-name",
|
89 |
+
"Qwen2-0.5b-0.2_train_2024-08-23-16:25:30"
|
90 |
+
],
|
91 |
+
"state": "running",
|
92 |
+
"program": "/project/examples/finetuning.py",
|
93 |
+
"codePathLocal": "examples/finetuning.py",
|
94 |
+
"codePath": "examples/finetuning.py",
|
95 |
+
"git": {
|
96 |
+
"remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
|
97 |
+
"commit": "887a2cc5d104c10264701f95cbbb0a6a116768d6"
|
98 |
+
},
|
99 |
+
"email": null,
|
100 |
+
"root": "/project",
|
101 |
+
"host": "gpu-koiwa-00",
|
102 |
+
"username": "koiwa",
|
103 |
+
"executable": "/usr/bin/python",
|
104 |
+
"cpu_count": 18,
|
105 |
+
"cpu_count_logical": 18,
|
106 |
+
"cpu_freq": {
|
107 |
+
"current": 2400.0389999999993,
|
108 |
+
"min": 0.0,
|
109 |
+
"max": 0.0
|
110 |
+
},
|
111 |
+
"cpu_freq_per_core": [
|
112 |
+
{
|
113 |
+
"current": 2400.039,
|
114 |
+
"min": 0.0,
|
115 |
+
"max": 0.0
|
116 |
+
},
|
117 |
+
{
|
118 |
+
"current": 2400.039,
|
119 |
+
"min": 0.0,
|
120 |
+
"max": 0.0
|
121 |
+
},
|
122 |
+
{
|
123 |
+
"current": 2400.039,
|
124 |
+
"min": 0.0,
|
125 |
+
"max": 0.0
|
126 |
+
},
|
127 |
+
{
|
128 |
+
"current": 2400.039,
|
129 |
+
"min": 0.0,
|
130 |
+
"max": 0.0
|
131 |
+
},
|
132 |
+
{
|
133 |
+
"current": 2400.039,
|
134 |
+
"min": 0.0,
|
135 |
+
"max": 0.0
|
136 |
+
},
|
137 |
+
{
|
138 |
+
"current": 2400.039,
|
139 |
+
"min": 0.0,
|
140 |
+
"max": 0.0
|
141 |
+
},
|
142 |
+
{
|
143 |
+
"current": 2400.039,
|
144 |
+
"min": 0.0,
|
145 |
+
"max": 0.0
|
146 |
+
},
|
147 |
+
{
|
148 |
+
"current": 2400.039,
|
149 |
+
"min": 0.0,
|
150 |
+
"max": 0.0
|
151 |
+
},
|
152 |
+
{
|
153 |
+
"current": 2400.039,
|
154 |
+
"min": 0.0,
|
155 |
+
"max": 0.0
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"current": 2400.039,
|
159 |
+
"min": 0.0,
|
160 |
+
"max": 0.0
|
161 |
+
},
|
162 |
+
{
|
163 |
+
"current": 2400.039,
|
164 |
+
"min": 0.0,
|
165 |
+
"max": 0.0
|
166 |
+
},
|
167 |
+
{
|
168 |
+
"current": 2400.039,
|
169 |
+
"min": 0.0,
|
170 |
+
"max": 0.0
|
171 |
+
},
|
172 |
+
{
|
173 |
+
"current": 2400.039,
|
174 |
+
"min": 0.0,
|
175 |
+
"max": 0.0
|
176 |
+
},
|
177 |
+
{
|
178 |
+
"current": 2400.039,
|
179 |
+
"min": 0.0,
|
180 |
+
"max": 0.0
|
181 |
+
},
|
182 |
+
{
|
183 |
+
"current": 2400.039,
|
184 |
+
"min": 0.0,
|
185 |
+
"max": 0.0
|
186 |
+
},
|
187 |
+
{
|
188 |
+
"current": 2400.039,
|
189 |
+
"min": 0.0,
|
190 |
+
"max": 0.0
|
191 |
+
},
|
192 |
+
{
|
193 |
+
"current": 2400.039,
|
194 |
+
"min": 0.0,
|
195 |
+
"max": 0.0
|
196 |
+
},
|
197 |
+
{
|
198 |
+
"current": 2400.039,
|
199 |
+
"min": 0.0,
|
200 |
+
"max": 0.0
|
201 |
+
}
|
202 |
+
],
|
203 |
+
"disk": {
|
204 |
+
"/": {
|
205 |
+
"total": 0.0625,
|
206 |
+
"used": 1.1444091796875e-05
|
207 |
+
}
|
208 |
+
},
|
209 |
+
"gpu": "NVIDIA A100-SXM4-40GB",
|
210 |
+
"gpu_count": 1,
|
211 |
+
"gpu_devices": [
|
212 |
+
{
|
213 |
+
"name": "NVIDIA A100-SXM4-40GB",
|
214 |
+
"memory_total": 42949672960
|
215 |
+
}
|
216 |
+
],
|
217 |
+
"memory": {
|
218 |
+
"total": 56.487831115722656
|
219 |
+
}
|
220 |
+
}
|
wandb/run-20240823_162543-eroprw00/files/wandb-summary.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"_wandb": {"runtime": 8}}
|
wandb/run-20240823_162543-eroprw00/logs/debug-internal.log
ADDED
@@ -0,0 +1,188 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
2024-08-23 16:25:43,204 INFO StreamThr :11284 [internal.py:wandb_internal():86] W&B internal server running at pid: 11284, started at: 2024-08-23 16:25:43.204013
|
2 |
+
2024-08-23 16:25:43,206 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: status
|
3 |
+
2024-08-23 16:25:43,207 INFO WriterThread:11284 [datastore.py:open_for_write():87] open: /project/wandb/run-20240823_162543-eroprw00/run-eroprw00.wandb
|
4 |
+
2024-08-23 16:25:43,208 DEBUG SenderThread:11284 [sender.py:send():382] send: header
|
5 |
+
2024-08-23 16:25:43,222 DEBUG SenderThread:11284 [sender.py:send():382] send: run
|
6 |
+
2024-08-23 16:25:43,662 INFO SenderThread:11284 [dir_watcher.py:__init__():211] watching files in: /project/wandb/run-20240823_162543-eroprw00/files
|
7 |
+
2024-08-23 16:25:43,662 INFO SenderThread:11284 [sender.py:_start_run_threads():1136] run started: eroprw00 with start time 1724397943.202675
|
8 |
+
2024-08-23 16:25:43,667 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: check_version
|
9 |
+
2024-08-23 16:25:43,668 DEBUG SenderThread:11284 [sender.py:send_request():409] send_request: check_version
|
10 |
+
2024-08-23 16:25:43,739 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: run_start
|
11 |
+
2024-08-23 16:25:43,746 DEBUG HandlerThread:11284 [system_info.py:__init__():27] System info init
|
12 |
+
2024-08-23 16:25:43,746 DEBUG HandlerThread:11284 [system_info.py:__init__():42] System info init done
|
13 |
+
2024-08-23 16:25:43,746 INFO HandlerThread:11284 [system_monitor.py:start():194] Starting system monitor
|
14 |
+
2024-08-23 16:25:43,746 INFO SystemMonitor:11284 [system_monitor.py:_start():158] Starting system asset monitoring threads
|
15 |
+
2024-08-23 16:25:43,746 INFO HandlerThread:11284 [system_monitor.py:probe():214] Collecting system info
|
16 |
+
2024-08-23 16:25:43,746 INFO SystemMonitor:11284 [interfaces.py:start():190] Started cpu monitoring
|
17 |
+
2024-08-23 16:25:43,747 INFO SystemMonitor:11284 [interfaces.py:start():190] Started disk monitoring
|
18 |
+
2024-08-23 16:25:43,747 INFO SystemMonitor:11284 [interfaces.py:start():190] Started gpu monitoring
|
19 |
+
2024-08-23 16:25:43,748 INFO SystemMonitor:11284 [interfaces.py:start():190] Started memory monitoring
|
20 |
+
2024-08-23 16:25:43,749 INFO SystemMonitor:11284 [interfaces.py:start():190] Started network monitoring
|
21 |
+
2024-08-23 16:25:43,758 DEBUG HandlerThread:11284 [system_info.py:probe():151] Probing system
|
22 |
+
2024-08-23 16:25:43,760 DEBUG HandlerThread:11284 [system_info.py:_probe_git():136] Probing git
|
23 |
+
2024-08-23 16:25:43,773 DEBUG HandlerThread:11284 [system_info.py:_probe_git():144] Probing git done
|
24 |
+
2024-08-23 16:25:43,773 DEBUG HandlerThread:11284 [system_info.py:probe():199] Probing system done
|
25 |
+
2024-08-23 16:25:43,773 DEBUG HandlerThread:11284 [system_monitor.py:probe():223] {'os': 'Linux-5.15.0-91-generic-x86_64-with-glibc2.35', 'python': '3.10.12', 'heartbeatAt': '2024-08-23T07:25:43.758914', 'startedAt': '2024-08-23T07:25:43.187250', 'docker': None, 'cuda': None, 'args': ('--seq-length', '4096', '--sliding-window-size', '131072', '--micro-batch-size', '5', '--valid_micro_batch_size', '1', '--global-batch-size', '640', '--train-iters', '7500', '--tokenizer-type', 'HFPreTrainedTokenizer', '--tokenizer-model', '/share/pretrained_lm/Qwen/Qwen2-0.5B', '--train-data-path', '1754785366', '/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document', '28623823675', '/project/datas/llm-jp-corpus-v2/ja-cc/level0/data_text_document', '--valid-data-path', '1754785366', '/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document', '--test-data-path', '1754785366', '/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document', '--lr', '2e-5', '--min-lr', '1e-6', '--lr-decay-style', 'cosine', '--lr-warmup-iters', '500', '--lr-decay-iters', '7500', '--weight-decay', '0.1', '--grad-clip-norm', '1.0', '--optimizer', 'anyprecision', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-6', '--save-interval', '10', '--eval-interval', '10', '--eval-iters', '10', '--bf16', '--mixed-precision', '--base-model', '/share/pretrained_lm/Qwen/Qwen2-0.5B', '--save', '/work/llm_recipes/models/Qwen2-0.5b-0.2', '--load', '/work/llm_recipes/models/Qwen2-0.5b-0.2', '--fsdp-activation-checkpointing', '--sharding-strategy', 'FULL_SHARD', '--checkpoint-type', 'LOCAL_STATE_DICT', '--save-n-checkpoints', '10', '--upload-all-checkpoints-to-hf', '--hf-upload-retry-limit', '2', '--hf-repo-id', 'koichi12/Qwen2-0.5b-0.2', '--wandb-entity', 'iwakawa-koichi-q5-tohoku-nlp6723', '--wandb-project', 'llm_tutorial-0.2', '--wandb-name', 'Qwen2-0.5b-0.2_train_2024-08-23-16:25:30'), 'state': 'running', 'program': '/project/examples/finetuning.py', 'codePathLocal': 'examples/finetuning.py', 'codePath': 'examples/finetuning.py', 'git': {'remote': 'https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git', 'commit': '887a2cc5d104c10264701f95cbbb0a6a116768d6'}, 'email': None, 'root': '/project', 'host': 'gpu-koiwa-00', 'username': 'koiwa', 'executable': '/usr/bin/python', 'cpu_count': 18, 'cpu_count_logical': 18, 'cpu_freq': {'current': 2400.0389999999993, 'min': 0.0, 'max': 0.0}, 'cpu_freq_per_core': [{'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}], 'disk': {'/': {'total': 0.0625, 'used': 1.1444091796875e-05}}, 'gpu': 'NVIDIA A100-SXM4-40GB', 'gpu_count': 1, 'gpu_devices': [{'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}], 'memory': {'total': 56.487831115722656}}
|
26 |
+
2024-08-23 16:25:43,773 INFO HandlerThread:11284 [system_monitor.py:probe():224] Finished collecting system info
|
27 |
+
2024-08-23 16:25:43,773 INFO HandlerThread:11284 [system_monitor.py:probe():227] Publishing system info
|
28 |
+
2024-08-23 16:25:43,774 INFO HandlerThread:11284 [system_monitor.py:probe():229] Finished publishing system info
|
29 |
+
2024-08-23 16:25:43,780 DEBUG SenderThread:11284 [sender.py:send():382] send: files
|
30 |
+
2024-08-23 16:25:43,780 INFO SenderThread:11284 [sender.py:_save_file():1403] saving file wandb-metadata.json with policy now
|
31 |
+
2024-08-23 16:25:43,791 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: python_packages
|
32 |
+
2024-08-23 16:25:43,791 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: internal_messages
|
33 |
+
2024-08-23 16:25:43,792 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: stop_status
|
34 |
+
2024-08-23 16:25:43,792 DEBUG SenderThread:11284 [sender.py:send_request():409] send_request: python_packages
|
35 |
+
2024-08-23 16:25:43,794 DEBUG SenderThread:11284 [sender.py:send_request():409] send_request: stop_status
|
36 |
+
2024-08-23 16:25:44,074 DEBUG SenderThread:11284 [sender.py:send():382] send: telemetry
|
37 |
+
2024-08-23 16:25:44,478 INFO wandb-upload_0:11284 [upload_job.py:push():131] Uploaded file /tmp/tmpn8dztdufwandb/9bfyl56b-wandb-metadata.json
|
38 |
+
2024-08-23 16:25:44,664 INFO Thread-12 :11284 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240823_162543-eroprw00/files/wandb-metadata.json
|
39 |
+
2024-08-23 16:25:44,664 INFO Thread-12 :11284 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240823_162543-eroprw00/files/requirements.txt
|
40 |
+
2024-08-23 16:25:44,664 INFO Thread-12 :11284 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240823_162543-eroprw00/files/output.log
|
41 |
+
2024-08-23 16:25:46,664 INFO Thread-12 :11284 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_162543-eroprw00/files/output.log
|
42 |
+
2024-08-23 16:25:48,665 INFO Thread-12 :11284 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_162543-eroprw00/files/output.log
|
43 |
+
2024-08-23 16:25:49,201 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: status_report
|
44 |
+
2024-08-23 16:25:50,667 INFO Thread-12 :11284 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_162543-eroprw00/files/output.log
|
45 |
+
2024-08-23 16:25:51,139 DEBUG SenderThread:11284 [sender.py:send():382] send: config
|
46 |
+
2024-08-23 16:25:51,140 DEBUG SenderThread:11284 [sender.py:send():382] send: config
|
47 |
+
2024-08-23 16:25:52,592 DEBUG SenderThread:11284 [sender.py:send():382] send: exit
|
48 |
+
2024-08-23 16:25:52,592 INFO SenderThread:11284 [sender.py:send_exit():589] handling exit code: 1
|
49 |
+
2024-08-23 16:25:52,592 INFO SenderThread:11284 [sender.py:send_exit():591] handling runtime: 8
|
50 |
+
2024-08-23 16:25:52,593 INFO SenderThread:11284 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
51 |
+
2024-08-23 16:25:52,594 INFO SenderThread:11284 [sender.py:send_exit():597] send defer
|
52 |
+
2024-08-23 16:25:52,594 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: defer
|
53 |
+
2024-08-23 16:25:52,594 INFO HandlerThread:11284 [handler.py:handle_request_defer():172] handle defer: 0
|
54 |
+
2024-08-23 16:25:52,594 DEBUG SenderThread:11284 [sender.py:send_request():409] send_request: defer
|
55 |
+
2024-08-23 16:25:52,594 INFO SenderThread:11284 [sender.py:send_request_defer():613] handle sender defer: 0
|
56 |
+
2024-08-23 16:25:52,594 INFO SenderThread:11284 [sender.py:transition_state():617] send defer: 1
|
57 |
+
2024-08-23 16:25:52,594 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: defer
|
58 |
+
2024-08-23 16:25:52,594 INFO HandlerThread:11284 [handler.py:handle_request_defer():172] handle defer: 1
|
59 |
+
2024-08-23 16:25:52,595 DEBUG SenderThread:11284 [sender.py:send_request():409] send_request: defer
|
60 |
+
2024-08-23 16:25:52,595 INFO SenderThread:11284 [sender.py:send_request_defer():613] handle sender defer: 1
|
61 |
+
2024-08-23 16:25:52,595 INFO SenderThread:11284 [sender.py:transition_state():617] send defer: 2
|
62 |
+
2024-08-23 16:25:52,595 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: defer
|
63 |
+
2024-08-23 16:25:52,595 INFO HandlerThread:11284 [handler.py:handle_request_defer():172] handle defer: 2
|
64 |
+
2024-08-23 16:25:52,595 INFO HandlerThread:11284 [system_monitor.py:finish():203] Stopping system monitor
|
65 |
+
2024-08-23 16:25:52,595 DEBUG SystemMonitor:11284 [system_monitor.py:_start():172] Starting system metrics aggregation loop
|
66 |
+
2024-08-23 16:25:52,595 INFO HandlerThread:11284 [interfaces.py:finish():202] Joined cpu monitor
|
67 |
+
2024-08-23 16:25:52,595 DEBUG SystemMonitor:11284 [system_monitor.py:_start():179] Finished system metrics aggregation loop
|
68 |
+
2024-08-23 16:25:52,595 INFO HandlerThread:11284 [interfaces.py:finish():202] Joined disk monitor
|
69 |
+
2024-08-23 16:25:52,596 DEBUG SystemMonitor:11284 [system_monitor.py:_start():183] Publishing last batch of metrics
|
70 |
+
2024-08-23 16:25:52,629 INFO HandlerThread:11284 [interfaces.py:finish():202] Joined gpu monitor
|
71 |
+
2024-08-23 16:25:52,629 INFO HandlerThread:11284 [interfaces.py:finish():202] Joined memory monitor
|
72 |
+
2024-08-23 16:25:52,629 INFO HandlerThread:11284 [interfaces.py:finish():202] Joined network monitor
|
73 |
+
2024-08-23 16:25:52,629 DEBUG SenderThread:11284 [sender.py:send_request():409] send_request: defer
|
74 |
+
2024-08-23 16:25:52,629 INFO SenderThread:11284 [sender.py:send_request_defer():613] handle sender defer: 2
|
75 |
+
2024-08-23 16:25:52,629 INFO SenderThread:11284 [sender.py:transition_state():617] send defer: 3
|
76 |
+
2024-08-23 16:25:52,629 DEBUG SenderThread:11284 [sender.py:send():382] send: stats
|
77 |
+
2024-08-23 16:25:52,629 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: defer
|
78 |
+
2024-08-23 16:25:52,630 INFO HandlerThread:11284 [handler.py:handle_request_defer():172] handle defer: 3
|
79 |
+
2024-08-23 16:25:52,630 DEBUG SenderThread:11284 [sender.py:send_request():409] send_request: defer
|
80 |
+
2024-08-23 16:25:52,630 INFO SenderThread:11284 [sender.py:send_request_defer():613] handle sender defer: 3
|
81 |
+
2024-08-23 16:25:52,630 INFO SenderThread:11284 [sender.py:transition_state():617] send defer: 4
|
82 |
+
2024-08-23 16:25:52,630 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: defer
|
83 |
+
2024-08-23 16:25:52,630 INFO HandlerThread:11284 [handler.py:handle_request_defer():172] handle defer: 4
|
84 |
+
2024-08-23 16:25:52,630 DEBUG SenderThread:11284 [sender.py:send_request():409] send_request: defer
|
85 |
+
2024-08-23 16:25:52,630 INFO SenderThread:11284 [sender.py:send_request_defer():613] handle sender defer: 4
|
86 |
+
2024-08-23 16:25:52,630 INFO SenderThread:11284 [sender.py:transition_state():617] send defer: 5
|
87 |
+
2024-08-23 16:25:52,630 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: defer
|
88 |
+
2024-08-23 16:25:52,631 INFO HandlerThread:11284 [handler.py:handle_request_defer():172] handle defer: 5
|
89 |
+
2024-08-23 16:25:52,631 DEBUG SenderThread:11284 [sender.py:send():382] send: summary
|
90 |
+
2024-08-23 16:25:52,632 INFO SenderThread:11284 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
91 |
+
2024-08-23 16:25:52,632 DEBUG SenderThread:11284 [sender.py:send_request():409] send_request: defer
|
92 |
+
2024-08-23 16:25:52,632 INFO SenderThread:11284 [sender.py:send_request_defer():613] handle sender defer: 5
|
93 |
+
2024-08-23 16:25:52,632 INFO SenderThread:11284 [sender.py:transition_state():617] send defer: 6
|
94 |
+
2024-08-23 16:25:52,632 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: defer
|
95 |
+
2024-08-23 16:25:52,632 INFO HandlerThread:11284 [handler.py:handle_request_defer():172] handle defer: 6
|
96 |
+
2024-08-23 16:25:52,632 DEBUG SenderThread:11284 [sender.py:send_request():409] send_request: defer
|
97 |
+
2024-08-23 16:25:52,632 INFO SenderThread:11284 [sender.py:send_request_defer():613] handle sender defer: 6
|
98 |
+
2024-08-23 16:25:52,635 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: status_report
|
99 |
+
2024-08-23 16:25:52,668 INFO Thread-12 :11284 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_162543-eroprw00/files/output.log
|
100 |
+
2024-08-23 16:25:52,668 INFO Thread-12 :11284 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240823_162543-eroprw00/files/wandb-summary.json
|
101 |
+
2024-08-23 16:25:52,831 INFO SenderThread:11284 [sender.py:transition_state():617] send defer: 7
|
102 |
+
2024-08-23 16:25:52,831 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: defer
|
103 |
+
2024-08-23 16:25:52,831 INFO HandlerThread:11284 [handler.py:handle_request_defer():172] handle defer: 7
|
104 |
+
2024-08-23 16:25:52,831 DEBUG SenderThread:11284 [sender.py:send_request():409] send_request: defer
|
105 |
+
2024-08-23 16:25:52,831 INFO SenderThread:11284 [sender.py:send_request_defer():613] handle sender defer: 7
|
106 |
+
2024-08-23 16:25:53,592 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: poll_exit
|
107 |
+
2024-08-23 16:25:53,669 INFO Thread-12 :11284 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_162543-eroprw00/files/config.yaml
|
108 |
+
2024-08-23 16:25:54,373 INFO SenderThread:11284 [sender.py:transition_state():617] send defer: 8
|
109 |
+
2024-08-23 16:25:54,374 DEBUG SenderThread:11284 [sender.py:send_request():409] send_request: poll_exit
|
110 |
+
2024-08-23 16:25:54,374 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: defer
|
111 |
+
2024-08-23 16:25:54,374 INFO HandlerThread:11284 [handler.py:handle_request_defer():172] handle defer: 8
|
112 |
+
2024-08-23 16:25:54,374 DEBUG SenderThread:11284 [sender.py:send_request():409] send_request: defer
|
113 |
+
2024-08-23 16:25:54,374 INFO SenderThread:11284 [sender.py:send_request_defer():613] handle sender defer: 8
|
114 |
+
2024-08-23 16:25:54,374 INFO SenderThread:11284 [job_builder.py:build():296] Attempting to build job artifact
|
115 |
+
2024-08-23 16:25:54,375 INFO SenderThread:11284 [job_builder.py:_get_source_type():426] is repo sourced job
|
116 |
+
2024-08-23 16:25:54,389 INFO SenderThread:11284 [job_builder.py:build():402] adding wandb-job metadata file
|
117 |
+
2024-08-23 16:25:54,398 INFO SenderThread:11284 [sender.py:transition_state():617] send defer: 9
|
118 |
+
2024-08-23 16:25:54,398 DEBUG SenderThread:11284 [sender.py:send():382] send: artifact
|
119 |
+
2024-08-23 16:25:54,398 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: defer
|
120 |
+
2024-08-23 16:25:54,399 INFO HandlerThread:11284 [handler.py:handle_request_defer():172] handle defer: 9
|
121 |
+
2024-08-23 16:25:54,593 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: poll_exit
|
122 |
+
2024-08-23 16:25:54,670 INFO Thread-12 :11284 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_162543-eroprw00/files/output.log
|
123 |
+
2024-08-23 16:25:55,372 INFO SenderThread:11284 [sender.py:send_artifact():1494] sent artifact job-https___github.com_cl-tohoku_llm-recipes-failab-m1-yans.git_examples_finetuning.py - {'id': 'QXJ0aWZhY3Q6MTE2MTk5MDU4OQ==', 'state': 'COMMITTED', 'artifactSequence': {'id': 'QXJ0aWZhY3RDb2xsZWN0aW9uOjQxNjQ1ODQ1MA==', 'latestArtifact': {'id': 'QXJ0aWZhY3Q6MTE2MTk5MDU4OQ==', 'versionIndex': 2}}}
|
124 |
+
2024-08-23 16:25:55,372 DEBUG SenderThread:11284 [sender.py:send_request():409] send_request: defer
|
125 |
+
2024-08-23 16:25:55,372 INFO SenderThread:11284 [sender.py:send_request_defer():613] handle sender defer: 9
|
126 |
+
2024-08-23 16:25:55,372 INFO SenderThread:11284 [dir_watcher.py:finish():358] shutting down directory watcher
|
127 |
+
2024-08-23 16:25:55,671 INFO SenderThread:11284 [dir_watcher.py:finish():388] scan: /project/wandb/run-20240823_162543-eroprw00/files
|
128 |
+
2024-08-23 16:25:55,671 INFO SenderThread:11284 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240823_162543-eroprw00/files/requirements.txt requirements.txt
|
129 |
+
2024-08-23 16:25:55,671 INFO SenderThread:11284 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240823_162543-eroprw00/files/config.yaml config.yaml
|
130 |
+
2024-08-23 16:25:55,673 INFO SenderThread:11284 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240823_162543-eroprw00/files/wandb-metadata.json wandb-metadata.json
|
131 |
+
2024-08-23 16:25:55,673 INFO SenderThread:11284 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240823_162543-eroprw00/files/wandb-summary.json wandb-summary.json
|
132 |
+
2024-08-23 16:25:55,674 INFO SenderThread:11284 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240823_162543-eroprw00/files/output.log output.log
|
133 |
+
2024-08-23 16:25:55,676 INFO SenderThread:11284 [sender.py:transition_state():617] send defer: 10
|
134 |
+
2024-08-23 16:25:55,676 DEBUG SenderThread:11284 [sender.py:send_request():409] send_request: poll_exit
|
135 |
+
2024-08-23 16:25:55,676 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: defer
|
136 |
+
2024-08-23 16:25:55,677 INFO HandlerThread:11284 [handler.py:handle_request_defer():172] handle defer: 10
|
137 |
+
2024-08-23 16:25:55,678 DEBUG SenderThread:11284 [sender.py:send_request():409] send_request: defer
|
138 |
+
2024-08-23 16:25:55,678 INFO SenderThread:11284 [sender.py:send_request_defer():613] handle sender defer: 10
|
139 |
+
2024-08-23 16:25:55,678 INFO SenderThread:11284 [file_pusher.py:finish():172] shutting down file pusher
|
140 |
+
2024-08-23 16:25:56,071 INFO wandb-upload_0:11284 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240823_162543-eroprw00/files/requirements.txt
|
141 |
+
2024-08-23 16:25:56,117 INFO wandb-upload_1:11284 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240823_162543-eroprw00/files/config.yaml
|
142 |
+
2024-08-23 16:25:56,151 INFO wandb-upload_3:11284 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240823_162543-eroprw00/files/output.log
|
143 |
+
2024-08-23 16:25:56,152 INFO wandb-upload_2:11284 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240823_162543-eroprw00/files/wandb-summary.json
|
144 |
+
2024-08-23 16:25:56,353 INFO Thread-11 (_thread_body):11284 [sender.py:transition_state():617] send defer: 11
|
145 |
+
2024-08-23 16:25:56,353 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: defer
|
146 |
+
2024-08-23 16:25:56,353 INFO HandlerThread:11284 [handler.py:handle_request_defer():172] handle defer: 11
|
147 |
+
2024-08-23 16:25:56,353 DEBUG SenderThread:11284 [sender.py:send_request():409] send_request: defer
|
148 |
+
2024-08-23 16:25:56,353 INFO SenderThread:11284 [sender.py:send_request_defer():613] handle sender defer: 11
|
149 |
+
2024-08-23 16:25:56,353 INFO SenderThread:11284 [file_pusher.py:join():178] waiting for file pusher
|
150 |
+
2024-08-23 16:25:56,353 INFO SenderThread:11284 [sender.py:transition_state():617] send defer: 12
|
151 |
+
2024-08-23 16:25:56,354 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: defer
|
152 |
+
2024-08-23 16:25:56,354 INFO HandlerThread:11284 [handler.py:handle_request_defer():172] handle defer: 12
|
153 |
+
2024-08-23 16:25:56,354 DEBUG SenderThread:11284 [sender.py:send_request():409] send_request: defer
|
154 |
+
2024-08-23 16:25:56,354 INFO SenderThread:11284 [sender.py:send_request_defer():613] handle sender defer: 12
|
155 |
+
2024-08-23 16:25:56,354 INFO SenderThread:11284 [file_stream.py:finish():595] file stream finish called
|
156 |
+
2024-08-23 16:25:56,522 INFO SenderThread:11284 [file_stream.py:finish():599] file stream finish is done
|
157 |
+
2024-08-23 16:25:56,522 INFO SenderThread:11284 [sender.py:transition_state():617] send defer: 13
|
158 |
+
2024-08-23 16:25:56,523 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: defer
|
159 |
+
2024-08-23 16:25:56,523 INFO HandlerThread:11284 [handler.py:handle_request_defer():172] handle defer: 13
|
160 |
+
2024-08-23 16:25:56,523 DEBUG SenderThread:11284 [sender.py:send_request():409] send_request: defer
|
161 |
+
2024-08-23 16:25:56,523 INFO SenderThread:11284 [sender.py:send_request_defer():613] handle sender defer: 13
|
162 |
+
2024-08-23 16:25:56,523 INFO SenderThread:11284 [sender.py:transition_state():617] send defer: 14
|
163 |
+
2024-08-23 16:25:56,523 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: defer
|
164 |
+
2024-08-23 16:25:56,523 DEBUG SenderThread:11284 [sender.py:send():382] send: final
|
165 |
+
2024-08-23 16:25:56,523 INFO HandlerThread:11284 [handler.py:handle_request_defer():172] handle defer: 14
|
166 |
+
2024-08-23 16:25:56,523 DEBUG SenderThread:11284 [sender.py:send():382] send: footer
|
167 |
+
2024-08-23 16:25:56,524 DEBUG SenderThread:11284 [sender.py:send_request():409] send_request: defer
|
168 |
+
2024-08-23 16:25:56,524 INFO SenderThread:11284 [sender.py:send_request_defer():613] handle sender defer: 14
|
169 |
+
2024-08-23 16:25:56,524 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: poll_exit
|
170 |
+
2024-08-23 16:25:56,524 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: poll_exit
|
171 |
+
2024-08-23 16:25:56,524 DEBUG SenderThread:11284 [sender.py:send_request():409] send_request: poll_exit
|
172 |
+
2024-08-23 16:25:56,525 DEBUG SenderThread:11284 [sender.py:send_request():409] send_request: poll_exit
|
173 |
+
2024-08-23 16:25:56,525 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: server_info
|
174 |
+
2024-08-23 16:25:56,525 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: get_summary
|
175 |
+
2024-08-23 16:25:56,525 DEBUG SenderThread:11284 [sender.py:send_request():409] send_request: server_info
|
176 |
+
2024-08-23 16:25:56,527 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: sampled_history
|
177 |
+
2024-08-23 16:25:56,527 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: internal_messages
|
178 |
+
2024-08-23 16:25:56,527 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: job_info
|
179 |
+
2024-08-23 16:25:56,684 DEBUG SenderThread:11284 [sender.py:send_request():409] send_request: job_info
|
180 |
+
2024-08-23 16:25:56,684 INFO MainThread:11284 [wandb_run.py:_footer_history_summary_info():3866] rendering history
|
181 |
+
2024-08-23 16:25:56,685 INFO MainThread:11284 [wandb_run.py:_footer_history_summary_info():3898] rendering summary
|
182 |
+
2024-08-23 16:25:56,685 INFO MainThread:11284 [wandb_run.py:_footer_sync_info():3825] logging synced files
|
183 |
+
2024-08-23 16:25:56,685 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: shutdown
|
184 |
+
2024-08-23 16:25:56,685 INFO HandlerThread:11284 [handler.py:finish():869] shutting down handler
|
185 |
+
2024-08-23 16:25:57,528 INFO WriterThread:11284 [datastore.py:close():296] close: /project/wandb/run-20240823_162543-eroprw00/run-eroprw00.wandb
|
186 |
+
2024-08-23 16:25:57,685 INFO SenderThread:11284 [sender.py:finish():1572] shutting down sender
|
187 |
+
2024-08-23 16:25:57,685 INFO SenderThread:11284 [file_pusher.py:finish():172] shutting down file pusher
|
188 |
+
2024-08-23 16:25:57,685 INFO SenderThread:11284 [file_pusher.py:join():178] waiting for file pusher
|
wandb/run-20240823_162543-eroprw00/logs/debug.log
ADDED
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
2024-08-23 16:25:43,196 INFO MainThread:11213 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
|
2 |
+
2024-08-23 16:25:43,196 INFO MainThread:11213 [wandb_setup.py:_flush():76] Configure stats pid to 11213
|
3 |
+
2024-08-23 16:25:43,196 INFO MainThread:11213 [wandb_setup.py:_flush():76] Loading settings from /singularity_home/.config/wandb/settings
|
4 |
+
2024-08-23 16:25:43,196 INFO MainThread:11213 [wandb_setup.py:_flush():76] Loading settings from /project/wandb/settings
|
5 |
+
2024-08-23 16:25:43,196 INFO MainThread:11213 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'api_key': '***REDACTED***', 'run_notes': 'Train sample'}
|
6 |
+
2024-08-23 16:25:43,196 INFO MainThread:11213 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
|
7 |
+
2024-08-23 16:25:43,196 INFO MainThread:11213 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'examples/finetuning.py', 'program_abspath': '/project/examples/finetuning.py', 'program': '/project/examples/finetuning.py'}
|
8 |
+
2024-08-23 16:25:43,196 INFO MainThread:11213 [wandb_init.py:_log_setup():526] Logging user logs to /project/wandb/run-20240823_162543-eroprw00/logs/debug.log
|
9 |
+
2024-08-23 16:25:43,196 INFO MainThread:11213 [wandb_init.py:_log_setup():527] Logging internal logs to /project/wandb/run-20240823_162543-eroprw00/logs/debug-internal.log
|
10 |
+
2024-08-23 16:25:43,196 INFO MainThread:11213 [wandb_init.py:init():566] calling init triggers
|
11 |
+
2024-08-23 16:25:43,197 INFO MainThread:11213 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
|
12 |
+
config: {'sharding_strategy': 'FULL_SHARD', 'checkpoint_type': 'LOCAL_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'low_cpu_fsdp': False, 'no_meta_device': False, 'data_path': None, 'split': '969, 30, 1', 'train_data_path': ['1754785366', '/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document', '28623823675', '/project/datas/llm-jp-corpus-v2/ja-cc/level0/data_text_document'], 'valid_data_path': ['1754785366', '/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document'], 'test_data_path': ['1754785366', '/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document'], 'data_cache_path': None, 'vocab_size': None, 'vocab_file': None, 'merge_file': None, 'seq_length': 4096, 'num_workers': 2, 'tokenizer_type': 'HFPreTrainedTokenizer', 'tokenizer_model': '/share/pretrained_lm/Qwen/Qwen2-0.5B', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'retro_return_doc_ids': False, 'short_seq_prob': 0.1, 'vocab_extra_ids': 0, 'seed': 1234, 'use_mpi': False, 'wandb_entity': 'iwakawa-koichi-q5-tohoku-nlp6723', 'wandb_name': 'Qwen2-0.5b-0.2_train_2024-08-23-16:25:30', 'wandb_project': 'llm_tutorial-0.2', 'quantization': False, 'use_freeze_layers': False, 'freeze_layers': None, 'bf16': True, 'fp16': False, 'mixed_precision': True, 'param_dtype': None, 'load': '/work/llm_recipes/models/Qwen2-0.5b-0.2', 'save': '/work/llm_recipes/models/Qwen2-0.5b-0.2', 'base_model': '/share/pretrained_lm/Qwen/Qwen2-0.5B', 'use_better_transformer': False, 'grad_clip_norm': 1.0, 'eval_interval': 10, 'save_interval': 10, 'eval_iters': 10, 'optimizer': 'anyprecision', 'lr': 2e-05, 'lr_decay_style': 'cosine', 'lr_decay_iters': 7500, 'lr_warmup_iters': 500, 'min_lr': 1e-06, 'train_iters': 7500, 'train_samples': None, 'global_batch_size': 640, 'micro_batch_size': 5, 'make_vocab_size_divisible_by': 128, 'sliding_window_size': 131072, 'skip_batch': None, 'no_save_optimizer_state': False, 'continual_pretraining': False, 'instruction_tuning': False, 'direct_preference_optimization': False, 'attention_dropout': 0.1, 'hidden_dropout': 0.1, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-06, 'hf_transformer_model_dir': None, 'instruction_train_data_path': None, 'instruction_valid_data_path': None, 'epoch': None, 'instruction_dataset_size': None, 'save_sampler_state': False, 'label_smoothing': 0.0, 'save_n_checkpoints': 10, 'hf_repo_id': 'koichi12/Qwen2-0.5b-0.2', 'create_public_hf_repo': False, 'upload_all_checkpoints_to_hf': True, 'hf_upload_retry_limit': 2, 'exit_duration_in_mins': None, 'source_key': None, 'target_key': None, 'attn_implementation': 'flash_attention_2', 'efficient_instruction_tuning': False, 'remove_padding_masking': False, 'save_start_iter': None, 'valid_micro_batch_size': 1, 'rank': 0, 'world_size': 1, 'padded_vocab_size': 151680, 'gradient_accumulation_steps': 128}
|
13 |
+
2024-08-23 16:25:43,197 INFO MainThread:11213 [wandb_init.py:init():616] starting backend
|
14 |
+
2024-08-23 16:25:43,197 INFO MainThread:11213 [wandb_init.py:init():620] setting up manager
|
15 |
+
2024-08-23 16:25:43,201 INFO MainThread:11213 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
|
16 |
+
2024-08-23 16:25:43,202 INFO MainThread:11213 [wandb_init.py:init():628] backend started and connected
|
17 |
+
2024-08-23 16:25:43,207 INFO MainThread:11213 [wandb_init.py:init():720] updated telemetry
|
18 |
+
2024-08-23 16:25:43,218 INFO MainThread:11213 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
|
19 |
+
2024-08-23 16:25:43,667 INFO MainThread:11213 [wandb_run.py:_on_init():2262] communicating current version
|
20 |
+
2024-08-23 16:25:43,692 INFO MainThread:11213 [wandb_run.py:_on_init():2271] got version response upgrade_message: "wandb version 0.17.7 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
|
21 |
+
|
22 |
+
2024-08-23 16:25:43,692 INFO MainThread:11213 [wandb_init.py:init():804] starting run threads in backend
|
23 |
+
2024-08-23 16:25:43,790 INFO MainThread:11213 [wandb_run.py:_console_start():2241] atexit reg
|
24 |
+
2024-08-23 16:25:43,790 INFO MainThread:11213 [wandb_run.py:_redirect():2096] redirect: wrap_raw
|
25 |
+
2024-08-23 16:25:43,790 INFO MainThread:11213 [wandb_run.py:_redirect():2161] Wrapping output streams.
|
26 |
+
2024-08-23 16:25:43,790 INFO MainThread:11213 [wandb_run.py:_redirect():2186] Redirects installed.
|
27 |
+
2024-08-23 16:25:43,791 INFO MainThread:11213 [wandb_init.py:init():847] run started, returning control to user process
|
28 |
+
2024-08-23 16:25:51,139 INFO MainThread:11213 [wandb_run.py:_config_callback():1343] config_cb None None {'model_architecture': 'Qwen2ForCausalLM', 'activation_function': 'silu', 'hidden_size': 896, 'model_type': 'qwen2', 'max_position_embeddings': 4096, 'num_attention_heads': 14, 'num_hidden_layers': 24}
|
29 |
+
2024-08-23 16:25:51,139 INFO MainThread:11213 [wandb_run.py:_config_callback():1343] config_cb None None {'world_size': 1}
|
30 |
+
2024-08-23 16:25:57,685 WARNING MsgRouterThr:11213 [router.py:message_loop():77] message_loop has been closed
|
wandb/run-20240823_162543-eroprw00/run-eroprw00.wandb
ADDED
Binary file (18.1 kB). View file
|
|