m4lw4r3exe commited on
Commit
ddf00f9
·
1 Parent(s): 371f79e

Upload with huggingface_hub

Browse files
Files changed (46) hide show
  1. checkpoint-32768/config.json +33 -0
  2. checkpoint-32768/optimizer.pt +3 -0
  3. checkpoint-32768/pytorch_model.bin +3 -0
  4. checkpoint-32768/rng_state.pth +3 -0
  5. checkpoint-32768/scaler.pt +3 -0
  6. checkpoint-32768/scheduler.pt +3 -0
  7. checkpoint-32768/trainer_state.json +128 -0
  8. checkpoint-32768/training_args.bin +3 -0
  9. checkpoint-49152/config.json +33 -0
  10. checkpoint-49152/optimizer.pt +3 -0
  11. checkpoint-49152/pytorch_model.bin +3 -0
  12. checkpoint-49152/rng_state.pth +3 -0
  13. checkpoint-49152/scaler.pt +3 -0
  14. checkpoint-49152/scheduler.pt +3 -0
  15. checkpoint-49152/trainer_state.json +184 -0
  16. checkpoint-49152/training_args.bin +3 -0
  17. checkpoint-65536/config.json +33 -0
  18. checkpoint-65536/optimizer.pt +3 -0
  19. checkpoint-65536/pytorch_model.bin +3 -0
  20. checkpoint-65536/rng_state.pth +3 -0
  21. checkpoint-65536/scaler.pt +3 -0
  22. checkpoint-65536/scheduler.pt +3 -0
  23. checkpoint-65536/trainer_state.json +240 -0
  24. checkpoint-65536/training_args.bin +3 -0
  25. checkpoint-81920/config.json +33 -0
  26. checkpoint-81920/optimizer.pt +3 -0
  27. checkpoint-81920/pytorch_model.bin +3 -0
  28. checkpoint-81920/rng_state.pth +3 -0
  29. checkpoint-81920/scaler.pt +3 -0
  30. checkpoint-81920/scheduler.pt +3 -0
  31. checkpoint-81920/trainer_state.json +296 -0
  32. checkpoint-81920/training_args.bin +3 -0
  33. checkpoint-98304/config.json +33 -0
  34. checkpoint-98304/optimizer.pt +3 -0
  35. checkpoint-98304/pytorch_model.bin +3 -0
  36. checkpoint-98304/rng_state.pth +3 -0
  37. checkpoint-98304/scaler.pt +3 -0
  38. checkpoint-98304/scheduler.pt +3 -0
  39. checkpoint-98304/trainer_state.json +352 -0
  40. checkpoint-98304/training_args.bin +3 -0
  41. config.json +1 -0
  42. pytorch_model.bin +1 -1
  43. tokenizer_config.json +2 -0
  44. trainer_state.json +6 -240
  45. training_args.bin +2 -2
  46. training_args.json +109 -0
checkpoint-32768/config.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "JammyMachina/elec-gmusic-familized-model-13-12__17-35-53",
3
+ "activation_function": "gelu_new",
4
+ "architectures": [
5
+ "GPT2LMHeadModel"
6
+ ],
7
+ "attn_pdrop": 0.1,
8
+ "bos_token_id": 50256,
9
+ "embd_pdrop": 0.1,
10
+ "eos_token_id": 50256,
11
+ "initializer_range": 0.02,
12
+ "layer_norm_epsilon": 1e-05,
13
+ "model_type": "gpt2",
14
+ "n_embd": 512,
15
+ "n_head": 8,
16
+ "n_inner": null,
17
+ "n_layer": 6,
18
+ "n_positions": 2048,
19
+ "pad_token_id": 1,
20
+ "reorder_and_upcast_attn": false,
21
+ "resid_pdrop": 0.1,
22
+ "scale_attn_by_inverse_layer_idx": false,
23
+ "scale_attn_weights": true,
24
+ "summary_activation": null,
25
+ "summary_first_dropout": 0.1,
26
+ "summary_proj_to_labels": true,
27
+ "summary_type": "cls_index",
28
+ "summary_use_proj": true,
29
+ "torch_dtype": "float32",
30
+ "transformers_version": "4.26.0.dev0",
31
+ "use_cache": true,
32
+ "vocab_size": 301
33
+ }
checkpoint-32768/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9add9f9c673fb9dacc6d360c670bd07152587baa24856badfb2669592cda79b2
3
+ size 160988613
checkpoint-32768/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7c57b09875ca8fb416a4c7eaf39ed7657e42a854e2a7794bf2eb8670be7bc636
3
+ size 105666297
checkpoint-32768/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c0e846988878ee392ba95856b0d816bedc58b53e8be2f837eee01d874a56290c
3
+ size 15597
checkpoint-32768/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0476bbc1d998a71cdb7a2d1d684f9164a4046fa04ed4b9a5fed728f44b838df7
3
+ size 557
checkpoint-32768/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0fd76926d9e4e10aa4dab849e57a90218cb4d2299731d1ea61125621a3e80c55
3
+ size 627
checkpoint-32768/trainer_state.json ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 1.7906010928961749,
5
+ "global_step": 32768,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.22,
12
+ "learning_rate": 0.0004984426897459585,
13
+ "loss": 0.5454,
14
+ "step": 4096
15
+ },
16
+ {
17
+ "epoch": 0.22,
18
+ "eval_loss": 0.5005695223808289,
19
+ "eval_runtime": 32.9622,
20
+ "eval_samples_per_second": 104.271,
21
+ "eval_steps_per_second": 6.523,
22
+ "step": 4096
23
+ },
24
+ {
25
+ "epoch": 0.45,
26
+ "learning_rate": 0.0004934687023955681,
27
+ "loss": 0.4849,
28
+ "step": 8192
29
+ },
30
+ {
31
+ "epoch": 0.45,
32
+ "eval_loss": 0.480915367603302,
33
+ "eval_runtime": 32.9785,
34
+ "eval_samples_per_second": 104.219,
35
+ "eval_steps_per_second": 6.519,
36
+ "step": 8192
37
+ },
38
+ {
39
+ "epoch": 0.67,
40
+ "learning_rate": 0.000485144849673373,
41
+ "loss": 0.4518,
42
+ "step": 12288
43
+ },
44
+ {
45
+ "epoch": 0.67,
46
+ "eval_loss": 0.46660953760147095,
47
+ "eval_runtime": 32.8208,
48
+ "eval_samples_per_second": 104.72,
49
+ "eval_steps_per_second": 6.551,
50
+ "step": 12288
51
+ },
52
+ {
53
+ "epoch": 0.9,
54
+ "learning_rate": 0.0004735848873631612,
55
+ "loss": 0.4282,
56
+ "step": 16384
57
+ },
58
+ {
59
+ "epoch": 0.9,
60
+ "eval_loss": 0.46097490191459656,
61
+ "eval_runtime": 32.8311,
62
+ "eval_samples_per_second": 104.687,
63
+ "eval_steps_per_second": 6.549,
64
+ "step": 16384
65
+ },
66
+ {
67
+ "epoch": 1.12,
68
+ "learning_rate": 0.0004589518403420676,
69
+ "loss": 0.4145,
70
+ "step": 20480
71
+ },
72
+ {
73
+ "epoch": 1.12,
74
+ "eval_loss": 0.45063599944114685,
75
+ "eval_runtime": 32.8912,
76
+ "eval_samples_per_second": 104.496,
77
+ "eval_steps_per_second": 6.537,
78
+ "step": 20480
79
+ },
80
+ {
81
+ "epoch": 1.34,
82
+ "learning_rate": 0.0004414445597486605,
83
+ "loss": 0.399,
84
+ "step": 24576
85
+ },
86
+ {
87
+ "epoch": 1.34,
88
+ "eval_loss": 0.44468095898628235,
89
+ "eval_runtime": 32.706,
90
+ "eval_samples_per_second": 105.088,
91
+ "eval_steps_per_second": 6.574,
92
+ "step": 24576
93
+ },
94
+ {
95
+ "epoch": 1.57,
96
+ "learning_rate": 0.00042130386669061293,
97
+ "loss": 0.3882,
98
+ "step": 28672
99
+ },
100
+ {
101
+ "epoch": 1.57,
102
+ "eval_loss": 0.44857361912727356,
103
+ "eval_runtime": 32.9754,
104
+ "eval_samples_per_second": 104.229,
105
+ "eval_steps_per_second": 6.52,
106
+ "step": 28672
107
+ },
108
+ {
109
+ "epoch": 1.79,
110
+ "learning_rate": 0.0003988010477498867,
111
+ "loss": 0.3767,
112
+ "step": 32768
113
+ },
114
+ {
115
+ "epoch": 1.79,
116
+ "eval_loss": 0.44354742765426636,
117
+ "eval_runtime": 32.8219,
118
+ "eval_samples_per_second": 104.717,
119
+ "eval_steps_per_second": 6.55,
120
+ "step": 32768
121
+ }
122
+ ],
123
+ "max_steps": 109800,
124
+ "num_train_epochs": 6,
125
+ "total_flos": 1.5232584290933146e+17,
126
+ "trial_name": null,
127
+ "trial_params": null
128
+ }
checkpoint-32768/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:908dc25e857dbdefbfe009b9ed0b992ce31760b393bf814f450b0d3c701c2397
3
+ size 3579
checkpoint-49152/config.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "JammyMachina/elec-gmusic-familized-model-13-12__17-35-53",
3
+ "activation_function": "gelu_new",
4
+ "architectures": [
5
+ "GPT2LMHeadModel"
6
+ ],
7
+ "attn_pdrop": 0.1,
8
+ "bos_token_id": 50256,
9
+ "embd_pdrop": 0.1,
10
+ "eos_token_id": 50256,
11
+ "initializer_range": 0.02,
12
+ "layer_norm_epsilon": 1e-05,
13
+ "model_type": "gpt2",
14
+ "n_embd": 512,
15
+ "n_head": 8,
16
+ "n_inner": null,
17
+ "n_layer": 6,
18
+ "n_positions": 2048,
19
+ "pad_token_id": 1,
20
+ "reorder_and_upcast_attn": false,
21
+ "resid_pdrop": 0.1,
22
+ "scale_attn_by_inverse_layer_idx": false,
23
+ "scale_attn_weights": true,
24
+ "summary_activation": null,
25
+ "summary_first_dropout": 0.1,
26
+ "summary_proj_to_labels": true,
27
+ "summary_type": "cls_index",
28
+ "summary_use_proj": true,
29
+ "torch_dtype": "float32",
30
+ "transformers_version": "4.26.0.dev0",
31
+ "use_cache": true,
32
+ "vocab_size": 301
33
+ }
checkpoint-49152/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:001f9615434776e6d4aa94c07af807eb5043c9673f22535b2a03f97e0c0f5825
3
+ size 160988613
checkpoint-49152/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9599fd4f855a04ce9eb096b3f6cec5e9e47d98d14c7fa1e905384d6e4b6d3cf7
3
+ size 105666297
checkpoint-49152/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8a6950114eefe2f94bfc94c66035c28b77f269c1e701f22b38469ba376d43750
3
+ size 15597
checkpoint-49152/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:af00b4a1d6f958e9aac32b8a85f495a006ab2769eab01b4b45246feb14c65a8f
3
+ size 557
checkpoint-49152/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d00168f4f79815bc924316879bd4fe7a203e69420404b9075ec9f83c377b6704
3
+ size 627
checkpoint-49152/trainer_state.json ADDED
@@ -0,0 +1,184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 2.6859016393442623,
5
+ "global_step": 49152,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.22,
12
+ "learning_rate": 0.0004984426897459585,
13
+ "loss": 0.5454,
14
+ "step": 4096
15
+ },
16
+ {
17
+ "epoch": 0.22,
18
+ "eval_loss": 0.5005695223808289,
19
+ "eval_runtime": 32.9622,
20
+ "eval_samples_per_second": 104.271,
21
+ "eval_steps_per_second": 6.523,
22
+ "step": 4096
23
+ },
24
+ {
25
+ "epoch": 0.45,
26
+ "learning_rate": 0.0004934687023955681,
27
+ "loss": 0.4849,
28
+ "step": 8192
29
+ },
30
+ {
31
+ "epoch": 0.45,
32
+ "eval_loss": 0.480915367603302,
33
+ "eval_runtime": 32.9785,
34
+ "eval_samples_per_second": 104.219,
35
+ "eval_steps_per_second": 6.519,
36
+ "step": 8192
37
+ },
38
+ {
39
+ "epoch": 0.67,
40
+ "learning_rate": 0.000485144849673373,
41
+ "loss": 0.4518,
42
+ "step": 12288
43
+ },
44
+ {
45
+ "epoch": 0.67,
46
+ "eval_loss": 0.46660953760147095,
47
+ "eval_runtime": 32.8208,
48
+ "eval_samples_per_second": 104.72,
49
+ "eval_steps_per_second": 6.551,
50
+ "step": 12288
51
+ },
52
+ {
53
+ "epoch": 0.9,
54
+ "learning_rate": 0.0004735848873631612,
55
+ "loss": 0.4282,
56
+ "step": 16384
57
+ },
58
+ {
59
+ "epoch": 0.9,
60
+ "eval_loss": 0.46097490191459656,
61
+ "eval_runtime": 32.8311,
62
+ "eval_samples_per_second": 104.687,
63
+ "eval_steps_per_second": 6.549,
64
+ "step": 16384
65
+ },
66
+ {
67
+ "epoch": 1.12,
68
+ "learning_rate": 0.0004589518403420676,
69
+ "loss": 0.4145,
70
+ "step": 20480
71
+ },
72
+ {
73
+ "epoch": 1.12,
74
+ "eval_loss": 0.45063599944114685,
75
+ "eval_runtime": 32.8912,
76
+ "eval_samples_per_second": 104.496,
77
+ "eval_steps_per_second": 6.537,
78
+ "step": 20480
79
+ },
80
+ {
81
+ "epoch": 1.34,
82
+ "learning_rate": 0.0004414445597486605,
83
+ "loss": 0.399,
84
+ "step": 24576
85
+ },
86
+ {
87
+ "epoch": 1.34,
88
+ "eval_loss": 0.44468095898628235,
89
+ "eval_runtime": 32.706,
90
+ "eval_samples_per_second": 105.088,
91
+ "eval_steps_per_second": 6.574,
92
+ "step": 24576
93
+ },
94
+ {
95
+ "epoch": 1.57,
96
+ "learning_rate": 0.00042130386669061293,
97
+ "loss": 0.3882,
98
+ "step": 28672
99
+ },
100
+ {
101
+ "epoch": 1.57,
102
+ "eval_loss": 0.44857361912727356,
103
+ "eval_runtime": 32.9754,
104
+ "eval_samples_per_second": 104.229,
105
+ "eval_steps_per_second": 6.52,
106
+ "step": 28672
107
+ },
108
+ {
109
+ "epoch": 1.79,
110
+ "learning_rate": 0.0003988010477498867,
111
+ "loss": 0.3767,
112
+ "step": 32768
113
+ },
114
+ {
115
+ "epoch": 1.79,
116
+ "eval_loss": 0.44354742765426636,
117
+ "eval_runtime": 32.8219,
118
+ "eval_samples_per_second": 104.717,
119
+ "eval_steps_per_second": 6.55,
120
+ "step": 32768
121
+ },
122
+ {
123
+ "epoch": 2.01,
124
+ "learning_rate": 0.0003742566178542921,
125
+ "loss": 0.3676,
126
+ "step": 36864
127
+ },
128
+ {
129
+ "epoch": 2.01,
130
+ "eval_loss": 0.43940743803977966,
131
+ "eval_runtime": 32.9279,
132
+ "eval_samples_per_second": 104.38,
133
+ "eval_steps_per_second": 6.529,
134
+ "step": 36864
135
+ },
136
+ {
137
+ "epoch": 2.24,
138
+ "learning_rate": 0.0003479963856008823,
139
+ "loss": 0.3577,
140
+ "step": 40960
141
+ },
142
+ {
143
+ "epoch": 2.24,
144
+ "eval_loss": 0.4323909878730774,
145
+ "eval_runtime": 33.0718,
146
+ "eval_samples_per_second": 103.926,
147
+ "eval_steps_per_second": 6.501,
148
+ "step": 40960
149
+ },
150
+ {
151
+ "epoch": 2.46,
152
+ "learning_rate": 0.0003203943839704654,
153
+ "loss": 0.3517,
154
+ "step": 45056
155
+ },
156
+ {
157
+ "epoch": 2.46,
158
+ "eval_loss": 0.43262797594070435,
159
+ "eval_runtime": 32.8978,
160
+ "eval_samples_per_second": 104.475,
161
+ "eval_steps_per_second": 6.535,
162
+ "step": 45056
163
+ },
164
+ {
165
+ "epoch": 2.69,
166
+ "learning_rate": 0.000291817008494138,
167
+ "loss": 0.3427,
168
+ "step": 49152
169
+ },
170
+ {
171
+ "epoch": 2.69,
172
+ "eval_loss": 0.4303751289844513,
173
+ "eval_runtime": 32.7306,
174
+ "eval_samples_per_second": 105.009,
175
+ "eval_steps_per_second": 6.569,
176
+ "step": 49152
177
+ }
178
+ ],
179
+ "max_steps": 109800,
180
+ "num_train_epochs": 6,
181
+ "total_flos": 2.28488651552981e+17,
182
+ "trial_name": null,
183
+ "trial_params": null
184
+ }
checkpoint-49152/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:908dc25e857dbdefbfe009b9ed0b992ce31760b393bf814f450b0d3c701c2397
3
+ size 3579
checkpoint-65536/config.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "JammyMachina/elec-gmusic-familized-model-13-12__17-35-53",
3
+ "activation_function": "gelu_new",
4
+ "architectures": [
5
+ "GPT2LMHeadModel"
6
+ ],
7
+ "attn_pdrop": 0.1,
8
+ "bos_token_id": 50256,
9
+ "embd_pdrop": 0.1,
10
+ "eos_token_id": 50256,
11
+ "initializer_range": 0.02,
12
+ "layer_norm_epsilon": 1e-05,
13
+ "model_type": "gpt2",
14
+ "n_embd": 512,
15
+ "n_head": 8,
16
+ "n_inner": null,
17
+ "n_layer": 6,
18
+ "n_positions": 2048,
19
+ "pad_token_id": 1,
20
+ "reorder_and_upcast_attn": false,
21
+ "resid_pdrop": 0.1,
22
+ "scale_attn_by_inverse_layer_idx": false,
23
+ "scale_attn_weights": true,
24
+ "summary_activation": null,
25
+ "summary_first_dropout": 0.1,
26
+ "summary_proj_to_labels": true,
27
+ "summary_type": "cls_index",
28
+ "summary_use_proj": true,
29
+ "torch_dtype": "float32",
30
+ "transformers_version": "4.26.0.dev0",
31
+ "use_cache": true,
32
+ "vocab_size": 301
33
+ }
checkpoint-65536/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:54e0920bdfddaabac8a5ad21d51caeec5cd54da9f757ebcddd88727e65e3b0fa
3
+ size 160988613
checkpoint-65536/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0f332d8ba3274b68f5a8be51675fb17c52517bb91afa146b216152fe7b79e3ec
3
+ size 105666297
checkpoint-65536/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aee57e1a8e7a5046bc0aefde8c83d8da10e21f99a4041fe35e9fa626d36ad6df
3
+ size 15597
checkpoint-65536/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:90eab177fcd8a8c5027f1aae9016833d3712bdfd8d1ad47bc6118d56bd80b1e2
3
+ size 557
checkpoint-65536/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:21d15ba8d37dc81eb86a1c0e1d8d668e69692d75bf075e2f21975aeca452668a
3
+ size 627
checkpoint-65536/trainer_state.json ADDED
@@ -0,0 +1,240 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 3.5812021857923497,
5
+ "global_step": 65536,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.22,
12
+ "learning_rate": 0.0004984426897459585,
13
+ "loss": 0.5454,
14
+ "step": 4096
15
+ },
16
+ {
17
+ "epoch": 0.22,
18
+ "eval_loss": 0.5005695223808289,
19
+ "eval_runtime": 32.9622,
20
+ "eval_samples_per_second": 104.271,
21
+ "eval_steps_per_second": 6.523,
22
+ "step": 4096
23
+ },
24
+ {
25
+ "epoch": 0.45,
26
+ "learning_rate": 0.0004934687023955681,
27
+ "loss": 0.4849,
28
+ "step": 8192
29
+ },
30
+ {
31
+ "epoch": 0.45,
32
+ "eval_loss": 0.480915367603302,
33
+ "eval_runtime": 32.9785,
34
+ "eval_samples_per_second": 104.219,
35
+ "eval_steps_per_second": 6.519,
36
+ "step": 8192
37
+ },
38
+ {
39
+ "epoch": 0.67,
40
+ "learning_rate": 0.000485144849673373,
41
+ "loss": 0.4518,
42
+ "step": 12288
43
+ },
44
+ {
45
+ "epoch": 0.67,
46
+ "eval_loss": 0.46660953760147095,
47
+ "eval_runtime": 32.8208,
48
+ "eval_samples_per_second": 104.72,
49
+ "eval_steps_per_second": 6.551,
50
+ "step": 12288
51
+ },
52
+ {
53
+ "epoch": 0.9,
54
+ "learning_rate": 0.0004735848873631612,
55
+ "loss": 0.4282,
56
+ "step": 16384
57
+ },
58
+ {
59
+ "epoch": 0.9,
60
+ "eval_loss": 0.46097490191459656,
61
+ "eval_runtime": 32.8311,
62
+ "eval_samples_per_second": 104.687,
63
+ "eval_steps_per_second": 6.549,
64
+ "step": 16384
65
+ },
66
+ {
67
+ "epoch": 1.12,
68
+ "learning_rate": 0.0004589518403420676,
69
+ "loss": 0.4145,
70
+ "step": 20480
71
+ },
72
+ {
73
+ "epoch": 1.12,
74
+ "eval_loss": 0.45063599944114685,
75
+ "eval_runtime": 32.8912,
76
+ "eval_samples_per_second": 104.496,
77
+ "eval_steps_per_second": 6.537,
78
+ "step": 20480
79
+ },
80
+ {
81
+ "epoch": 1.34,
82
+ "learning_rate": 0.0004414445597486605,
83
+ "loss": 0.399,
84
+ "step": 24576
85
+ },
86
+ {
87
+ "epoch": 1.34,
88
+ "eval_loss": 0.44468095898628235,
89
+ "eval_runtime": 32.706,
90
+ "eval_samples_per_second": 105.088,
91
+ "eval_steps_per_second": 6.574,
92
+ "step": 24576
93
+ },
94
+ {
95
+ "epoch": 1.57,
96
+ "learning_rate": 0.00042130386669061293,
97
+ "loss": 0.3882,
98
+ "step": 28672
99
+ },
100
+ {
101
+ "epoch": 1.57,
102
+ "eval_loss": 0.44857361912727356,
103
+ "eval_runtime": 32.9754,
104
+ "eval_samples_per_second": 104.229,
105
+ "eval_steps_per_second": 6.52,
106
+ "step": 28672
107
+ },
108
+ {
109
+ "epoch": 1.79,
110
+ "learning_rate": 0.0003988010477498867,
111
+ "loss": 0.3767,
112
+ "step": 32768
113
+ },
114
+ {
115
+ "epoch": 1.79,
116
+ "eval_loss": 0.44354742765426636,
117
+ "eval_runtime": 32.8219,
118
+ "eval_samples_per_second": 104.717,
119
+ "eval_steps_per_second": 6.55,
120
+ "step": 32768
121
+ },
122
+ {
123
+ "epoch": 2.01,
124
+ "learning_rate": 0.0003742566178542921,
125
+ "loss": 0.3676,
126
+ "step": 36864
127
+ },
128
+ {
129
+ "epoch": 2.01,
130
+ "eval_loss": 0.43940743803977966,
131
+ "eval_runtime": 32.9279,
132
+ "eval_samples_per_second": 104.38,
133
+ "eval_steps_per_second": 6.529,
134
+ "step": 36864
135
+ },
136
+ {
137
+ "epoch": 2.24,
138
+ "learning_rate": 0.0003479963856008823,
139
+ "loss": 0.3577,
140
+ "step": 40960
141
+ },
142
+ {
143
+ "epoch": 2.24,
144
+ "eval_loss": 0.4323909878730774,
145
+ "eval_runtime": 33.0718,
146
+ "eval_samples_per_second": 103.926,
147
+ "eval_steps_per_second": 6.501,
148
+ "step": 40960
149
+ },
150
+ {
151
+ "epoch": 2.46,
152
+ "learning_rate": 0.0003203943839704654,
153
+ "loss": 0.3517,
154
+ "step": 45056
155
+ },
156
+ {
157
+ "epoch": 2.46,
158
+ "eval_loss": 0.43262797594070435,
159
+ "eval_runtime": 32.8978,
160
+ "eval_samples_per_second": 104.475,
161
+ "eval_steps_per_second": 6.535,
162
+ "step": 45056
163
+ },
164
+ {
165
+ "epoch": 2.69,
166
+ "learning_rate": 0.000291817008494138,
167
+ "loss": 0.3427,
168
+ "step": 49152
169
+ },
170
+ {
171
+ "epoch": 2.69,
172
+ "eval_loss": 0.4303751289844513,
173
+ "eval_runtime": 32.7306,
174
+ "eval_samples_per_second": 105.009,
175
+ "eval_steps_per_second": 6.569,
176
+ "step": 49152
177
+ },
178
+ {
179
+ "epoch": 2.91,
180
+ "learning_rate": 0.0002626784527987937,
181
+ "loss": 0.3344,
182
+ "step": 53248
183
+ },
184
+ {
185
+ "epoch": 2.91,
186
+ "eval_loss": 0.4269418716430664,
187
+ "eval_runtime": 32.8164,
188
+ "eval_samples_per_second": 104.734,
189
+ "eval_steps_per_second": 6.552,
190
+ "step": 53248
191
+ },
192
+ {
193
+ "epoch": 3.13,
194
+ "learning_rate": 0.00023335128365042213,
195
+ "loss": 0.3242,
196
+ "step": 57344
197
+ },
198
+ {
199
+ "epoch": 3.13,
200
+ "eval_loss": 0.4315994679927826,
201
+ "eval_runtime": 32.82,
202
+ "eval_samples_per_second": 104.723,
203
+ "eval_steps_per_second": 6.551,
204
+ "step": 57344
205
+ },
206
+ {
207
+ "epoch": 3.36,
208
+ "learning_rate": 0.00020425323740515426,
209
+ "loss": 0.3163,
210
+ "step": 61440
211
+ },
212
+ {
213
+ "epoch": 3.36,
214
+ "eval_loss": 0.418425053358078,
215
+ "eval_runtime": 32.815,
216
+ "eval_samples_per_second": 104.739,
217
+ "eval_steps_per_second": 6.552,
218
+ "step": 61440
219
+ },
220
+ {
221
+ "epoch": 3.58,
222
+ "learning_rate": 0.0001757916101204877,
223
+ "loss": 0.3099,
224
+ "step": 65536
225
+ },
226
+ {
227
+ "epoch": 3.58,
228
+ "eval_loss": 0.42424410581588745,
229
+ "eval_runtime": 32.8877,
230
+ "eval_samples_per_second": 104.507,
231
+ "eval_steps_per_second": 6.537,
232
+ "step": 65536
233
+ }
234
+ ],
235
+ "max_steps": 109800,
236
+ "num_train_epochs": 6,
237
+ "total_flos": 3.04651444307755e+17,
238
+ "trial_name": null,
239
+ "trial_params": null
240
+ }
checkpoint-65536/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:908dc25e857dbdefbfe009b9ed0b992ce31760b393bf814f450b0d3c701c2397
3
+ size 3579
checkpoint-81920/config.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "JammyMachina/elec-gmusic-familized-model-13-12__17-35-53",
3
+ "activation_function": "gelu_new",
4
+ "architectures": [
5
+ "GPT2LMHeadModel"
6
+ ],
7
+ "attn_pdrop": 0.1,
8
+ "bos_token_id": 50256,
9
+ "embd_pdrop": 0.1,
10
+ "eos_token_id": 50256,
11
+ "initializer_range": 0.02,
12
+ "layer_norm_epsilon": 1e-05,
13
+ "model_type": "gpt2",
14
+ "n_embd": 512,
15
+ "n_head": 8,
16
+ "n_inner": null,
17
+ "n_layer": 6,
18
+ "n_positions": 2048,
19
+ "pad_token_id": 1,
20
+ "reorder_and_upcast_attn": false,
21
+ "resid_pdrop": 0.1,
22
+ "scale_attn_by_inverse_layer_idx": false,
23
+ "scale_attn_weights": true,
24
+ "summary_activation": null,
25
+ "summary_first_dropout": 0.1,
26
+ "summary_proj_to_labels": true,
27
+ "summary_type": "cls_index",
28
+ "summary_use_proj": true,
29
+ "torch_dtype": "float32",
30
+ "transformers_version": "4.26.0.dev0",
31
+ "use_cache": true,
32
+ "vocab_size": 301
33
+ }
checkpoint-81920/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bb7ab9bbb244c55a8ca5951e45289da72d312a6ad27c5901d57fb6561c6dc33b
3
+ size 160988741
checkpoint-81920/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dc955d862610702341bec6b458ab0ea3b529ebc7b3ae659f9e19082059b9a035
3
+ size 105666297
checkpoint-81920/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fffffe398af585e9cd6ad8216a86bf956ee56132206e36e91b478413b19ff1c6
3
+ size 15597
checkpoint-81920/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a64898410c19ad90c047ca0716ad9cc322ded2ffe815cb5e76e91a927485c411
3
+ size 557
checkpoint-81920/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:54666b8c2f968c26d6f8c387e7050becbe28e016974d1d1a3980d380af65ff37
3
+ size 627
checkpoint-81920/trainer_state.json ADDED
@@ -0,0 +1,296 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 4.476502732240437,
5
+ "global_step": 81920,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.22,
12
+ "learning_rate": 0.0004984426897459585,
13
+ "loss": 0.5454,
14
+ "step": 4096
15
+ },
16
+ {
17
+ "epoch": 0.22,
18
+ "eval_loss": 0.5005695223808289,
19
+ "eval_runtime": 32.9622,
20
+ "eval_samples_per_second": 104.271,
21
+ "eval_steps_per_second": 6.523,
22
+ "step": 4096
23
+ },
24
+ {
25
+ "epoch": 0.45,
26
+ "learning_rate": 0.0004934687023955681,
27
+ "loss": 0.4849,
28
+ "step": 8192
29
+ },
30
+ {
31
+ "epoch": 0.45,
32
+ "eval_loss": 0.480915367603302,
33
+ "eval_runtime": 32.9785,
34
+ "eval_samples_per_second": 104.219,
35
+ "eval_steps_per_second": 6.519,
36
+ "step": 8192
37
+ },
38
+ {
39
+ "epoch": 0.67,
40
+ "learning_rate": 0.000485144849673373,
41
+ "loss": 0.4518,
42
+ "step": 12288
43
+ },
44
+ {
45
+ "epoch": 0.67,
46
+ "eval_loss": 0.46660953760147095,
47
+ "eval_runtime": 32.8208,
48
+ "eval_samples_per_second": 104.72,
49
+ "eval_steps_per_second": 6.551,
50
+ "step": 12288
51
+ },
52
+ {
53
+ "epoch": 0.9,
54
+ "learning_rate": 0.0004735848873631612,
55
+ "loss": 0.4282,
56
+ "step": 16384
57
+ },
58
+ {
59
+ "epoch": 0.9,
60
+ "eval_loss": 0.46097490191459656,
61
+ "eval_runtime": 32.8311,
62
+ "eval_samples_per_second": 104.687,
63
+ "eval_steps_per_second": 6.549,
64
+ "step": 16384
65
+ },
66
+ {
67
+ "epoch": 1.12,
68
+ "learning_rate": 0.0004589518403420676,
69
+ "loss": 0.4145,
70
+ "step": 20480
71
+ },
72
+ {
73
+ "epoch": 1.12,
74
+ "eval_loss": 0.45063599944114685,
75
+ "eval_runtime": 32.8912,
76
+ "eval_samples_per_second": 104.496,
77
+ "eval_steps_per_second": 6.537,
78
+ "step": 20480
79
+ },
80
+ {
81
+ "epoch": 1.34,
82
+ "learning_rate": 0.0004414445597486605,
83
+ "loss": 0.399,
84
+ "step": 24576
85
+ },
86
+ {
87
+ "epoch": 1.34,
88
+ "eval_loss": 0.44468095898628235,
89
+ "eval_runtime": 32.706,
90
+ "eval_samples_per_second": 105.088,
91
+ "eval_steps_per_second": 6.574,
92
+ "step": 24576
93
+ },
94
+ {
95
+ "epoch": 1.57,
96
+ "learning_rate": 0.00042130386669061293,
97
+ "loss": 0.3882,
98
+ "step": 28672
99
+ },
100
+ {
101
+ "epoch": 1.57,
102
+ "eval_loss": 0.44857361912727356,
103
+ "eval_runtime": 32.9754,
104
+ "eval_samples_per_second": 104.229,
105
+ "eval_steps_per_second": 6.52,
106
+ "step": 28672
107
+ },
108
+ {
109
+ "epoch": 1.79,
110
+ "learning_rate": 0.0003988010477498867,
111
+ "loss": 0.3767,
112
+ "step": 32768
113
+ },
114
+ {
115
+ "epoch": 1.79,
116
+ "eval_loss": 0.44354742765426636,
117
+ "eval_runtime": 32.8219,
118
+ "eval_samples_per_second": 104.717,
119
+ "eval_steps_per_second": 6.55,
120
+ "step": 32768
121
+ },
122
+ {
123
+ "epoch": 2.01,
124
+ "learning_rate": 0.0003742566178542921,
125
+ "loss": 0.3676,
126
+ "step": 36864
127
+ },
128
+ {
129
+ "epoch": 2.01,
130
+ "eval_loss": 0.43940743803977966,
131
+ "eval_runtime": 32.9279,
132
+ "eval_samples_per_second": 104.38,
133
+ "eval_steps_per_second": 6.529,
134
+ "step": 36864
135
+ },
136
+ {
137
+ "epoch": 2.24,
138
+ "learning_rate": 0.0003479963856008823,
139
+ "loss": 0.3577,
140
+ "step": 40960
141
+ },
142
+ {
143
+ "epoch": 2.24,
144
+ "eval_loss": 0.4323909878730774,
145
+ "eval_runtime": 33.0718,
146
+ "eval_samples_per_second": 103.926,
147
+ "eval_steps_per_second": 6.501,
148
+ "step": 40960
149
+ },
150
+ {
151
+ "epoch": 2.46,
152
+ "learning_rate": 0.0003203943839704654,
153
+ "loss": 0.3517,
154
+ "step": 45056
155
+ },
156
+ {
157
+ "epoch": 2.46,
158
+ "eval_loss": 0.43262797594070435,
159
+ "eval_runtime": 32.8978,
160
+ "eval_samples_per_second": 104.475,
161
+ "eval_steps_per_second": 6.535,
162
+ "step": 45056
163
+ },
164
+ {
165
+ "epoch": 2.69,
166
+ "learning_rate": 0.000291817008494138,
167
+ "loss": 0.3427,
168
+ "step": 49152
169
+ },
170
+ {
171
+ "epoch": 2.69,
172
+ "eval_loss": 0.4303751289844513,
173
+ "eval_runtime": 32.7306,
174
+ "eval_samples_per_second": 105.009,
175
+ "eval_steps_per_second": 6.569,
176
+ "step": 49152
177
+ },
178
+ {
179
+ "epoch": 2.91,
180
+ "learning_rate": 0.0002626784527987937,
181
+ "loss": 0.3344,
182
+ "step": 53248
183
+ },
184
+ {
185
+ "epoch": 2.91,
186
+ "eval_loss": 0.4269418716430664,
187
+ "eval_runtime": 32.8164,
188
+ "eval_samples_per_second": 104.734,
189
+ "eval_steps_per_second": 6.552,
190
+ "step": 53248
191
+ },
192
+ {
193
+ "epoch": 3.13,
194
+ "learning_rate": 0.00023335128365042213,
195
+ "loss": 0.3242,
196
+ "step": 57344
197
+ },
198
+ {
199
+ "epoch": 3.13,
200
+ "eval_loss": 0.4315994679927826,
201
+ "eval_runtime": 32.82,
202
+ "eval_samples_per_second": 104.723,
203
+ "eval_steps_per_second": 6.551,
204
+ "step": 57344
205
+ },
206
+ {
207
+ "epoch": 3.36,
208
+ "learning_rate": 0.00020425323740515426,
209
+ "loss": 0.3163,
210
+ "step": 61440
211
+ },
212
+ {
213
+ "epoch": 3.36,
214
+ "eval_loss": 0.418425053358078,
215
+ "eval_runtime": 32.815,
216
+ "eval_samples_per_second": 104.739,
217
+ "eval_steps_per_second": 6.552,
218
+ "step": 61440
219
+ },
220
+ {
221
+ "epoch": 3.58,
222
+ "learning_rate": 0.0001757916101204877,
223
+ "loss": 0.3099,
224
+ "step": 65536
225
+ },
226
+ {
227
+ "epoch": 3.58,
228
+ "eval_loss": 0.42424410581588745,
229
+ "eval_runtime": 32.8877,
230
+ "eval_samples_per_second": 104.507,
231
+ "eval_steps_per_second": 6.537,
232
+ "step": 65536
233
+ },
234
+ {
235
+ "epoch": 3.81,
236
+ "learning_rate": 0.00014835075511133623,
237
+ "loss": 0.3014,
238
+ "step": 69632
239
+ },
240
+ {
241
+ "epoch": 3.81,
242
+ "eval_loss": 0.4214063882827759,
243
+ "eval_runtime": 32.7528,
244
+ "eval_samples_per_second": 104.938,
245
+ "eval_steps_per_second": 6.564,
246
+ "step": 69632
247
+ },
248
+ {
249
+ "epoch": 4.03,
250
+ "learning_rate": 0.00012230813460340284,
251
+ "loss": 0.2922,
252
+ "step": 73728
253
+ },
254
+ {
255
+ "epoch": 4.03,
256
+ "eval_loss": 0.41899624466896057,
257
+ "eval_runtime": 32.874,
258
+ "eval_samples_per_second": 104.551,
259
+ "eval_steps_per_second": 6.54,
260
+ "step": 73728
261
+ },
262
+ {
263
+ "epoch": 4.25,
264
+ "learning_rate": 9.802197743091277e-05,
265
+ "loss": 0.2824,
266
+ "step": 77824
267
+ },
268
+ {
269
+ "epoch": 4.25,
270
+ "eval_loss": 0.4261378347873688,
271
+ "eval_runtime": 32.9867,
272
+ "eval_samples_per_second": 104.193,
273
+ "eval_steps_per_second": 6.518,
274
+ "step": 77824
275
+ },
276
+ {
277
+ "epoch": 4.48,
278
+ "learning_rate": 7.582635142582842e-05,
279
+ "loss": 0.2763,
280
+ "step": 81920
281
+ },
282
+ {
283
+ "epoch": 4.48,
284
+ "eval_loss": 0.41689732670783997,
285
+ "eval_runtime": 32.8378,
286
+ "eval_samples_per_second": 104.666,
287
+ "eval_steps_per_second": 6.547,
288
+ "step": 81920
289
+ }
290
+ ],
291
+ "max_steps": 109800,
292
+ "num_train_epochs": 6,
293
+ "total_flos": 3.808142393323684e+17,
294
+ "trial_name": null,
295
+ "trial_params": null
296
+ }
checkpoint-81920/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:908dc25e857dbdefbfe009b9ed0b992ce31760b393bf814f450b0d3c701c2397
3
+ size 3579
checkpoint-98304/config.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "JammyMachina/elec-gmusic-familized-model-13-12__17-35-53",
3
+ "activation_function": "gelu_new",
4
+ "architectures": [
5
+ "GPT2LMHeadModel"
6
+ ],
7
+ "attn_pdrop": 0.1,
8
+ "bos_token_id": 50256,
9
+ "embd_pdrop": 0.1,
10
+ "eos_token_id": 50256,
11
+ "initializer_range": 0.02,
12
+ "layer_norm_epsilon": 1e-05,
13
+ "model_type": "gpt2",
14
+ "n_embd": 512,
15
+ "n_head": 8,
16
+ "n_inner": null,
17
+ "n_layer": 6,
18
+ "n_positions": 2048,
19
+ "pad_token_id": 1,
20
+ "reorder_and_upcast_attn": false,
21
+ "resid_pdrop": 0.1,
22
+ "scale_attn_by_inverse_layer_idx": false,
23
+ "scale_attn_weights": true,
24
+ "summary_activation": null,
25
+ "summary_first_dropout": 0.1,
26
+ "summary_proj_to_labels": true,
27
+ "summary_type": "cls_index",
28
+ "summary_use_proj": true,
29
+ "torch_dtype": "float32",
30
+ "transformers_version": "4.26.0.dev0",
31
+ "use_cache": true,
32
+ "vocab_size": 301
33
+ }
checkpoint-98304/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:014913b3bebc5949772972bad9abd8cc1441a32ccaaf6ce6e39aa7ee451e91c2
3
+ size 160988741
checkpoint-98304/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:21daa275d207208ec6719f04b377544f2493361f841335c2d95e2485bff9e8dc
3
+ size 105666297
checkpoint-98304/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:039e2c727e8a69005845078a1c0c187ee9f4ff546022459e2229559ea422dc92
3
+ size 15597
checkpoint-98304/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:27e821a12d2ed8ac74718c23a701a5ce5698416f02e361928a1405fe12821f39
3
+ size 557
checkpoint-98304/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c9ebba6536216c5fa93d6a817ad01a3044ab2fe46d5b6a237deafc08eb421d6d
3
+ size 627
checkpoint-98304/trainer_state.json ADDED
@@ -0,0 +1,352 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 5.371803278688525,
5
+ "global_step": 98304,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.22,
12
+ "learning_rate": 0.0004984426897459585,
13
+ "loss": 0.5454,
14
+ "step": 4096
15
+ },
16
+ {
17
+ "epoch": 0.22,
18
+ "eval_loss": 0.5005695223808289,
19
+ "eval_runtime": 32.9622,
20
+ "eval_samples_per_second": 104.271,
21
+ "eval_steps_per_second": 6.523,
22
+ "step": 4096
23
+ },
24
+ {
25
+ "epoch": 0.45,
26
+ "learning_rate": 0.0004934687023955681,
27
+ "loss": 0.4849,
28
+ "step": 8192
29
+ },
30
+ {
31
+ "epoch": 0.45,
32
+ "eval_loss": 0.480915367603302,
33
+ "eval_runtime": 32.9785,
34
+ "eval_samples_per_second": 104.219,
35
+ "eval_steps_per_second": 6.519,
36
+ "step": 8192
37
+ },
38
+ {
39
+ "epoch": 0.67,
40
+ "learning_rate": 0.000485144849673373,
41
+ "loss": 0.4518,
42
+ "step": 12288
43
+ },
44
+ {
45
+ "epoch": 0.67,
46
+ "eval_loss": 0.46660953760147095,
47
+ "eval_runtime": 32.8208,
48
+ "eval_samples_per_second": 104.72,
49
+ "eval_steps_per_second": 6.551,
50
+ "step": 12288
51
+ },
52
+ {
53
+ "epoch": 0.9,
54
+ "learning_rate": 0.0004735848873631612,
55
+ "loss": 0.4282,
56
+ "step": 16384
57
+ },
58
+ {
59
+ "epoch": 0.9,
60
+ "eval_loss": 0.46097490191459656,
61
+ "eval_runtime": 32.8311,
62
+ "eval_samples_per_second": 104.687,
63
+ "eval_steps_per_second": 6.549,
64
+ "step": 16384
65
+ },
66
+ {
67
+ "epoch": 1.12,
68
+ "learning_rate": 0.0004589518403420676,
69
+ "loss": 0.4145,
70
+ "step": 20480
71
+ },
72
+ {
73
+ "epoch": 1.12,
74
+ "eval_loss": 0.45063599944114685,
75
+ "eval_runtime": 32.8912,
76
+ "eval_samples_per_second": 104.496,
77
+ "eval_steps_per_second": 6.537,
78
+ "step": 20480
79
+ },
80
+ {
81
+ "epoch": 1.34,
82
+ "learning_rate": 0.0004414445597486605,
83
+ "loss": 0.399,
84
+ "step": 24576
85
+ },
86
+ {
87
+ "epoch": 1.34,
88
+ "eval_loss": 0.44468095898628235,
89
+ "eval_runtime": 32.706,
90
+ "eval_samples_per_second": 105.088,
91
+ "eval_steps_per_second": 6.574,
92
+ "step": 24576
93
+ },
94
+ {
95
+ "epoch": 1.57,
96
+ "learning_rate": 0.00042130386669061293,
97
+ "loss": 0.3882,
98
+ "step": 28672
99
+ },
100
+ {
101
+ "epoch": 1.57,
102
+ "eval_loss": 0.44857361912727356,
103
+ "eval_runtime": 32.9754,
104
+ "eval_samples_per_second": 104.229,
105
+ "eval_steps_per_second": 6.52,
106
+ "step": 28672
107
+ },
108
+ {
109
+ "epoch": 1.79,
110
+ "learning_rate": 0.0003988010477498867,
111
+ "loss": 0.3767,
112
+ "step": 32768
113
+ },
114
+ {
115
+ "epoch": 1.79,
116
+ "eval_loss": 0.44354742765426636,
117
+ "eval_runtime": 32.8219,
118
+ "eval_samples_per_second": 104.717,
119
+ "eval_steps_per_second": 6.55,
120
+ "step": 32768
121
+ },
122
+ {
123
+ "epoch": 2.01,
124
+ "learning_rate": 0.0003742566178542921,
125
+ "loss": 0.3676,
126
+ "step": 36864
127
+ },
128
+ {
129
+ "epoch": 2.01,
130
+ "eval_loss": 0.43940743803977966,
131
+ "eval_runtime": 32.9279,
132
+ "eval_samples_per_second": 104.38,
133
+ "eval_steps_per_second": 6.529,
134
+ "step": 36864
135
+ },
136
+ {
137
+ "epoch": 2.24,
138
+ "learning_rate": 0.0003479963856008823,
139
+ "loss": 0.3577,
140
+ "step": 40960
141
+ },
142
+ {
143
+ "epoch": 2.24,
144
+ "eval_loss": 0.4323909878730774,
145
+ "eval_runtime": 33.0718,
146
+ "eval_samples_per_second": 103.926,
147
+ "eval_steps_per_second": 6.501,
148
+ "step": 40960
149
+ },
150
+ {
151
+ "epoch": 2.46,
152
+ "learning_rate": 0.0003203943839704654,
153
+ "loss": 0.3517,
154
+ "step": 45056
155
+ },
156
+ {
157
+ "epoch": 2.46,
158
+ "eval_loss": 0.43262797594070435,
159
+ "eval_runtime": 32.8978,
160
+ "eval_samples_per_second": 104.475,
161
+ "eval_steps_per_second": 6.535,
162
+ "step": 45056
163
+ },
164
+ {
165
+ "epoch": 2.69,
166
+ "learning_rate": 0.000291817008494138,
167
+ "loss": 0.3427,
168
+ "step": 49152
169
+ },
170
+ {
171
+ "epoch": 2.69,
172
+ "eval_loss": 0.4303751289844513,
173
+ "eval_runtime": 32.7306,
174
+ "eval_samples_per_second": 105.009,
175
+ "eval_steps_per_second": 6.569,
176
+ "step": 49152
177
+ },
178
+ {
179
+ "epoch": 2.91,
180
+ "learning_rate": 0.0002626784527987937,
181
+ "loss": 0.3344,
182
+ "step": 53248
183
+ },
184
+ {
185
+ "epoch": 2.91,
186
+ "eval_loss": 0.4269418716430664,
187
+ "eval_runtime": 32.8164,
188
+ "eval_samples_per_second": 104.734,
189
+ "eval_steps_per_second": 6.552,
190
+ "step": 53248
191
+ },
192
+ {
193
+ "epoch": 3.13,
194
+ "learning_rate": 0.00023335128365042213,
195
+ "loss": 0.3242,
196
+ "step": 57344
197
+ },
198
+ {
199
+ "epoch": 3.13,
200
+ "eval_loss": 0.4315994679927826,
201
+ "eval_runtime": 32.82,
202
+ "eval_samples_per_second": 104.723,
203
+ "eval_steps_per_second": 6.551,
204
+ "step": 57344
205
+ },
206
+ {
207
+ "epoch": 3.36,
208
+ "learning_rate": 0.00020425323740515426,
209
+ "loss": 0.3163,
210
+ "step": 61440
211
+ },
212
+ {
213
+ "epoch": 3.36,
214
+ "eval_loss": 0.418425053358078,
215
+ "eval_runtime": 32.815,
216
+ "eval_samples_per_second": 104.739,
217
+ "eval_steps_per_second": 6.552,
218
+ "step": 61440
219
+ },
220
+ {
221
+ "epoch": 3.58,
222
+ "learning_rate": 0.0001757916101204877,
223
+ "loss": 0.3099,
224
+ "step": 65536
225
+ },
226
+ {
227
+ "epoch": 3.58,
228
+ "eval_loss": 0.42424410581588745,
229
+ "eval_runtime": 32.8877,
230
+ "eval_samples_per_second": 104.507,
231
+ "eval_steps_per_second": 6.537,
232
+ "step": 65536
233
+ },
234
+ {
235
+ "epoch": 3.81,
236
+ "learning_rate": 0.00014835075511133623,
237
+ "loss": 0.3014,
238
+ "step": 69632
239
+ },
240
+ {
241
+ "epoch": 3.81,
242
+ "eval_loss": 0.4214063882827759,
243
+ "eval_runtime": 32.7528,
244
+ "eval_samples_per_second": 104.938,
245
+ "eval_steps_per_second": 6.564,
246
+ "step": 69632
247
+ },
248
+ {
249
+ "epoch": 4.03,
250
+ "learning_rate": 0.00012230813460340284,
251
+ "loss": 0.2922,
252
+ "step": 73728
253
+ },
254
+ {
255
+ "epoch": 4.03,
256
+ "eval_loss": 0.41899624466896057,
257
+ "eval_runtime": 32.874,
258
+ "eval_samples_per_second": 104.551,
259
+ "eval_steps_per_second": 6.54,
260
+ "step": 73728
261
+ },
262
+ {
263
+ "epoch": 4.25,
264
+ "learning_rate": 9.802197743091277e-05,
265
+ "loss": 0.2824,
266
+ "step": 77824
267
+ },
268
+ {
269
+ "epoch": 4.25,
270
+ "eval_loss": 0.4261378347873688,
271
+ "eval_runtime": 32.9867,
272
+ "eval_samples_per_second": 104.193,
273
+ "eval_steps_per_second": 6.518,
274
+ "step": 77824
275
+ },
276
+ {
277
+ "epoch": 4.48,
278
+ "learning_rate": 7.582635142582842e-05,
279
+ "loss": 0.2763,
280
+ "step": 81920
281
+ },
282
+ {
283
+ "epoch": 4.48,
284
+ "eval_loss": 0.41689732670783997,
285
+ "eval_runtime": 32.8378,
286
+ "eval_samples_per_second": 104.666,
287
+ "eval_steps_per_second": 6.547,
288
+ "step": 81920
289
+ },
290
+ {
291
+ "epoch": 4.7,
292
+ "learning_rate": 5.602656815333096e-05,
293
+ "loss": 0.2705,
294
+ "step": 86016
295
+ },
296
+ {
297
+ "epoch": 4.7,
298
+ "eval_loss": 0.41673606634140015,
299
+ "eval_runtime": 32.9093,
300
+ "eval_samples_per_second": 104.439,
301
+ "eval_steps_per_second": 6.533,
302
+ "step": 86016
303
+ },
304
+ {
305
+ "epoch": 4.92,
306
+ "learning_rate": 3.889114457390541e-05,
307
+ "loss": 0.2632,
308
+ "step": 90112
309
+ },
310
+ {
311
+ "epoch": 4.92,
312
+ "eval_loss": 0.4199689030647278,
313
+ "eval_runtime": 33.1245,
314
+ "eval_samples_per_second": 103.76,
315
+ "eval_steps_per_second": 6.491,
316
+ "step": 90112
317
+ },
318
+ {
319
+ "epoch": 5.15,
320
+ "learning_rate": 2.4664146067361558e-05,
321
+ "loss": 0.255,
322
+ "step": 94208
323
+ },
324
+ {
325
+ "epoch": 5.15,
326
+ "eval_loss": 0.42629772424697876,
327
+ "eval_runtime": 32.7944,
328
+ "eval_samples_per_second": 104.804,
329
+ "eval_steps_per_second": 6.556,
330
+ "step": 94208
331
+ },
332
+ {
333
+ "epoch": 5.37,
334
+ "learning_rate": 1.3536751160186934e-05,
335
+ "loss": 0.2515,
336
+ "step": 98304
337
+ },
338
+ {
339
+ "epoch": 5.37,
340
+ "eval_loss": 0.42379850149154663,
341
+ "eval_runtime": 32.9581,
342
+ "eval_samples_per_second": 104.284,
343
+ "eval_steps_per_second": 6.523,
344
+ "step": 98304
345
+ }
346
+ ],
347
+ "max_steps": 109800,
348
+ "num_train_epochs": 6,
349
+ "total_flos": 4.5697706840457216e+17,
350
+ "trial_name": null,
351
+ "trial_params": null
352
+ }
checkpoint-98304/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:908dc25e857dbdefbfe009b9ed0b992ce31760b393bf814f450b0d3c701c2397
3
+ size 3579
config.json CHANGED
@@ -1,4 +1,5 @@
1
  {
 
2
  "activation_function": "gelu_new",
3
  "architectures": [
4
  "GPT2LMHeadModel"
 
1
  {
2
+ "_name_or_path": "JammyMachina/elec-gmusic-familized-model-13-12__17-35-53",
3
  "activation_function": "gelu_new",
4
  "architectures": [
5
  "GPT2LMHeadModel"
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:54af5d8875e2c2fd3cc37c56d33cad185fa27c7098ef23bdcb9ec77ecf847f0e
3
  size 105666297
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8611b00fb559665ca26e68dae93dfb4ebdf3717ac3a1f548409581450a0a18a4
3
  size 105666297
tokenizer_config.json CHANGED
@@ -1,4 +1,6 @@
1
  {
2
  "model_max_length": 1000000000000000019884624838656,
 
 
3
  "tokenizer_class": "PreTrainedTokenizerFast"
4
  }
 
1
  {
2
  "model_max_length": 1000000000000000019884624838656,
3
+ "name_or_path": "JammyMachina/elec-gmusic-familized-model-13-12__17-35-53",
4
+ "special_tokens_map_file": "/root/.cache/huggingface/hub/models--JammyMachina--elec-gmusic-familized-model-13-12__17-35-53/snapshots/fbba9d2ac598a2e0fbec338593aceff49347aff4/special_tokens_map.json",
5
  "tokenizer_class": "PreTrainedTokenizerFast"
6
  }
trainer_state.json CHANGED
@@ -1,249 +1,15 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 5.998776009791921,
5
- "global_step": 4284,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
9
- "log_history": [
10
- {
11
- "epoch": 0.36,
12
- "learning_rate": 0.0004997680752030479,
13
- "loss": 2.6834,
14
- "step": 256
15
- },
16
- {
17
- "epoch": 0.36,
18
- "eval_loss": 1.4117382764816284,
19
- "eval_runtime": 33.374,
20
- "eval_samples_per_second": 102.984,
21
- "eval_steps_per_second": 12.884,
22
- "step": 256
23
- },
24
- {
25
- "epoch": 0.72,
26
- "learning_rate": 0.0004928342476863796,
27
- "loss": 1.2147,
28
- "step": 512
29
- },
30
- {
31
- "epoch": 0.72,
32
- "eval_loss": 0.7461662292480469,
33
- "eval_runtime": 33.3586,
34
- "eval_samples_per_second": 103.032,
35
- "eval_steps_per_second": 12.89,
36
- "step": 512
37
- },
38
- {
39
- "epoch": 1.08,
40
- "learning_rate": 0.0004765136929861552,
41
- "loss": 0.7591,
42
- "step": 768
43
- },
44
- {
45
- "epoch": 1.08,
46
- "eval_loss": 0.5787664651870728,
47
- "eval_runtime": 33.4962,
48
- "eval_samples_per_second": 102.609,
49
- "eval_steps_per_second": 12.837,
50
- "step": 768
51
- },
52
- {
53
- "epoch": 1.43,
54
- "learning_rate": 0.0004514372800836444,
55
- "loss": 0.6031,
56
- "step": 1024
57
- },
58
- {
59
- "epoch": 1.43,
60
- "eval_loss": 0.5438796281814575,
61
- "eval_runtime": 33.4433,
62
- "eval_samples_per_second": 102.771,
63
- "eval_steps_per_second": 12.858,
64
- "step": 1024
65
- },
66
- {
67
- "epoch": 1.79,
68
- "learning_rate": 0.0004185743345533933,
69
- "loss": 0.5457,
70
- "step": 1280
71
- },
72
- {
73
- "epoch": 1.79,
74
- "eval_loss": 0.5130022168159485,
75
- "eval_runtime": 33.3724,
76
- "eval_samples_per_second": 102.989,
77
- "eval_steps_per_second": 12.885,
78
- "step": 1280
79
- },
80
- {
81
- "epoch": 2.15,
82
- "learning_rate": 0.00037919516940552946,
83
- "loss": 0.5108,
84
- "step": 1536
85
- },
86
- {
87
- "epoch": 2.15,
88
- "eval_loss": 0.5051754713058472,
89
- "eval_runtime": 33.3675,
90
- "eval_samples_per_second": 103.004,
91
- "eval_steps_per_second": 12.887,
92
- "step": 1536
93
- },
94
- {
95
- "epoch": 2.51,
96
- "learning_rate": 0.0003348219812967476,
97
- "loss": 0.4851,
98
- "step": 1792
99
- },
100
- {
101
- "epoch": 2.51,
102
- "eval_loss": 0.47963660955429077,
103
- "eval_runtime": 33.4622,
104
- "eval_samples_per_second": 102.713,
105
- "eval_steps_per_second": 12.85,
106
- "step": 1792
107
- },
108
- {
109
- "epoch": 2.87,
110
- "learning_rate": 0.00028717001021074106,
111
- "loss": 0.4644,
112
- "step": 2048
113
- },
114
- {
115
- "epoch": 2.87,
116
- "eval_loss": 0.475009560585022,
117
- "eval_runtime": 33.3614,
118
- "eval_samples_per_second": 103.023,
119
- "eval_steps_per_second": 12.889,
120
- "step": 2048
121
- },
122
- {
123
- "epoch": 3.23,
124
- "learning_rate": 0.00023808123707320163,
125
- "loss": 0.4477,
126
- "step": 2304
127
- },
128
- {
129
- "epoch": 3.23,
130
- "eval_loss": 0.4738583564758301,
131
- "eval_runtime": 33.4759,
132
- "eval_samples_per_second": 102.671,
133
- "eval_steps_per_second": 12.845,
134
- "step": 2304
135
- },
136
- {
137
- "epoch": 3.58,
138
- "learning_rate": 0.00018945318221170977,
139
- "loss": 0.4328,
140
- "step": 2560
141
- },
142
- {
143
- "epoch": 3.58,
144
- "eval_loss": 0.4598933756351471,
145
- "eval_runtime": 33.4184,
146
- "eval_samples_per_second": 102.847,
147
- "eval_steps_per_second": 12.867,
148
- "step": 2560
149
- },
150
- {
151
- "epoch": 3.94,
152
- "learning_rate": 0.00014316555694705608,
153
- "loss": 0.4204,
154
- "step": 2816
155
- },
156
- {
157
- "epoch": 3.94,
158
- "eval_loss": 0.4571586847305298,
159
- "eval_runtime": 33.4525,
160
- "eval_samples_per_second": 102.743,
161
- "eval_steps_per_second": 12.854,
162
- "step": 2816
163
- },
164
- {
165
- "epoch": 4.3,
166
- "learning_rate": 0.00010100760358947337,
167
- "loss": 0.4107,
168
- "step": 3072
169
- },
170
- {
171
- "epoch": 4.3,
172
- "eval_loss": 0.4567428529262543,
173
- "eval_runtime": 33.4087,
174
- "eval_samples_per_second": 102.877,
175
- "eval_steps_per_second": 12.871,
176
- "step": 3072
177
- },
178
- {
179
- "epoch": 4.66,
180
- "learning_rate": 6.460893250304736e-05,
181
- "loss": 0.4021,
182
- "step": 3328
183
- },
184
- {
185
- "epoch": 4.66,
186
- "eval_loss": 0.45427900552749634,
187
- "eval_runtime": 33.4427,
188
- "eval_samples_per_second": 102.773,
189
- "eval_steps_per_second": 12.858,
190
- "step": 3328
191
- },
192
- {
193
- "epoch": 5.02,
194
- "learning_rate": 3.5376529722851576e-05,
195
- "loss": 0.3974,
196
- "step": 3584
197
- },
198
- {
199
- "epoch": 5.02,
200
- "eval_loss": 0.44895419478416443,
201
- "eval_runtime": 33.38,
202
- "eval_samples_per_second": 102.966,
203
- "eval_steps_per_second": 12.882,
204
- "step": 3584
205
- },
206
- {
207
- "epoch": 5.38,
208
- "learning_rate": 1.444037008740992e-05,
209
- "loss": 0.3904,
210
- "step": 3840
211
- },
212
- {
213
- "epoch": 5.38,
214
- "eval_loss": 0.4500817358493805,
215
- "eval_runtime": 33.3223,
216
- "eval_samples_per_second": 103.144,
217
- "eval_steps_per_second": 12.904,
218
- "step": 3840
219
- },
220
- {
221
- "epoch": 5.74,
222
- "learning_rate": 2.60973820398705e-06,
223
- "loss": 0.389,
224
- "step": 4096
225
- },
226
- {
227
- "epoch": 5.74,
228
- "eval_loss": 0.44959455728530884,
229
- "eval_runtime": 33.4849,
230
- "eval_samples_per_second": 102.643,
231
- "eval_steps_per_second": 12.842,
232
- "step": 4096
233
- },
234
- {
235
- "epoch": 6.0,
236
- "step": 4284,
237
- "total_flos": 5.103180613869896e+17,
238
- "train_loss": 0.6478969901661022,
239
- "train_runtime": 51304.5286,
240
- "train_samples_per_second": 42.803,
241
- "train_steps_per_second": 0.084
242
- }
243
- ],
244
- "max_steps": 4284,
245
- "num_train_epochs": 6,
246
- "total_flos": 5.103180613869896e+17,
247
  "trial_name": null,
248
  "trial_params": null
249
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": null,
5
+ "global_step": 0,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
9
+ "log_history": [],
10
+ "max_steps": 0,
11
+ "num_train_epochs": 0,
12
+ "total_flos": 0,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  "trial_name": null,
14
  "trial_params": null
15
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0206a7605fed29c8321de91dfcf793fb6150f09dc4e519e1121bb6e0b17b29fc
3
- size 3515
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:908dc25e857dbdefbfe009b9ed0b992ce31760b393bf814f450b0d3c701c2397
3
+ size 3579
training_args.json ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "output_dir": "models/elec-gmusic-familized",
3
+ "overwrite_output_dir": true,
4
+ "do_train": false,
5
+ "do_eval": true,
6
+ "do_predict": false,
7
+ "evaluation_strategy": "steps",
8
+ "prediction_loss_only": false,
9
+ "per_device_train_batch_size": 10,
10
+ "per_device_eval_batch_size": 8,
11
+ "per_gpu_train_batch_size": null,
12
+ "per_gpu_eval_batch_size": null,
13
+ "gradient_accumulation_steps": 1,
14
+ "eval_accumulation_steps": null,
15
+ "eval_delay": 0,
16
+ "learning_rate": 0.0005,
17
+ "weight_decay": 0.1,
18
+ "adam_beta1": 0.9,
19
+ "adam_beta2": 0.999,
20
+ "adam_epsilon": 1e-08,
21
+ "max_grad_norm": 1.0,
22
+ "num_train_epochs": 6,
23
+ "max_steps": -1,
24
+ "lr_scheduler_type": "cosine",
25
+ "warmup_ratio": 0.0,
26
+ "warmup_steps": 200,
27
+ "log_level": "passive",
28
+ "log_level_replica": "passive",
29
+ "log_on_each_node": true,
30
+ "logging_dir": "models/elec-gmusic-familized/logs",
31
+ "logging_strategy": "steps",
32
+ "logging_first_step": false,
33
+ "logging_steps": 4096,
34
+ "logging_nan_inf_filter": true,
35
+ "save_strategy": "steps",
36
+ "save_steps": 16384,
37
+ "save_total_limit": 5,
38
+ "save_on_each_node": false,
39
+ "no_cuda": false,
40
+ "use_mps_device": false,
41
+ "seed": 42,
42
+ "data_seed": null,
43
+ "jit_mode_eval": false,
44
+ "use_ipex": false,
45
+ "bf16": false,
46
+ "fp16": true,
47
+ "fp16_opt_level": "O1",
48
+ "half_precision_backend": "cuda_amp",
49
+ "bf16_full_eval": false,
50
+ "fp16_full_eval": false,
51
+ "tf32": null,
52
+ "local_rank": -1,
53
+ "xpu_backend": null,
54
+ "tpu_num_cores": null,
55
+ "tpu_metrics_debug": false,
56
+ "debug": [],
57
+ "dataloader_drop_last": false,
58
+ "eval_steps": 4096,
59
+ "dataloader_num_workers": 0,
60
+ "past_index": -1,
61
+ "run_name": "models/elec-gmusic-familized",
62
+ "disable_tqdm": false,
63
+ "remove_unused_columns": true,
64
+ "label_names": null,
65
+ "load_best_model_at_end": false,
66
+ "metric_for_best_model": null,
67
+ "greater_is_better": null,
68
+ "ignore_data_skip": false,
69
+ "sharded_ddp": [],
70
+ "fsdp": [],
71
+ "fsdp_min_num_params": 0,
72
+ "fsdp_transformer_layer_cls_to_wrap": null,
73
+ "deepspeed": null,
74
+ "label_smoothing_factor": 0.0,
75
+ "optim": "adamw_hf",
76
+ "optim_args": null,
77
+ "adafactor": false,
78
+ "group_by_length": false,
79
+ "length_column_name": "length",
80
+ "report_to": [
81
+ "wandb"
82
+ ],
83
+ "ddp_find_unused_parameters": null,
84
+ "ddp_bucket_cap_mb": null,
85
+ "dataloader_pin_memory": true,
86
+ "skip_memory_metrics": true,
87
+ "use_legacy_prediction_loop": false,
88
+ "push_to_hub": true,
89
+ "resume_from_checkpoint": null,
90
+ "hub_model_id": "JammyMachina/elec-gmusic-familized-model-13-12__17-35-53",
91
+ "hub_strategy": "every_save",
92
+ "hub_token": "<HUB_TOKEN>",
93
+ "hub_private_repo": false,
94
+ "gradient_checkpointing": false,
95
+ "include_inputs_for_metrics": false,
96
+ "fp16_backend": "auto",
97
+ "push_to_hub_model_id": null,
98
+ "push_to_hub_organization": null,
99
+ "push_to_hub_token": "<PUSH_TO_HUB_TOKEN>",
100
+ "mp_parameters": "",
101
+ "auto_find_batch_size": false,
102
+ "full_determinism": false,
103
+ "torchdynamo": null,
104
+ "ray_scope": "last",
105
+ "ddp_timeout": 1800,
106
+ "torch_compile": false,
107
+ "torch_compile_backend": null,
108
+ "torch_compile_mode": null
109
+ }