Charlie911 commited on
Commit
c5df162
1 Parent(s): 9fb8893

Training in progress, step 600, checkpoint

Browse files
checkpoint-600/config.json ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "apple/OpenELM-1_1B-Instruct",
3
+ "activation_fn_name": "swish",
4
+ "architectures": [
5
+ "OpenELMForCausalLM"
6
+ ],
7
+ "auto_map": {
8
+ "AutoConfig": "apple/OpenELM-1_1B-Instruct--configuration_openelm.OpenELMConfig",
9
+ "AutoModelForCausalLM": "apple/OpenELM-1_1B-Instruct--modeling_openelm.OpenELMForCausalLM"
10
+ },
11
+ "bos_token_id": 1,
12
+ "eos_token_id": 2,
13
+ "ffn_dim_divisor": 256,
14
+ "ffn_multipliers": [
15
+ 0.5,
16
+ 0.63,
17
+ 0.76,
18
+ 0.89,
19
+ 1.02,
20
+ 1.15,
21
+ 1.28,
22
+ 1.41,
23
+ 1.54,
24
+ 1.67,
25
+ 1.8,
26
+ 1.93,
27
+ 2.06,
28
+ 2.19,
29
+ 2.31,
30
+ 2.44,
31
+ 2.57,
32
+ 2.7,
33
+ 2.83,
34
+ 2.96,
35
+ 3.09,
36
+ 3.22,
37
+ 3.35,
38
+ 3.48,
39
+ 3.61,
40
+ 3.74,
41
+ 3.87,
42
+ 4.0
43
+ ],
44
+ "ffn_with_glu": true,
45
+ "head_dim": 64,
46
+ "initializer_range": 0.02,
47
+ "max_context_length": 2048,
48
+ "model_dim": 2048,
49
+ "model_type": "openelm",
50
+ "normalization_layer_name": "rms_norm",
51
+ "normalize_qk_projections": true,
52
+ "num_gqa_groups": 4,
53
+ "num_kv_heads": [
54
+ 4,
55
+ 4,
56
+ 4,
57
+ 5,
58
+ 5,
59
+ 5,
60
+ 5,
61
+ 5,
62
+ 5,
63
+ 5,
64
+ 6,
65
+ 6,
66
+ 6,
67
+ 6,
68
+ 6,
69
+ 6,
70
+ 6,
71
+ 6,
72
+ 7,
73
+ 7,
74
+ 7,
75
+ 7,
76
+ 7,
77
+ 7,
78
+ 8,
79
+ 8,
80
+ 8,
81
+ 8
82
+ ],
83
+ "num_query_heads": [
84
+ 16,
85
+ 16,
86
+ 16,
87
+ 20,
88
+ 20,
89
+ 20,
90
+ 20,
91
+ 20,
92
+ 20,
93
+ 20,
94
+ 24,
95
+ 24,
96
+ 24,
97
+ 24,
98
+ 24,
99
+ 24,
100
+ 24,
101
+ 24,
102
+ 28,
103
+ 28,
104
+ 28,
105
+ 28,
106
+ 28,
107
+ 28,
108
+ 32,
109
+ 32,
110
+ 32,
111
+ 32
112
+ ],
113
+ "num_transformer_layers": 28,
114
+ "qkv_multipliers": [
115
+ 0.5,
116
+ 1.0
117
+ ],
118
+ "rope_freq_constant": 10000,
119
+ "rope_max_length": 4096,
120
+ "share_input_output_layers": true,
121
+ "torch_dtype": "bfloat16",
122
+ "transformers_version": "4.41.2",
123
+ "use_cache": true,
124
+ "vocab_size": 32000
125
+ }
checkpoint-600/generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "transformers_version": "4.41.2"
6
+ }
checkpoint-600/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bc6334b8411169391f7b8c68d53ea7e704200857a183db7a310261376db39fab
3
+ size 2159808696
checkpoint-600/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:97845799fd3068f8376441257052ec8f833c09cd232c80486d1a810de4a46d3f
3
+ size 4319755230
checkpoint-600/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:386fcc8cc1089aade9450d86fb239ea3483f455fd2d78d8378645feecfec9d69
3
+ size 14244
checkpoint-600/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:96336fc7c4f3076469960d6ff19d6a2ff5510b1b9956c969b8e97d8b25081a1f
3
+ size 1064
checkpoint-600/trainer_state.json ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 0.3265528267229063,
5
+ "eval_steps": 500,
6
+ "global_step": 600,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.05442547112048439,
13
+ "grad_norm": 0.8046875,
14
+ "learning_rate": 0.0013043478260869564,
15
+ "loss": 2.4382,
16
+ "step": 100
17
+ },
18
+ {
19
+ "epoch": 0.10885094224096878,
20
+ "grad_norm": 0.70703125,
21
+ "learning_rate": 0.002399500708356285,
22
+ "loss": 2.862,
23
+ "step": 200
24
+ },
25
+ {
26
+ "epoch": 0.16327641336145315,
27
+ "grad_norm": 0.68359375,
28
+ "learning_rate": 0.0023738600932376305,
29
+ "loss": 2.8764,
30
+ "step": 300
31
+ },
32
+ {
33
+ "epoch": 0.21770188448193756,
34
+ "grad_norm": 0.79296875,
35
+ "learning_rate": 0.0023102679420456433,
36
+ "loss": 2.7571,
37
+ "step": 400
38
+ },
39
+ {
40
+ "epoch": 0.27212735560242196,
41
+ "grad_norm": 0.55859375,
42
+ "learning_rate": 0.002211014330887608,
43
+ "loss": 2.6693,
44
+ "step": 500
45
+ },
46
+ {
47
+ "epoch": 0.3265528267229063,
48
+ "grad_norm": 0.453125,
49
+ "learning_rate": 0.0020796735738764158,
50
+ "loss": 2.6112,
51
+ "step": 600
52
+ }
53
+ ],
54
+ "logging_steps": 100,
55
+ "max_steps": 1837,
56
+ "num_input_tokens_seen": 0,
57
+ "num_train_epochs": 1,
58
+ "save_steps": 100,
59
+ "stateful_callbacks": {
60
+ "TrainerControl": {
61
+ "args": {
62
+ "should_epoch_stop": false,
63
+ "should_evaluate": false,
64
+ "should_log": false,
65
+ "should_save": true,
66
+ "should_training_stop": false
67
+ },
68
+ "attributes": {}
69
+ }
70
+ },
71
+ "total_flos": 8.618896349711155e+16,
72
+ "train_batch_size": 4,
73
+ "trial_name": null,
74
+ "trial_params": null
75
+ }
checkpoint-600/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:927397e3adbbbc91f7921b6dd111d1127d662b8e582f434739370dd9d2c1bbfa
3
+ size 5240