Charlie911 committed on
Commit
72267fc
1 Parent(s): 4b6a456

Training in progress, step 900, checkpoint

Browse files
checkpoint-900/config.json ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "apple/OpenELM-1_1B-Instruct",
3
+ "activation_fn_name": "swish",
4
+ "architectures": [
5
+ "OpenELMForCausalLM"
6
+ ],
7
+ "auto_map": {
8
+ "AutoConfig": "apple/OpenELM-1_1B-Instruct--configuration_openelm.OpenELMConfig",
9
+ "AutoModelForCausalLM": "apple/OpenELM-1_1B-Instruct--modeling_openelm.OpenELMForCausalLM"
10
+ },
11
+ "bos_token_id": 1,
12
+ "eos_token_id": 2,
13
+ "ffn_dim_divisor": 256,
14
+ "ffn_multipliers": [
15
+ 0.5,
16
+ 0.63,
17
+ 0.76,
18
+ 0.89,
19
+ 1.02,
20
+ 1.15,
21
+ 1.28,
22
+ 1.41,
23
+ 1.54,
24
+ 1.67,
25
+ 1.8,
26
+ 1.93,
27
+ 2.06,
28
+ 2.19,
29
+ 2.31,
30
+ 2.44,
31
+ 2.57,
32
+ 2.7,
33
+ 2.83,
34
+ 2.96,
35
+ 3.09,
36
+ 3.22,
37
+ 3.35,
38
+ 3.48,
39
+ 3.61,
40
+ 3.74,
41
+ 3.87,
42
+ 4.0
43
+ ],
44
+ "ffn_with_glu": true,
45
+ "head_dim": 64,
46
+ "initializer_range": 0.02,
47
+ "max_context_length": 2048,
48
+ "model_dim": 2048,
49
+ "model_type": "openelm",
50
+ "normalization_layer_name": "rms_norm",
51
+ "normalize_qk_projections": true,
52
+ "num_gqa_groups": 4,
53
+ "num_kv_heads": [
54
+ 4,
55
+ 4,
56
+ 4,
57
+ 5,
58
+ 5,
59
+ 5,
60
+ 5,
61
+ 5,
62
+ 5,
63
+ 5,
64
+ 6,
65
+ 6,
66
+ 6,
67
+ 6,
68
+ 6,
69
+ 6,
70
+ 6,
71
+ 6,
72
+ 7,
73
+ 7,
74
+ 7,
75
+ 7,
76
+ 7,
77
+ 7,
78
+ 8,
79
+ 8,
80
+ 8,
81
+ 8
82
+ ],
83
+ "num_query_heads": [
84
+ 16,
85
+ 16,
86
+ 16,
87
+ 20,
88
+ 20,
89
+ 20,
90
+ 20,
91
+ 20,
92
+ 20,
93
+ 20,
94
+ 24,
95
+ 24,
96
+ 24,
97
+ 24,
98
+ 24,
99
+ 24,
100
+ 24,
101
+ 24,
102
+ 28,
103
+ 28,
104
+ 28,
105
+ 28,
106
+ 28,
107
+ 28,
108
+ 32,
109
+ 32,
110
+ 32,
111
+ 32
112
+ ],
113
+ "num_transformer_layers": 28,
114
+ "qkv_multipliers": [
115
+ 0.5,
116
+ 1.0
117
+ ],
118
+ "rope_freq_constant": 10000,
119
+ "rope_max_length": 4096,
120
+ "share_input_output_layers": true,
121
+ "torch_dtype": "bfloat16",
122
+ "transformers_version": "4.41.2",
123
+ "use_cache": true,
124
+ "vocab_size": 32000
125
+ }
checkpoint-900/generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "transformers_version": "4.41.2"
6
+ }
checkpoint-900/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:73e6455df62b7d13beb09e17aeb91faf1f0f29599a29c237903b6bbe8c5e55d4
3
+ size 2159808696
checkpoint-900/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9184b16bca0a92f25d02bb71105e08cca2283d20d669418c9c63a8dd2070d030
3
+ size 4319755230
checkpoint-900/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:386fcc8cc1089aade9450d86fb239ea3483f455fd2d78d8378645feecfec9d69
3
+ size 14244
checkpoint-900/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dc90ea87b143658760046b7ac9565c9708060f4ca439ba131cfe2e74fdf238a1
3
+ size 1064
checkpoint-900/trainer_state.json ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 0.48982924008435946,
5
+ "eval_steps": 500,
6
+ "global_step": 900,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.05442547112048439,
13
+ "grad_norm": 0.8046875,
14
+ "learning_rate": 0.0013043478260869564,
15
+ "loss": 2.4382,
16
+ "step": 100
17
+ },
18
+ {
19
+ "epoch": 0.10885094224096878,
20
+ "grad_norm": 0.70703125,
21
+ "learning_rate": 0.002399500708356285,
22
+ "loss": 2.862,
23
+ "step": 200
24
+ },
25
+ {
26
+ "epoch": 0.16327641336145315,
27
+ "grad_norm": 0.68359375,
28
+ "learning_rate": 0.0023738600932376305,
29
+ "loss": 2.8764,
30
+ "step": 300
31
+ },
32
+ {
33
+ "epoch": 0.21770188448193756,
34
+ "grad_norm": 0.79296875,
35
+ "learning_rate": 0.0023102679420456433,
36
+ "loss": 2.7571,
37
+ "step": 400
38
+ },
39
+ {
40
+ "epoch": 0.27212735560242196,
41
+ "grad_norm": 0.55859375,
42
+ "learning_rate": 0.002211014330887608,
43
+ "loss": 2.6693,
44
+ "step": 500
45
+ },
46
+ {
47
+ "epoch": 0.3265528267229063,
48
+ "grad_norm": 0.453125,
49
+ "learning_rate": 0.0020796735738764158,
50
+ "loss": 2.6112,
51
+ "step": 600
52
+ },
53
+ {
54
+ "epoch": 0.3809782978433907,
55
+ "grad_norm": 0.5078125,
56
+ "learning_rate": 0.0019209755051803723,
57
+ "loss": 2.5635,
58
+ "step": 700
59
+ },
60
+ {
61
+ "epoch": 0.4354037689638751,
62
+ "grad_norm": 0.498046875,
63
+ "learning_rate": 0.0017406351485654726,
64
+ "loss": 2.5177,
65
+ "step": 800
66
+ },
67
+ {
68
+ "epoch": 0.48982924008435946,
69
+ "grad_norm": 0.34375,
70
+ "learning_rate": 0.00154514690835869,
71
+ "loss": 2.4643,
72
+ "step": 900
73
+ }
74
+ ],
75
+ "logging_steps": 100,
76
+ "max_steps": 1837,
77
+ "num_input_tokens_seen": 0,
78
+ "num_train_epochs": 1,
79
+ "save_steps": 100,
80
+ "stateful_callbacks": {
81
+ "TrainerControl": {
82
+ "args": {
83
+ "should_epoch_stop": false,
84
+ "should_evaluate": false,
85
+ "should_log": false,
86
+ "should_save": true,
87
+ "should_training_stop": false
88
+ },
89
+ "attributes": {}
90
+ }
91
+ },
92
+ "total_flos": 1.2923622902790144e+17,
93
+ "train_batch_size": 4,
94
+ "trial_name": null,
95
+ "trial_params": null
96
+ }
checkpoint-900/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:927397e3adbbbc91f7921b6dd111d1127d662b8e582f434739370dd9d2c1bbfa
3
+ size 5240