Text Generation
scaling
GregorZiegltrumAA committed on
Commit 8385ecd · 1 Parent(s): 97f66e4
config.yml ADDED
@@ -0,0 +1,101 @@
+ optimizer:
+   allreduce_bucket_size: 500000000
+   beta1: 0.9
+   beta2: 0.95
+   debug_log: false
+   eps: 1e-08
+   gradient_clipping: 0.0
+   zero: true
+   zero_save_static: false
+ topology:
+   activation_checkpointing_type: disabled
+   global_batch_size: 1024
+   gradient_accumulation_steps: 2
+   micro_batch_size: 2
+   model_parallel_size: 1
+   pipe_parallel_size: 1
+   pipe_partition_method: balanced
+   pipe_partition_overwrite: null
+   sequence_parallel: false
+ trainer:
+   seed: 42
+   train_iterations: 72000
+ training:
+   allow_missing_params_in_optimizer: true
+   training_groups:
+   - group_name: param_group
+     independent_weight_decay: true
+     learning_rate_scheduler:
+       learning_rate: 11.313708498984761
+       learning_rate_decay_iters: 72000
+       learning_rate_decay_style: cosine
+       learning_rate_minimum: 1.131370849898476
+       learning_rate_warmup_steps: 500
+     parameters_exclude:
+     - norm
+     weight_decay: 0.0001221
+ transformer_architecture:
+   attention_bias: false
+   attention_num_kv_heads: null
+   attention_qkv_in_one: true
+   dropout_after_attention: 0.0
+   dropout_after_mlp: 0.0
+   dropout_attention_probs: 0.0
+   dropout_embedding: 0.0
+   dropout_image_encoder: 0.0
+   fp8_config_attention:
+     dtypes_forward:
+       left_dtype: e4m3
+       right_dtype: e4m3
+     dtypes_grad_input:
+       left_dtype: e5m2
+       right_dtype: e4m3
+     dtypes_grad_weight:
+       left_dtype: e4m3
+       right_dtype: e5m2
+   fp8_config_mlp:
+     dtypes_forward:
+       left_dtype: e4m3
+       right_dtype: e4m3
+     dtypes_grad_input:
+       left_dtype: e5m2
+       right_dtype: e4m3
+     dtypes_grad_weight:
+       left_dtype: e4m3
+       right_dtype: e5m2
+   hidden_size: 2048
+   image_encoder: false
+   key_query_norm: false
+   layernorm:
+     layernorm_epsilon: 1e-05
+     optimization_type: torch
+   local_attention_window_size: null
+   masked_softmax:
+     kernel: flash_attention
+     scale: 1.0
+     softmax_in_fp32: false
+   mlp_bias: false
+   mlp_factor: 2.6640625
+   mlp_type: swiglu
+   norm_type: rms
+   num_attention_heads: 16
+   num_layers: 16
+   num_local_attention_heads: 0
+   precision: bfloat16
+   relative_position_embedding_type: rotary_complex
+   reset_attention_mask: false
+   reset_position_ids: false
+   rotary_embedding_base: 10000
+   rotary_percentage: 1.0
+   sequence_length: 4096
+   umup:
+     act_mult: 1.0
+     attn_mult: 1.0
+     enable: true
+     loss_mult: 1.0
+     normalize_depth_to_num_layers: true
+     residual_attn_ratio: 0.25
+     residual_mult: 1.0
+   vocab_file: null
+   vocab_size: 65536
+   weight_tying: false
model_state_layer_0_EmbeddingInput.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6fbc36b1b5de1f8c0654a87aa04e39bffeff77e8142789b052d3d996170b4646
+ size 268436939
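Each .pt entry in this commit is a Git LFS pointer rather than the tensor data itself: the three lines give the pointer-spec version, the SHA-256 of the actual object, and its size in bytes. A minimal sketch of reading those fields follows; parse_lfs_pointer is a hypothetical helper, and it assumes the un-smudged pointer text shown above is what sits on disk:

```python
# Minimal sketch: parse the "key value" lines of a Git LFS pointer file.
def parse_lfs_pointer(path: str) -> dict[str, str]:
    fields = {}
    with open(path) as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            fields[key] = value
    return fields

ptr = parse_lfs_pointer("model_state_layer_0_EmbeddingInput.pt")
print(ptr["oid"], ptr["size"])  # sha256:6fbc36b1... 268436939
```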
model_state_layer_10_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:89d1166a4aca97dea1add692e78f704cf7ae4ed042ecc929c8f1de36c35ed37f
+ size 100609197
model_state_layer_11_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:272841382ca3c7ce1b098c7e85e0c0a46d707d8df13488edec01cc14f1f89e6d
+ size 100609197
model_state_layer_12_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:780caf8f6ddef73dc6f0cc232dd15ed700ef15d4246b98635a2dbcbda8b9b0a0
+ size 100609197
model_state_layer_13_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:34298185508c3b356d231e93a98b96aa06b17688beb8c491611d0f6e53747b3e
+ size 100609197
model_state_layer_14_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:65d5ba4ad31613e4476611aa8cca30b241b5b4f3e2ac28bc23986d6944ebfaf7
+ size 100609197
model_state_layer_15_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:edf5b416e1d1a671ff8c400261f58ddbb96927d0e3f7b3ec06b34c98378319ee
+ size 100609197
model_state_layer_16_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3bb1e9cbcc6b35b7ea8ae358849f44683bf3b8de9e65974fbbddcbda33db0329
+ size 100609197
model_state_layer_17_LayerNormWrapper.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e657e00cad67691e2875f1bb3b943a2d86c17a326840572359f08e08e2f5b6cc
+ size 5554
model_state_layer_18_TransformerLMHead.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:db5bf94e56daf347abcb0a6460c46a9b0ce21b1a5b29adedf2adc17a4cf973e7
+ size 268436904
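The two ~268 MB files bracket the stack: with weight_tying: false in config.yml, the layer-0 input embedding and the layer-18 LM head are separate vocab_size × hidden_size matrices, and bfloat16 storage accounts for almost all of each file. This is an inference from the config, not repo documentation:

```python
# Sanity check: 65536 * 2048 parameters at 2 bytes (bfloat16) each.
vocab_size, hidden_size = 65536, 2048
print(vocab_size * hidden_size * 2)  # 268435456 bytes,
# vs. 268436939 / 268436904 bytes on disk; the ~1.5 kB difference
# is plausibly torch.save metadata.
```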
model_state_layer_1_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c0c61714acb70fd0d08767834c71248a68fe3a190a1d9566eabb7eb50088276c
+ size 100609186
model_state_layer_2_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a5326f1fd2b68509ebb591711eb8be77daa506c6af9c1a9acc93f73d3b96bd0a
+ size 100609186
model_state_layer_3_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a07667f3ed172d21a1c398051bfba52cc47bedc1a5b2a7e24de32a4949659b5d
+ size 100609186
model_state_layer_4_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c778568fea16842d20b696a66f97af7295bd045aff360db457cc6c5135fcbef6
+ size 100609186
model_state_layer_5_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cc381814789d626afc7c6cdc73f2d91982872cfebb3e90505e767074e697e341
+ size 100609186
model_state_layer_6_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:534d9bcf4e5beecfbaf900f61f2cbcbcb85bfa4493429b979577bf89b26047ad
+ size 100609186
model_state_layer_7_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7d18824aa2a28cbf28bc1ebed8e5361ac1f043d8fd2a1f98a487b13e458a8992
+ size 100609186
model_state_layer_8_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:916d82a54298804d1c7e054ab5c1f18245f6022f2a72071c6e3553523c62821b
+ size 100609186
model_state_layer_9_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:de7054675fa0cbaee27099032b2cb11ba452d00eb2124e354b89f966e21dfac6
+ size 100609186
vocab.json ADDED
The diff for this file is too large to render.