Text Generation
scaling
GregorZiegltrumAA committed
Commit 0b781cd
1 parent: ef7e0a3
Files changed (37)
  1. config.yml +79 -0
  2. model_state_layer_0_EmbeddingInput.pt +3 -0
  3. model_state_layer_10_TransformerLayer.pt +3 -0
  4. model_state_layer_11_TransformerLayer.pt +3 -0
  5. model_state_layer_12_TransformerLayer.pt +3 -0
  6. model_state_layer_13_TransformerLayer.pt +3 -0
  7. model_state_layer_14_TransformerLayer.pt +3 -0
  8. model_state_layer_15_TransformerLayer.pt +3 -0
  9. model_state_layer_16_TransformerLayer.pt +3 -0
  10. model_state_layer_17_TransformerLayer.pt +3 -0
  11. model_state_layer_18_TransformerLayer.pt +3 -0
  12. model_state_layer_19_TransformerLayer.pt +3 -0
  13. model_state_layer_1_TransformerLayer.pt +3 -0
  14. model_state_layer_20_TransformerLayer.pt +3 -0
  15. model_state_layer_21_TransformerLayer.pt +3 -0
  16. model_state_layer_22_TransformerLayer.pt +3 -0
  17. model_state_layer_23_TransformerLayer.pt +3 -0
  18. model_state_layer_24_TransformerLayer.pt +3 -0
  19. model_state_layer_25_TransformerLayer.pt +3 -0
  20. model_state_layer_26_TransformerLayer.pt +3 -0
  21. model_state_layer_27_TransformerLayer.pt +3 -0
  22. model_state_layer_28_TransformerLayer.pt +3 -0
  23. model_state_layer_29_TransformerLayer.pt +3 -0
  24. model_state_layer_2_TransformerLayer.pt +3 -0
  25. model_state_layer_30_TransformerLayer.pt +3 -0
  26. model_state_layer_31_TransformerLayer.pt +3 -0
  27. model_state_layer_32_TransformerLayer.pt +3 -0
  28. model_state_layer_33_LayerNormWrapper.pt +3 -0
  29. model_state_layer_34_TransformerLMHead.pt +3 -0
  30. model_state_layer_3_TransformerLayer.pt +3 -0
  31. model_state_layer_4_TransformerLayer.pt +3 -0
  32. model_state_layer_5_TransformerLayer.pt +3 -0
  33. model_state_layer_6_TransformerLayer.pt +3 -0
  34. model_state_layer_7_TransformerLayer.pt +3 -0
  35. model_state_layer_8_TransformerLayer.pt +3 -0
  36. model_state_layer_9_TransformerLayer.pt +3 -0
  37. vocab.json +0 -0
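
The checkpoint is sharded one module per file: an input embedding, 32 transformer layers, a final norm, and an LM head, matching the 2-stage pipeline-parallel layout in config.yml below. As a rough illustration (not the loader shipped with the training framework), merging the shards back into a single CPU state dict could look like the sketch below, assuming each .pt file holds a plain state_dict for its layer and the LFS blobs have been pulled:

```python
import re
from pathlib import Path

import torch

def load_sharded_checkpoint(ckpt_dir: str) -> dict:
    """Merge per-layer shards (model_state_layer_<i>_<Module>.pt) into one
    flat state dict keyed by layer index. Illustrative only: assumes each
    file stores a plain state_dict and that the LFS blobs were pulled."""
    pattern = re.compile(r"model_state_layer_(\d+)_(\w+)\.pt")
    merged = {}
    for path in sorted(Path(ckpt_dir).glob("model_state_layer_*.pt")):
        match = pattern.fullmatch(path.name)
        if match is None:
            continue
        layer_idx = int(match.group(1))
        layer_state = torch.load(path, map_location="cpu")
        for key, tensor in layer_state.items():
            merged[f"layer_{layer_idx}.{key}"] = tensor
    return merged
```
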
config.yml ADDED
@@ -0,0 +1,79 @@
+ optimizer:
+   allreduce_bucket_size: 500000000
+   beta1: 0.9
+   beta2: 0.95
+   debug_log: false
+   eps: 1e-08
+   gradient_clipping: 0.0
+   zero: true
+   zero_save_static: false
+ topology:
+   activation_checkpointing_type: disabled
+   global_batch_size: 1024
+   gradient_accumulation_steps: 8
+   micro_batch_size: 1
+   model_parallel_size: 1
+   pipe_parallel_size: 2
+   pipe_partition_method: balanced
+   pipe_partition_overwrite: null
+   sequence_parallel: false
+ trainer:
+   seed: 42
+   train_iterations: 72000
+ training:
+   allow_missing_params_in_optimizer: true
+   training_groups:
+   - group_name: param_group
+     independent_weight_decay: false
+     learning_rate_scheduler:
+       learning_rate: 0.0003
+       learning_rate_decay_iters: 72000
+       learning_rate_decay_style: cosine
+       learning_rate_minimum: 3e-05
+       learning_rate_warmup_steps: 500
+     parameters_exclude: []
+     weight_decay: 0.1
+ transformer_architecture:
+   attention_bias: false
+   attention_num_kv_heads: null
+   attention_qkv_in_one: true
+   dropout_after_attention: 0.0
+   dropout_after_mlp: 0.0
+   dropout_attention_probs: 0.0
+   dropout_embedding: 0.0
+   dropout_image_encoder: 0.0
+   hidden_size: 4096
+   image_encoder: false
+   key_query_norm: false
+   layernorm:
+     layernorm_epsilon: 1e-05
+     optimization_type: torch
+   local_attention_window_size: null
+   masked_softmax:
+     kernel: flash_attention
+     scale: 1.0
+     softmax_in_fp32: false
+   mlp_bias: false
+   mlp_factor: 2.66796875
+   mlp_type: swiglu
+   norm_type: rms
+   num_attention_heads: 32
+   num_layers: 32
+   num_local_attention_heads: 0
+   precision: bfloat16
+   relative_position_embedding_type: rotary_complex
+   reset_attention_mask: false
+   reset_position_ids: false
+   rotary_embedding_base: 10000
+   rotary_percentage: 1.0
+   sequence_length: 4096
+   umup:
+     act_mult: 1.0
+     attn_mult: 1.0
+     enable: false
+     loss_mult: 1.0
+     residual_attn_ratio: 1.0
+     residual_mult: 1.0
+   vocab_file: null
+   vocab_size: 65536
+   weight_tying: false
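
The architecture block pins down the model size, so the shard sizes below can be sanity-checked by hand. A back-of-the-envelope sketch (assumptions: bf16 = 2 bytes per parameter, fused QKV plus output projection for attention, the usual three-matrix SwiGLU MLP, and two RMSNorm vectors per layer; none of this layout is stated in the diff itself):

```python
# Back-of-the-envelope check of the architecture above. Assumes bf16 = 2
# bytes/param; the few-KB mismatch vs. the .pt files is torch.save overhead.
hidden = 4096
layers = 32
vocab = 65536

ffn = int(2.66796875 * hidden)       # 10928: SwiGLU inner width (mlp_factor)
attn = 4 * hidden * hidden           # fused QKV (3h*h) + output proj (h*h), no bias
mlp = 3 * hidden * ffn               # gate, up and down projections, no bias
norms = 2 * hidden                   # two per-layer RMSNorm weight vectors
per_layer = attn + mlp + norms       # 201,400,320 params

embedding = vocab * hidden           # 268,435,456 params; weight_tying is false,
                                     # so the LM head is a second matrix this size
total = 2 * embedding + layers * per_layer + hidden  # + final norm (layer 33)

print(f"~{total / 1e9:.2f}B params")                   # ~6.98B
print(f"layer shard ≈ {2 * per_layer:,} bytes")        # 402,800,640 vs. 402,803,8xx
print(f"embedding shard ≈ {2 * embedding:,} bytes")    # 536,870,912 vs. 536,872,395

# Token budget: global_batch_size * sequence_length * train_iterations
print(f"~{1024 * 4096 * 72000 / 1e9:.0f}B tokens")     # ~302B
```

The topology block implies the same kind of arithmetic: a global batch of 1024 with micro batch 1 and 8 gradient-accumulation steps gives 1024 / (1 × 8) = 128 data-parallel replicas, each running a 2-stage pipeline.
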
model_state_layer_0_EmbeddingInput.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:db3a9d7e85ececaa362b11d794ba4cc702ea4e9d5b3103eb0d244ba3b28ab68c
+ size 536872395
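
Each weight file in this commit is stored as a three-line Git LFS pointer like the one above (version, oid, size) rather than the blob itself. A minimal sketch for verifying a separately downloaded blob against such a pointer; the file paths in the usage note are illustrative:

```python
import hashlib
from pathlib import Path

def verify_lfs_pointer(pointer_path: str, blob_path: str) -> bool:
    """Check a downloaded blob against a three-line Git LFS pointer file."""
    # Parse "version ...", "oid sha256:<hex>", "size <bytes>" into a dict.
    fields = dict(
        line.split(" ", 1)
        for line in Path(pointer_path).read_text().splitlines()
        if line.strip()
    )
    expected_oid = fields["oid"].removeprefix("sha256:")
    expected_size = int(fields["size"])

    blob = Path(blob_path)
    if blob.stat().st_size != expected_size:
        return False
    digest = hashlib.sha256()
    with blob.open("rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):  # 1 MiB chunks
            digest.update(chunk)
    return digest.hexdigest() == expected_oid
```

For example, `verify_lfs_pointer("model_state_layer_0_EmbeddingInput.pt", "/tmp/embedding.pt")` would recompute the SHA-256 of the downloaded blob and compare it to the oid above.
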
model_state_layer_10_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c7f64f4339b9818eaecb52cca991b45c81e0a2616acebcc8c54f6e4481fd7462
+ size 402803885
model_state_layer_11_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:83bec24a802615c9ca106ce57d0d0d8415cfbc641de68b612f6d97b0e3bd6ed4
+ size 402803885
model_state_layer_12_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1b5ef026010b0c065940b691bc9b5f601698a800656abe45a276ac8c53f9b74c
+ size 402803885
model_state_layer_13_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9dc6fe8f49aa40b0828f23d525d5ca4fec65168bf090552cb0461af8786cdce6
+ size 402803885
model_state_layer_14_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:450e9062d73485f468f911a55fcbf77cc8cbac5c36d6d72654ae0ee7a9b5278e
+ size 402803885
model_state_layer_15_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:337bfa956d740e45435317c07790ae3806bbf7786d893034f37028473300ce14
+ size 402803885
model_state_layer_16_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:21df8c7655022de1b46e041ed9918ddc5103fb2d13e99f1901183bb69dac7baa
+ size 402803885
model_state_layer_17_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c57b5319a959ffe5819ce6500935ad2f38853b1a9609490d0f7253b18a80c040
+ size 402803885
model_state_layer_18_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5ddb3a378c6eb0efa65d16dc99ccf69acc847d2dd2bf89b1b5ab08423243ab85
+ size 402803885
model_state_layer_19_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c1cf07b479e6c8c21cf6254962be145928a66d94022298546c3cbc2d4693f5c1
+ size 402803885
model_state_layer_1_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8a0953aee2d52eff3c04e7e6be88609780ceb49afbdfb7a48888d65b9a638293
+ size 402803874
model_state_layer_20_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b90412d8e8512387a3be56e8776feba9bea8b23f46f0b801ce695fced782e4ed
+ size 402803885
model_state_layer_21_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:416511ba15caa0b3231cb875b5ead45c45adb27aa773db5441decf0affdaa6a9
+ size 402803885
model_state_layer_22_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5fb53b72db3f69aecc5a3106bd34ef57c74b63db8bd5caf94b17008c12be5113
+ size 402803885
model_state_layer_23_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5ce43a59282d619f4e1e86e7103e1a27471c0a4fbd7ad20067376a6b5314a64a
+ size 402803885
model_state_layer_24_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:207f4d73823910c8cf07d72c557d53fbed0fe25866aaabfb5d6fa10374575d96
+ size 402803885
model_state_layer_25_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:60227a5d79d26dcda13c96116574b9529b7d467ef2b8257d15af601f4ec72e24
+ size 402803885
model_state_layer_26_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c84f768cb95b1daf8b513041ee910009672a1b8471180d5080fdf15c376c1ff4
+ size 402803885
model_state_layer_27_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4a628bc469cdfe00f9f34144ca67b1500ff9171f4050c423c9703e7609183273
+ size 402803885
model_state_layer_28_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d415dd867bf7aa8fa39f7fd7730131c8e149c2ddafd5b4128f72c87b2e8c56ad
+ size 402803885
model_state_layer_29_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:92f0dab41b00b5a80f9860f0072808e6f2eba40791056fda11b01b76ccd753bb
+ size 402803885
model_state_layer_2_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a1289782c306a10ddf9c3c4f00cbab98adcfb8092dab9c5dfdd3e7de0cd9881d
+ size 402803874
model_state_layer_30_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fe3d214468807be82f5f559c12aa8a198b88a763538edffcacc1487832af4c7e
+ size 402803885
model_state_layer_31_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:55c6d955811f8e64ce5f6f1db85d209f82b1699154816e06d8b561f24661abee
+ size 402803885
model_state_layer_32_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ca470a34c8cfffdfb88160b4f7e1050e06763045385f73446f7478656d2c76f4
+ size 402803885
model_state_layer_33_LayerNormWrapper.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b8c8281b03be45b8dd25c6c72b6b44964a7ac1c595ab91abda4c136d9708c0fc
+ size 9650
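
The much smaller shard here is consistent with a single 4096-element RMSNorm weight vector in bfloat16: 4096 × 2 = 8,192 bytes of tensor data, with the remainder being torch.save serialization overhead. This would be the final norm (norm_type: rms in config.yml) applied before the LM head.
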
model_state_layer_34_TransformerLMHead.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:89bca75db8775f40c4dcebc3edf0fa19927aa6f63dd0399ce48dcb5865a28a32
+ size 536872360
model_state_layer_3_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a235f28c9aa19c280d9685421ea7cf85f9d25e6822c1a97c671d1e001eae5c1a
+ size 402803874
model_state_layer_4_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9107e3e2b31f27307d540c0696e12f4e328324aa2997db70743eea1be9cc60bf
+ size 402803874
model_state_layer_5_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f0664480ca5c512aa015b832a9381c9475e67ba316d1fdc1ffbd18fb81efee06
+ size 402803874
model_state_layer_6_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:53646a507cfaf7ea44288993034d40b6d6528511ff56d8986d86f0772c5f4032
+ size 402803874
model_state_layer_7_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:180ec2fca7563a85e1e52765885b7a25f685c8f549fdd9e50fe9ec68732c79a0
+ size 402803874
model_state_layer_8_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5728b6651ea8ff001513b2a0e2d7463c62fbf021df8ed8524fdda267d787b9ba
+ size 402803874
model_state_layer_9_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b1893c6df68ece85e3d97e895257bd3de3a15147c09d83088e107b2e50ff8eaf
+ size 402803874
vocab.json ADDED
The diff for this file is too large to render.
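
The config sets vocab_size: 65536 and vocab_file: null, so vocab.json presumably carries the tokenizer vocabulary. Assuming it is a plain token-to-id JSON mapping (the format is not visible in this diff), a quick consistency check:

```python
import json

# Assumption: vocab.json maps token strings to integer ids; the exact schema
# is not shown in this diff.
with open("vocab.json", encoding="utf-8") as f:
    vocab = json.load(f)

assert len(vocab) == 65536, len(vocab)  # should match vocab_size in config.yml
```
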