GregorZiegltrumAA committed
Commit 0b781cd
Parent(s): ef7e0a3
Add model
Browse files
- config.yml +79 -0
- model_state_layer_0_EmbeddingInput.pt +3 -0
- model_state_layer_10_TransformerLayer.pt +3 -0
- model_state_layer_11_TransformerLayer.pt +3 -0
- model_state_layer_12_TransformerLayer.pt +3 -0
- model_state_layer_13_TransformerLayer.pt +3 -0
- model_state_layer_14_TransformerLayer.pt +3 -0
- model_state_layer_15_TransformerLayer.pt +3 -0
- model_state_layer_16_TransformerLayer.pt +3 -0
- model_state_layer_17_TransformerLayer.pt +3 -0
- model_state_layer_18_TransformerLayer.pt +3 -0
- model_state_layer_19_TransformerLayer.pt +3 -0
- model_state_layer_1_TransformerLayer.pt +3 -0
- model_state_layer_20_TransformerLayer.pt +3 -0
- model_state_layer_21_TransformerLayer.pt +3 -0
- model_state_layer_22_TransformerLayer.pt +3 -0
- model_state_layer_23_TransformerLayer.pt +3 -0
- model_state_layer_24_TransformerLayer.pt +3 -0
- model_state_layer_25_TransformerLayer.pt +3 -0
- model_state_layer_26_TransformerLayer.pt +3 -0
- model_state_layer_27_TransformerLayer.pt +3 -0
- model_state_layer_28_TransformerLayer.pt +3 -0
- model_state_layer_29_TransformerLayer.pt +3 -0
- model_state_layer_2_TransformerLayer.pt +3 -0
- model_state_layer_30_TransformerLayer.pt +3 -0
- model_state_layer_31_TransformerLayer.pt +3 -0
- model_state_layer_32_TransformerLayer.pt +3 -0
- model_state_layer_33_LayerNormWrapper.pt +3 -0
- model_state_layer_34_TransformerLMHead.pt +3 -0
- model_state_layer_3_TransformerLayer.pt +3 -0
- model_state_layer_4_TransformerLayer.pt +3 -0
- model_state_layer_5_TransformerLayer.pt +3 -0
- model_state_layer_6_TransformerLayer.pt +3 -0
- model_state_layer_7_TransformerLayer.pt +3 -0
- model_state_layer_8_TransformerLayer.pt +3 -0
- model_state_layer_9_TransformerLayer.pt +3 -0
- vocab.json +0 -0
config.yml
ADDED
@@ -0,0 +1,79 @@
+optimizer:
+  allreduce_bucket_size: 500000000
+  beta1: 0.9
+  beta2: 0.95
+  debug_log: false
+  eps: 1e-08
+  gradient_clipping: 0.0
+  zero: true
+  zero_save_static: false
+topology:
+  activation_checkpointing_type: disabled
+  global_batch_size: 1024
+  gradient_accumulation_steps: 8
+  micro_batch_size: 1
+  model_parallel_size: 1
+  pipe_parallel_size: 2
+  pipe_partition_method: balanced
+  pipe_partition_overwrite: null
+  sequence_parallel: false
+trainer:
+  seed: 42
+  train_iterations: 72000
+training:
+  allow_missing_params_in_optimizer: true
+  training_groups:
+  - group_name: param_group
+    independent_weight_decay: false
+    learning_rate_scheduler:
+      learning_rate: 0.0003
+      learning_rate_decay_iters: 72000
+      learning_rate_decay_style: cosine
+      learning_rate_minimum: 3e-05
+      learning_rate_warmup_steps: 500
+    parameters_exclude: []
+    weight_decay: 0.1
+transformer_architecture:
+  attention_bias: false
+  attention_num_kv_heads: null
+  attention_qkv_in_one: true
+  dropout_after_attention: 0.0
+  dropout_after_mlp: 0.0
+  dropout_attention_probs: 0.0
+  dropout_embedding: 0.0
+  dropout_image_encoder: 0.0
+  hidden_size: 4096
+  image_encoder: false
+  key_query_norm: false
+  layernorm:
+    layernorm_epsilon: 1e-05
+    optimization_type: torch
+  local_attention_window_size: null
+  masked_softmax:
+    kernel: flash_attention
+    scale: 1.0
+    softmax_in_fp32: false
+  mlp_bias: false
+  mlp_factor: 2.66796875
+  mlp_type: swiglu
+  norm_type: rms
+  num_attention_heads: 32
+  num_layers: 32
+  num_local_attention_heads: 0
+  precision: bfloat16
+  relative_position_embedding_type: rotary_complex
+  reset_attention_mask: false
+  reset_position_ids: false
+  rotary_embedding_base: 10000
+  rotary_percentage: 1.0
+  sequence_length: 4096
+  umup:
+    act_mult: 1.0
+    attn_mult: 1.0
+    enable: false
+    loss_mult: 1.0
+    residual_attn_ratio: 1.0
+    residual_mult: 1.0
+  vocab_file: null
+  vocab_size: 65536
+  weight_tying: false
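For readers checking the upload, the .pt file sizes below are consistent with this config at precision: bfloat16. A minimal sketch of that arithmetic follows; the per-block layout (fused QKV plus output projection, SwiGLU gate/up/down, two RMSNorm weight vectors) is an assumption inferred from the config keys, not taken from this repository's code.

```python
# Hedged sanity check: derive parameter counts from config.yml and compare
# against the bfloat16 .pt file sizes in this commit. The block layout is
# an assumption inferred from the config, not from the training framework.
hidden = 4096                         # hidden_size
vocab = 65536                         # vocab_size
mlp_inner = int(hidden * 2.66796875)  # mlp_factor -> 10928

attn = 4 * hidden * hidden            # fused QKV (3h^2) + out proj (h^2); attention_bias: false
mlp = 3 * hidden * mlp_inner          # swiglu: gate, up, down projections; mlp_bias: false
norms = 2 * hidden                    # two rms norm weight vectors per block
per_layer = attn + mlp + norms        # 201,400,320 parameters

print(2 * per_layer)                  # ~402,800,640 bytes -> matches the ~402,803,885-byte
                                      # TransformerLayer files (remainder is torch.save metadata)
print(2 * vocab * hidden)             # ~536,870,912 bytes -> matches EmbeddingInput and
                                      # TransformerLMHead (weight_tying: false, so two copies)
```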
model_state_layer_0_EmbeddingInput.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:db3a9d7e85ececaa362b11d794ba4cc702ea4e9d5b3103eb0d244ba3b28ab68c
+size 536872395
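Each weight file in this commit is a Git LFS pointer rather than the tensor data itself: three key-value lines (version, oid, size), with the real payload fetched on git lfs pull. A minimal sketch of reading one, assuming the checkout still contains pointer files:

```python
# Parse a Git LFS pointer file (three "key value" lines: version, oid, size).
# Only meaningful before `git lfs pull`; afterwards the path holds real weights.
def parse_lfs_pointer(path: str) -> dict:
    with open(path) as f:
        fields = dict(line.strip().split(" ", 1) for line in f if line.strip())
    fields["size"] = int(fields["size"])
    return fields

info = parse_lfs_pointer("model_state_layer_0_EmbeddingInput.pt")
print(info["oid"], info["size"])  # sha256:db3a9d7e... 536872395
```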
model_state_layer_10_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c7f64f4339b9818eaecb52cca991b45c81e0a2616acebcc8c54f6e4481fd7462
+size 402803885
model_state_layer_11_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:83bec24a802615c9ca106ce57d0d0d8415cfbc641de68b612f6d97b0e3bd6ed4
+size 402803885
model_state_layer_12_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1b5ef026010b0c065940b691bc9b5f601698a800656abe45a276ac8c53f9b74c
+size 402803885
model_state_layer_13_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9dc6fe8f49aa40b0828f23d525d5ca4fec65168bf090552cb0461af8786cdce6
+size 402803885
model_state_layer_14_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:450e9062d73485f468f911a55fcbf77cc8cbac5c36d6d72654ae0ee7a9b5278e
+size 402803885
model_state_layer_15_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:337bfa956d740e45435317c07790ae3806bbf7786d893034f37028473300ce14
+size 402803885
model_state_layer_16_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:21df8c7655022de1b46e041ed9918ddc5103fb2d13e99f1901183bb69dac7baa
+size 402803885
model_state_layer_17_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c57b5319a959ffe5819ce6500935ad2f38853b1a9609490d0f7253b18a80c040
+size 402803885
model_state_layer_18_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5ddb3a378c6eb0efa65d16dc99ccf69acc847d2dd2bf89b1b5ab08423243ab85
+size 402803885
model_state_layer_19_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c1cf07b479e6c8c21cf6254962be145928a66d94022298546c3cbc2d4693f5c1
+size 402803885
model_state_layer_1_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8a0953aee2d52eff3c04e7e6be88609780ceb49afbdfb7a48888d65b9a638293
+size 402803874
model_state_layer_20_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b90412d8e8512387a3be56e8776feba9bea8b23f46f0b801ce695fced782e4ed
+size 402803885
model_state_layer_21_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:416511ba15caa0b3231cb875b5ead45c45adb27aa773db5441decf0affdaa6a9
+size 402803885
model_state_layer_22_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5fb53b72db3f69aecc5a3106bd34ef57c74b63db8bd5caf94b17008c12be5113
+size 402803885
model_state_layer_23_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5ce43a59282d619f4e1e86e7103e1a27471c0a4fbd7ad20067376a6b5314a64a
+size 402803885
model_state_layer_24_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:207f4d73823910c8cf07d72c557d53fbed0fe25866aaabfb5d6fa10374575d96
+size 402803885
model_state_layer_25_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:60227a5d79d26dcda13c96116574b9529b7d467ef2b8257d15af601f4ec72e24
+size 402803885
model_state_layer_26_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c84f768cb95b1daf8b513041ee910009672a1b8471180d5080fdf15c376c1ff4
+size 402803885
model_state_layer_27_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4a628bc469cdfe00f9f34144ca67b1500ff9171f4050c423c9703e7609183273
+size 402803885
model_state_layer_28_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d415dd867bf7aa8fa39f7fd7730131c8e149c2ddafd5b4128f72c87b2e8c56ad
+size 402803885
model_state_layer_29_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:92f0dab41b00b5a80f9860f0072808e6f2eba40791056fda11b01b76ccd753bb
+size 402803885
model_state_layer_2_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a1289782c306a10ddf9c3c4f00cbab98adcfb8092dab9c5dfdd3e7de0cd9881d
+size 402803874
model_state_layer_30_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fe3d214468807be82f5f559c12aa8a198b88a763538edffcacc1487832af4c7e
+size 402803885
model_state_layer_31_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:55c6d955811f8e64ce5f6f1db85d209f82b1699154816e06d8b561f24661abee
+size 402803885
model_state_layer_32_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ca470a34c8cfffdfb88160b4f7e1050e06763045385f73446f7478656d2c76f4
+size 402803885
model_state_layer_33_LayerNormWrapper.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b8c8281b03be45b8dd25c6c72b6b44964a7ac1c595ab91abda4c136d9708c0fc
+size 9650
model_state_layer_34_TransformerLMHead.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:89bca75db8775f40c4dcebc3edf0fa19927aa6f63dd0399ce48dcb5865a28a32
+size 536872360
model_state_layer_3_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a235f28c9aa19c280d9685421ea7cf85f9d25e6822c1a97c671d1e001eae5c1a
+size 402803874
model_state_layer_4_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9107e3e2b31f27307d540c0696e12f4e328324aa2997db70743eea1be9cc60bf
+size 402803874
model_state_layer_5_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f0664480ca5c512aa015b832a9381c9475e67ba316d1fdc1ffbd18fb81efee06
+size 402803874
model_state_layer_6_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:53646a507cfaf7ea44288993034d40b6d6528511ff56d8986d86f0772c5f4032
+size 402803874
model_state_layer_7_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:180ec2fca7563a85e1e52765885b7a25f685c8f549fdd9e50fe9ec68732c79a0
+size 402803874
model_state_layer_8_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5728b6651ea8ff001513b2a0e2d7463c62fbf021df8ed8524fdda267d787b9ba
+size 402803874
model_state_layer_9_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b1893c6df68ece85e3d97e895257bd3de3a15147c09d83088e107b2e50ff8eaf
+size 402803874
vocab.json
ADDED
The diff for this file is too large to render.
See raw diff
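The checkpoint is sharded by layer index (0 = EmbeddingInput, 1-32 = TransformerLayer, 33 = LayerNormWrapper, 34 = TransformerLMHead). A minimal sketch for collecting the shards after git lfs pull, assuming each file is an ordinary PyTorch state dict (the uploading framework's own loader is not part of this commit):

```python
# Gather the layer-sharded checkpoint files by layer index. Assumes the
# repository has been cloned and `git lfs pull` has replaced the pointer
# files with real tensors; torch.load on an LFS pointer would fail.
import re
from pathlib import Path

import torch

shards = {}
for path in Path(".").glob("model_state_layer_*.pt"):
    idx = int(re.match(r"model_state_layer_(\d+)_", path.name).group(1))
    shards[idx] = torch.load(path, map_location="cpu")

# Expect 35 entries: embedding, 32 transformer blocks, final norm, LM head.
print(len(shards), min(shards), max(shards))
```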