lapp0 committed on
Commit
608f480
·
verified ·
1 Parent(s): 87411d0

End of training

Browse files
README.md CHANGED
@@ -77,7 +77,7 @@ LlamaForCausalLM(
77
 
78
  # Resource Usage
79
 
80
- - Max Train VRAM Use: 13.1269 GB
81
  - Available VRAM: 23.4329 GB
82
  - GPUs:
83
  - 1x NVIDIA GeForce RTX 4090
@@ -107,28 +107,6 @@ LlamaForCausalLM(
107
  (self_attn): LlamaSdpaAttention(
108
  (q_proj): Linear(in_features=576, out_features=576, bias=False)
109
  (k_proj): Linear(in_features=576, out_features=192, bias=False)
110
- @@ -10,17 +10,16 @@
111
- (o_proj): Linear(in_features=576, out_features=576, bias=False)
112
- (rotary_emb): LlamaRotaryEmbedding()
113
- )
114
- - (mlp): LlamaMLP(
115
- + (mlp): LigerSwiGLUMLP(
116
- (gate_proj): Linear(in_features=576, out_features=1536, bias=False)
117
- (up_proj): Linear(in_features=576, out_features=1536, bias=False)
118
- (down_proj): Linear(in_features=1536, out_features=576, bias=False)
119
- - (act_fn): SiLU()
120
- )
121
- - (input_layernorm): LlamaRMSNorm((576,), eps=1e-05)
122
- - (post_attention_layernorm): LlamaRMSNorm((576,), eps=1e-05)
123
- + (input_layernorm): LigerRMSNorm((576,), eps=1e-05, offset=0.0)
124
- + (post_attention_layernorm): LigerRMSNorm((576,), eps=1e-05, offset=0.0)
125
- )
126
- )
127
- - (norm): LlamaRMSNorm((576,), eps=1e-05)
128
- + (norm): LigerRMSNorm((576,), eps=1e-05, offset=0.0)
129
- (rotary_emb): LlamaRotaryEmbedding()
130
- )
131
- (lm_head): Linear(in_features=576, out_features=49152, bias=False)
132
 
133
  ```
134
 
@@ -136,7 +114,7 @@ LlamaForCausalLM(
136
  <br/>
137
 
138
  # Train Dataset
139
- Trained on 553,289,312 tokens from the [wikimedia/wikipedia](https://huggingface.co/datasets/wikimedia/wikipedia) dataset.
140
 
141
  - Num Samples: `998,000`
142
  - Subset: `20231101.en`
@@ -185,7 +163,7 @@ The following hyperparameters were used during training:
185
  weight=0
186
  )
187
  )`
188
- - lr_scheduler: `<torch.optim.lr_scheduler.LambdaLR object at 0x777e85f7fee0>`
189
  - student_model_name_or_path: `None`
190
  - student_config_name_or_path: `None`
191
  - student_model_config: `{'num_hidden_layers': 15}`
 
77
 
78
  # Resource Usage
79
 
80
+ - Max Train VRAM Use: 13.1273 GB
81
  - Available VRAM: 23.4329 GB
82
  - GPUs:
83
  - 1x NVIDIA GeForce RTX 4090
 
107
  (self_attn): LlamaSdpaAttention(
108
  (q_proj): Linear(in_features=576, out_features=576, bias=False)
109
  (k_proj): Linear(in_features=576, out_features=192, bias=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
 
111
  ```
112
 
 
114
  <br/>
115
 
116
  # Train Dataset
117
+ Trained on 553,266,374 tokens from the [wikimedia/wikipedia](https://huggingface.co/datasets/wikimedia/wikipedia) dataset.
118
 
119
  - Num Samples: `998,000`
120
  - Subset: `20231101.en`
 
163
  weight=0
164
  )
165
  )`
166
+ - lr_scheduler: `<torch.optim.lr_scheduler.LambdaLR object at 0x777cd1a972b0>`
167
  - student_model_name_or_path: `None`
168
  - student_config_name_or_path: `None`
169
  - student_model_config: `{'num_hidden_layers': 15}`
logs/learning_rate=0.0001, lr_scheduler_kwargs=__power___1.0___lr_end___2e-05_, lr_scheduler_type=polynomial, per_device_train_batch_size=8/events.out.tfevents.1726659900.1c1a426a2fee ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2b3327c6372ca3250e36ae5073eb60a778691dff1e2a96f7f85f8305ad3d8230
3
+ size 529