lapp0 committed · verified
Commit e137ab6 · 1 Parent(s): 7273e96

End of training

README.md CHANGED
@@ -107,28 +107,6 @@ LlamaForCausalLM(
      (self_attn): LlamaSdpaAttention(
        (q_proj): Linear(in_features=576, out_features=576, bias=False)
        (k_proj): Linear(in_features=576, out_features=192, bias=False)
- @@ -10,17 +10,16 @@
-       (o_proj): Linear(in_features=576, out_features=576, bias=False)
-       (rotary_emb): LlamaRotaryEmbedding()
-     )
- -   (mlp): LlamaMLP(
- +   (mlp): LigerSwiGLUMLP(
-       (gate_proj): Linear(in_features=576, out_features=1536, bias=False)
-       (up_proj): Linear(in_features=576, out_features=1536, bias=False)
-       (down_proj): Linear(in_features=1536, out_features=576, bias=False)
- -     (act_fn): SiLU()
-     )
- -   (input_layernorm): LlamaRMSNorm((576,), eps=1e-05)
- -   (post_attention_layernorm): LlamaRMSNorm((576,), eps=1e-05)
- +   (input_layernorm): LigerRMSNorm((576,), eps=1e-05, offset=0.0)
- +   (post_attention_layernorm): LigerRMSNorm((576,), eps=1e-05, offset=0.0)
-     )
-   )
- - (norm): LlamaRMSNorm((576,), eps=1e-05)
- + (norm): LigerRMSNorm((576,), eps=1e-05, offset=0.0)
-   (rotary_emb): LlamaRotaryEmbedding()
- )
- (lm_head): Linear(in_features=576, out_features=49152, bias=False)
 
  ```
 
@@ -136,7 +114,7 @@ LlamaForCausalLM(
  <br/>
 
  # Train Dataset
- Trained on 687,245,234 tokens from the [wikimedia/wikipedia](https://huggingface.co/datasets/wikimedia/wikipedia) dataset.
+ Trained on 687,248,443 tokens from the [wikimedia/wikipedia](https://huggingface.co/datasets/wikimedia/wikipedia) dataset.
 
  - Num Samples: `1,996,000`
  - Subset: `20231101.en`
@@ -166,12 +144,13 @@ The following hyperparameters were used during training:
  <details>
  <summary>Expand</summary>
 
- - learning_rate: `0.0001`
+ - learning_rate: `0.0002`
  - train_batch_size: `16`
  - eval_batch_size: `2`
  - seed: `42`
  - optimizer: `Adam with betas=(0.9,0.999) and epsilon=1e-08`
  - lr_scheduler_type: `polynomial`
+ - lr_scheduler_warmup_ratio: `0.1`
  - num_epochs: `1.0`
  - distillation_objective: `DistillationObjective(
      logits_loss_component=LossComponent(
@@ -185,7 +164,7 @@ The following hyperparameters were used during training:
        weight=0
      )
  )`
- - lr_scheduler: `<torch.optim.lr_scheduler.LambdaLR object at 0x76c9e8244e20>`
+ - lr_scheduler: `<torch.optim.lr_scheduler.LambdaLR object at 0x76ca0d527850>`
  - student_model_name_or_path: `None`
  - student_config_name_or_path: `None`
  - student_model_config: `{'num_hidden_layers': 15}`
@@ -209,7 +188,7 @@ The following hyperparameters were used during training:
  - gradient_accumulation_steps: `1`
  - weight_decay: `0.0`
  - max_grad_norm: `1.0`
- - warmup_ratio: `0.0`
+ - warmup_ratio: `0.1`
  - warmup_steps: `0`
  - gradient_checkpointing: `True`
 
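For orientation, the hyperparameter changes in this commit (learning_rate `0.0001` → `0.0002`, warmup_ratio `0.0` → `0.1`, polynomial schedule) map onto standard `transformers.TrainingArguments` roughly as sketched below. This is illustrative only: the actual run uses a custom distillation trainer with the `DistillationObjective` listed above, so the field names here are assumptions, not the repository's training script.

```python
# Minimal sketch, assuming a standard transformers Trainer setup;
# the real run wraps these values in a distillation-specific trainer.
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="distill-student",          # placeholder output path
    learning_rate=2e-4,                    # 0.0001 -> 0.0002 in this commit
    per_device_train_batch_size=16,
    per_device_eval_batch_size=2,
    seed=42,
    lr_scheduler_type="polynomial",
    warmup_ratio=0.1,                      # 0.0 -> 0.1 in this commit
    num_train_epochs=1.0,
    weight_decay=0.0,
    max_grad_norm=1.0,
    gradient_accumulation_steps=1,
    gradient_checkpointing=True,
)
```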
logs/dataset_max_seq_length=512, dataset_sample_size=2000000, per_device_train_batch_size=16, warmup_ratio=0.1/events.out.tfevents.1726422177.1c1a426a2fee ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f67b97d9b0dce17aa6806d6ece3a1b8db3aaccd655ecf36fb23a26bdfc8ed070
+ size 529
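The added file is a Git LFS pointer to a TensorBoard event log, so only the oid/size stub lives in the diff. A sketch of one way to fetch and inspect it is shown below; the repo id is a placeholder, while the filename is the path added in this commit.

```python
# Sketch only (not part of the commit): download the LFS-backed tfevents file
# from the Hub and print its scalar metrics with TensorBoard's reader.
from huggingface_hub import hf_hub_download
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator

event_file = hf_hub_download(
    repo_id="lapp0/<this-model-repo>",  # placeholder: use the actual model repo id
    filename=(
        "logs/dataset_max_seq_length=512, dataset_sample_size=2000000, "
        "per_device_train_batch_size=16, warmup_ratio=0.1/"
        "events.out.tfevents.1726422177.1c1a426a2fee"
    ),
)

acc = EventAccumulator(event_file)
acc.Reload()
for tag in acc.Tags()["scalars"]:
    # print the first few logged values for each scalar metric
    print(tag, [s.value for s in acc.Scalars(tag)][:5])
```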