End of training

Browse files

Files changed (2) hide show

README.md +3 -25
logs/learning_rate=0.0001, lr_scheduler_kwargs=__power___1.0___lr_end___2e-05_, lr_scheduler_type=polynomial, per_device_train_batch_size=8/events.out.tfevents.1726659900.1c1a426a2fee +3 -0

README.md CHANGED Viewed

@@ -77,7 +77,7 @@ LlamaForCausalLM(
 # Resource Usage
-- Max Train VRAM Use: 13.1269 GB
 - Available VRAM: 23.4329 GB
 - GPUs:
   - 1x NVIDIA GeForce RTX 4090
@@ -107,28 +107,6 @@ LlamaForCausalLM(
          (self_attn): LlamaSdpaAttention(
            (q_proj): Linear(in_features=576, out_features=576, bias=False)
            (k_proj): Linear(in_features=576, out_features=192, bias=False)
-@@ -10,17 +10,16 @@
-           (o_proj): Linear(in_features=576, out_features=576, bias=False)
-           (rotary_emb): LlamaRotaryEmbedding()
-         )
--        (mlp): LlamaMLP(
-+        (mlp): LigerSwiGLUMLP(
-           (gate_proj): Linear(in_features=576, out_features=1536, bias=False)
-           (up_proj): Linear(in_features=576, out_features=1536, bias=False)
-           (down_proj): Linear(in_features=1536, out_features=576, bias=False)
--          (act_fn): SiLU()
-         )
--        (input_layernorm): LlamaRMSNorm((576,), eps=1e-05)
--        (post_attention_layernorm): LlamaRMSNorm((576,), eps=1e-05)
-+        (input_layernorm): LigerRMSNorm((576,), eps=1e-05, offset=0.0)
-+        (post_attention_layernorm): LigerRMSNorm((576,), eps=1e-05, offset=0.0)
-       )
-     )
--    (norm): LlamaRMSNorm((576,), eps=1e-05)
-+    (norm): LigerRMSNorm((576,), eps=1e-05, offset=0.0)
-     (rotary_emb): LlamaRotaryEmbedding()
-   )
-   (lm_head): Linear(in_features=576, out_features=49152, bias=False)
 ```
@@ -136,7 +114,7 @@ LlamaForCausalLM(
 <br/>
 # Train Dataset
-Trained on 553,289,312 tokens from the [wikimedia/wikipedia](https://huggingface.co/datasets/wikimedia/wikipedia) dataset.
 - Num Samples: `998,000`
 - Subset: `20231101.en`
@@ -185,7 +163,7 @@ The following hyperparameters were used during training:
         weight=0
     )
 )`
-- lr_scheduler: `<torch.optim.lr_scheduler.LambdaLR object at 0x777e85f7fee0>`
 - student_model_name_or_path: `None`
 - student_config_name_or_path: `None`
 - student_model_config: `{'num_hidden_layers': 15}`

 # Resource Usage
+- Max Train VRAM Use: 13.1273 GB
 - Available VRAM: 23.4329 GB
 - GPUs:
   - 1x NVIDIA GeForce RTX 4090
          (self_attn): LlamaSdpaAttention(
            (q_proj): Linear(in_features=576, out_features=576, bias=False)
            (k_proj): Linear(in_features=576, out_features=192, bias=False)
 ```
 <br/>
 # Train Dataset
+Trained on 553,266,374 tokens from the [wikimedia/wikipedia](https://huggingface.co/datasets/wikimedia/wikipedia) dataset.
 - Num Samples: `998,000`
 - Subset: `20231101.en`
         weight=0
     )
 )`
+- lr_scheduler: `<torch.optim.lr_scheduler.LambdaLR object at 0x777cd1a972b0>`
 - student_model_name_or_path: `None`
 - student_config_name_or_path: `None`
 - student_model_config: `{'num_hidden_layers': 15}`

logs/learning_rate=0.0001, lr_scheduler_kwargs=__power___1.0___lr_end___2e-05_, lr_scheduler_type=polynomial, per_device_train_batch_size=8/events.out.tfevents.1726659900.1c1a426a2fee ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2b3327c6372ca3250e36ae5073eb60a778691dff1e2a96f7f85f8305ad3d8230
+size 529