Upload folder using huggingface_hub
- config.json +1 -0
- state_step19560.pt +3 -0
- train_config.json +1 -0
config.json
ADDED
@@ -0,0 +1 @@
+{"data": {"train_files": "data/fineweb_10B_gpt2/fineweb_train_*.bin", "train_tokens": null, "val_files": "data/fineweb_10B_gpt2/fineweb_val_*.bin", "val_tokens": null, "batch_size": 512, "device_batch_size": 32}, "model": {"dim": 768, "n_layers": 12, "n_heads": 12, "n_kv_heads": 12, "vocab_size": 50257, "multiple_of": 256, "ffn_dim_multiplier": 4, "norm_eps": 1e-05, "rope_theta": 10000, "use_scaled_rope": false, "max_seq_len": 1024, "initializer_range": 0.02, "zero_init_masks": false}, "optimizer": {"default": {"lr": 0.001, "beta1": 0.8, "beta2": 0.95, "eps": 1e-10, "weight_decay": 0}, "masks": {"lr": 0.001, "beta1": 0.8, "beta2": 0.95, "eps": 1e-10, "weight_decay": 0}, "norms": {"lr": 0.001, "beta1": 0.8, "beta2": 0.95, "eps": 1e-10, "weight_decay": 0}}, "scheduler": {"warmup_steps": 0.1, "start_factor": 0.1}, "gates": {"mean_targets": "auto", "mean_target_start": 1.0, "mean_target_end": 0.7, "var_target_delta": 0, "mean_coef_init": 0, "var_coef_init": 0, "coef_max": Infinity, "coef_min": -Infinity, "ema_steps_short": 1, "ema_steps_long": 10, "delta_min": 0.01, "coef_update_multiplier": 0.001}, "gates_zero_eps": 1e-08, "seed": 0, "project": "fineweb-gated", "run_id": null, "logdir": "logs/fineweb-gated", "log_gradients": false, "log_params": false, "log_every_steps": 10, "val_every_steps": -1, "save_every_steps": -1}
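For reference, the uploaded config can be read back with Python's standard `json` module, which tolerates the non-standard `Infinity` / `-Infinity` literals in the `gates` section. This is only a sketch: the local path assumes the file from this commit has been downloaded, and nothing else here is specified by the commit itself.

```python
import json

# Minimal sketch: assumes config.json from this commit is available locally.
with open("config.json") as f:
    cfg = json.load(f)  # Python's json accepts Infinity/-Infinity by default

model_cfg = cfg["model"]  # dim=768, n_layers=12, n_heads=12, vocab_size=50257
data_cfg = cfg["data"]    # FineWeb 10B shards (GPT-2 tokenizer), batch_size=512

print(model_cfg["dim"], model_cfg["max_seq_len"], data_cfg["batch_size"])
```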
state_step19560.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8da1a72d978bea8270aa3357757ee6d938cd3ea84d743eb26f19c8222a0ed1e4
+size 2054411826
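The checkpoint itself is a Git LFS object of roughly 2.05 GB. A hedged sketch of fetching and inspecting it with `huggingface_hub` and `torch` follows; the `repo_id` is a placeholder, and the structure of the saved training state is not described in this commit.

```python
import torch
from huggingface_hub import hf_hub_download

# Placeholder repo_id; substitute the repository that hosts this commit.
ckpt_path = hf_hub_download(repo_id="<user>/<repo>", filename="state_step19560.pt")

# Full training states are typically pickled objects rather than bare tensors;
# on recent torch versions that may require weights_only=False to deserialize.
state = torch.load(ckpt_path, map_location="cpu", weights_only=False)
print(type(state))
```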
train_config.json
ADDED
@@ -0,0 +1 @@
+{"data": {"train_files": "data/fineweb_10B_gpt2/fineweb_train_*.bin", "train_tokens": null, "val_files": "data/fineweb_10B_gpt2/fineweb_val_*.bin", "val_tokens": null, "batch_size": 512, "device_batch_size": 32}, "model": {"dim": 768, "n_layers": 12, "n_heads": 12, "n_kv_heads": 12, "vocab_size": 50257, "multiple_of": 256, "ffn_dim_multiplier": 4, "norm_eps": 1e-05, "rope_theta": 10000, "use_scaled_rope": false, "max_seq_len": 1024, "initializer_range": 0.02, "zero_init_masks": false}, "optimizer": {"default": {"lr": 0.001, "beta1": 0.8, "beta2": 0.95, "eps": 1e-10, "weight_decay": 0}, "masks": {"lr": 0.001, "beta1": 0.8, "beta2": 0.95, "eps": 1e-10, "weight_decay": 0}, "norms": {"lr": 0.001, "beta1": 0.8, "beta2": 0.95, "eps": 1e-10, "weight_decay": 0}}, "scheduler": {"warmup_steps": 0.1, "start_factor": 0.1}, "gates": {"mean_targets": "auto", "mean_target_start": 1.0, "mean_target_end": 0.7, "var_target_delta": 0, "mean_coef_init": 0, "var_coef_init": 0, "coef_max": Infinity, "coef_min": -Infinity, "ema_steps_short": 1, "ema_steps_long": 10, "delta_min": 0.01, "coef_update_multiplier": 0.001}, "gates_zero_eps": 1e-08, "seed": 0, "project": "fineweb-gated", "run_id": null, "logdir": "logs/fineweb-gated", "log_gradients": false, "log_params": false, "log_every_steps": 10, "val_every_steps": -1, "save_every_steps": -1}