Add files using upload-large-folder tool
Browse files- .gitattributes +9 -0
- 0000200000/.metadata +3 -0
- 0000200000/__0_0.distcp +3 -0
- 0000200000/__1_0.distcp +3 -0
- 0000200000/__2_0.distcp +3 -0
- 0000200000/__3_0.distcp +3 -0
- 0000200000/__4_0.distcp +3 -0
- 0000200000/__5_0.distcp +3 -0
- 0000200000/__6_0.distcp +3 -0
- 0000200000/__7_0.distcp +3 -0
- 0000200000/consolidated/consolidated.pth +3 -0
- 0000200000/consolidated/params.json +1 -0
- 0000200000/params.json +1 -0
- 0000200000/train_state_00000.json +1 -0
- 0000200000/train_state_00001.json +1 -0
- 0000200000/train_state_00002.json +1 -0
- 0000200000/train_state_00003.json +1 -0
- 0000200000/train_state_00004.json +1 -0
- 0000200000/train_state_00005.json +1 -0
- 0000200000/train_state_00006.json +1 -0
- 0000200000/train_state_00007.json +1 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,12 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
0000200000/__2_0.distcp filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
0000200000/__4_0.distcp filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
0000200000/__7_0.distcp filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
0000200000/__0_0.distcp filter=lfs diff=lfs merge=lfs -text
|
| 40 |
+
0000200000/__6_0.distcp filter=lfs diff=lfs merge=lfs -text
|
| 41 |
+
0000200000/.metadata filter=lfs diff=lfs merge=lfs -text
|
| 42 |
+
0000200000/__3_0.distcp filter=lfs diff=lfs merge=lfs -text
|
| 43 |
+
0000200000/__1_0.distcp filter=lfs diff=lfs merge=lfs -text
|
| 44 |
+
0000200000/__5_0.distcp filter=lfs diff=lfs merge=lfs -text
|
0000200000/.metadata
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f1f964897951b3963cadc2db42e3732027ff8c5c8f70007730f23f467585f64e
|
| 3 |
+
size 553385
|
0000200000/__0_0.distcp
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7472baec13695ad8e3545f449eaae05b7fb2ef93beb62feda8fad7a7eaebbfec
|
| 3 |
+
size 62147052
|
0000200000/__1_0.distcp
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5589dec9189f25295e2272d85fe18b00b44dc6d2820df8a5d7e5297ec78c07a9
|
| 3 |
+
size 62171564
|
0000200000/__2_0.distcp
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a27be60f0b01937046eed562c0db810e8a08d6d5a14f7b297f79e678a317b066
|
| 3 |
+
size 62166796
|
0000200000/__3_0.distcp
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:86ed276b433080d1667321a2fc87255e5382bd0519191b82e7307d18ec89334c
|
| 3 |
+
size 62166796
|
0000200000/__4_0.distcp
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ec63a66d306d7092ea08ead108c25da9e94983c212b29f401883c98ca002dd59
|
| 3 |
+
size 62166796
|
0000200000/__5_0.distcp
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:155aa45d74f82e93e6fcc6ce216d4d6501d8df6b537421a49404ca319c89d886
|
| 3 |
+
size 62166796
|
0000200000/__6_0.distcp
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:14a08cfabd19e9da5ab3dbfb3cc167ceef47e0278257e7afbbb605e038dda7df
|
| 3 |
+
size 62166796
|
0000200000/__7_0.distcp
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1540e1869b0ec05f898db2a9268f84a752a08fd6e1be419950d788c0d3d16149
|
| 3 |
+
size 62169072
|
0000200000/consolidated/consolidated.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0cdeb6f523d0576513e5c4c1f49037c4c62ce4b310fc88f71cd841d8a3ff0e38
|
| 3 |
+
size 494168442
|
0000200000/consolidated/params.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"name": "checkpoints/m1_40M_lr1e-3_steps200k_bs8_seqlen2048_python", "dump_dir": "checkpoints/m1_40M_lr1e-3_steps200k_bs8_seqlen2048_python", "seed": 42, "grad_acc_steps": 1, "gc_collect_freq": 1000, "probe_freq": null, "steps": 200000, "data": {"root_dir": "data", "sources": {"m1_python": 1.0}, "batch_size": 8, "seq_len": 2048, "n_views": 2, "seed": 42, "add_bos": true, "add_eos": true, "load_async": true, "prefetch_size": 64, "tokenizer": {"name": "bytes", "path": null}}, "optim": {"lr": 0.001, "weight_decay": 0.1, "epsilon": 1e-15, "beta1": 0.9, "beta2": 0.95, "clip": 1.0, "scheduler": "cosine", "warmup": 1000, "lr_min_ratio": 0.001, "cycle_length": 1.0, "cosine_theta": 1.0, "annealing_step": 1000, "decay_fraction": 0.1, "exp_factor": 0.5}, "model": {"dim": 512, "n_layers": 12, "head_dim": null, "n_heads": 8, "n_kv_heads": null, "ffn_dim_multiplier": null, "multiple_of": 256, "norm_eps": 1e-05, "rope_theta": 10000.0, "init_base_std": null, "init_std_factor": "disabled", "max_seqlen": 2048, "seed": 42, "vocab_size": 256, "weight_tying": false, "sliding_window": null}, "distributed": {"dp_shard": 1, "dp_replicate": 8, "tp_size": 1, "selective_activation_checkpointing": false, "compile": true, "fsdp_type": "full_shard", "model_dtype": "bf16", "float8_recipe": null, "float8_filter": "layers\\.[0-9]+\\.", "matmul_allow_tf32": false, "detect_anomaly": false, "compile_cache_size_limit": 8, "spawn_method": "forkserver"}, "env": {"MKL_SERVICE_FORCE_INTEL": "GNU", "OMP_NUM_THREADS": "1", "MKL_NUM_THREADS": "1", "ENABLE_INTRA_NODE_COMM": "1", "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", "NCCL_IB_TIMEOUT": "22", "NCCL_DEBUG": "INFO", "TORCH_NCCL_ASYNC_ERROR_HANDLING": "1"}, "checkpoint": {"dump": {"every": 10000, "keep": 5}, "eval": {"every": 1000, "keep": 0}, "path": "checkpoints/m1_40M_lr1e-3_steps200k_bs8_seqlen2048_python/checkpoints", "init_ckpt_path": null, "continue_training_from_init": false}, "profiling": {"run": false, "trace_folder": "profiling", "mem_warmup": 100, "mem_steps": 2, "profile_warmup": 102, "profile_steps": 2}, "logging": {"freq": 10, "acc_freq": null, "wandb": {"job_type": null, "dir": null, "project": "m1-byte-model-training", "entity": "linzhengisme-hku", "name": "checkpoints/m1_40M_lr1e-3_steps200k_bs8_seqlen2048_python"}}, "async_eval_gpus": null, "eval": null}
|
0000200000/params.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"name": "checkpoints/m1_40M_lr1e-3_steps200k_bs8_seqlen2048_python", "dump_dir": "checkpoints/m1_40M_lr1e-3_steps200k_bs8_seqlen2048_python", "seed": 42, "grad_acc_steps": 1, "gc_collect_freq": 1000, "probe_freq": null, "steps": 200000, "data": {"root_dir": "data", "sources": {"m1_python": 1.0}, "batch_size": 8, "seq_len": 2048, "n_views": 2, "seed": 42, "add_bos": true, "add_eos": true, "load_async": true, "prefetch_size": 64, "tokenizer": {"name": "bytes", "path": null}}, "optim": {"lr": 0.001, "weight_decay": 0.1, "epsilon": 1e-15, "beta1": 0.9, "beta2": 0.95, "clip": 1.0, "scheduler": "cosine", "warmup": 1000, "lr_min_ratio": 0.001, "cycle_length": 1.0, "cosine_theta": 1.0, "annealing_step": 1000, "decay_fraction": 0.1, "exp_factor": 0.5}, "model": {"dim": 512, "n_layers": 12, "head_dim": null, "n_heads": 8, "n_kv_heads": null, "ffn_dim_multiplier": null, "multiple_of": 256, "norm_eps": 1e-05, "rope_theta": 10000.0, "init_base_std": null, "init_std_factor": "disabled", "max_seqlen": 2048, "seed": 42, "vocab_size": 256, "weight_tying": false, "sliding_window": null}, "distributed": {"dp_shard": 1, "dp_replicate": 8, "tp_size": 1, "selective_activation_checkpointing": false, "compile": true, "fsdp_type": "full_shard", "model_dtype": "bf16", "float8_recipe": null, "float8_filter": "layers\\.[0-9]+\\.", "matmul_allow_tf32": false, "detect_anomaly": false, "compile_cache_size_limit": 8, "spawn_method": "forkserver"}, "env": {"MKL_SERVICE_FORCE_INTEL": "GNU", "OMP_NUM_THREADS": "1", "MKL_NUM_THREADS": "1", "ENABLE_INTRA_NODE_COMM": "1", "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", "NCCL_IB_TIMEOUT": "22", "NCCL_DEBUG": "INFO", "TORCH_NCCL_ASYNC_ERROR_HANDLING": "1"}, "checkpoint": {"dump": {"every": 10000, "keep": 5}, "eval": {"every": 1000, "keep": 0}, "path": "checkpoints/m1_40M_lr1e-3_steps200k_bs8_seqlen2048_python/checkpoints", "init_ckpt_path": null, "continue_training_from_init": false}, "profiling": {"run": false, "trace_folder": "profiling", "mem_warmup": 100, "mem_steps": 2, "profile_warmup": 102, "profile_steps": 2}, "logging": {"freq": 10, "acc_freq": null, "wandb": {"job_type": null, "dir": null, "project": "m1-byte-model-training", "entity": "linzhengisme-hku", "name": "checkpoints/m1_40M_lr1e-3_steps200k_bs8_seqlen2048_python"}}, "async_eval_gpus": null, "eval": null}
|
0000200000/train_state_00000.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"step": 200000, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 3138, "it_state": {"it_state": {"root_dir": "data", "sources": {"m1_python": 1.0}, "source_to_state": {"m1_python": {"file_path": "data/m1_python/m1.chunk.0.jsonl", "position": 29086182587, "block_size": 8, "offset": 0, "current_iter": 0}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 196159135155116677932205305920914360454, "inc": 252101603063402394885084957393789173453}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "bytes", "path": null}, "output_seq_len": 2048, "n_views": 2}, "seq_idx": 0, "rng_state": {"bit_generator": "PCG64", "state": {"state": 159089502457649373559076993564572206274, "inc": 257317082376085721142933171929815648017}, "has_uint32": 1, "uinteger": 2436652526}, "batch_size": 8, "prefetch_size": 64}, "scheduler": {"base_lrs": [0.001], "last_epoch": 200000, "verbose": false, "_step_count": 200001, "_get_lr_called_within_step": false, "_last_lr": [0.001], "lr_lambdas": [{}]}}
|
0000200000/train_state_00001.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"step": 200000, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 1451, "it_state": {"it_state": {"root_dir": "data", "sources": {"m1_python": 1.0}, "source_to_state": {"m1_python": {"file_path": "data/m1_python/m1.chunk.0.jsonl", "position": 29190456736, "block_size": 8, "offset": 1, "current_iter": 0}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 282118061576371729129189667169591116591, "inc": 246509925186285949978196491240064802315}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "bytes", "path": null}, "output_seq_len": 2048, "n_views": 2}, "seq_idx": 0, "rng_state": {"bit_generator": "PCG64", "state": {"state": 92564840609230895665670265469022350102, "inc": 173555323965545256606922338259303677603}, "has_uint32": 1, "uinteger": 2825050161}, "batch_size": 8, "prefetch_size": 64}, "scheduler": {"base_lrs": [0.001], "last_epoch": 200000, "verbose": false, "_step_count": 200001, "_get_lr_called_within_step": false, "_last_lr": [0.001], "lr_lambdas": [{}]}}
|
0000200000/train_state_00002.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"step": 200000, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 1355, "it_state": {"it_state": {"root_dir": "data", "sources": {"m1_python": 1.0}, "source_to_state": {"m1_python": {"file_path": "data/m1_python/m1.chunk.0.jsonl", "position": 28949236042, "block_size": 8, "offset": 2, "current_iter": 0}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 271684270557691219047439292967650232726, "inc": 234358335530849485425064040311006256713}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "bytes", "path": null}, "output_seq_len": 2048, "n_views": 2}, "seq_idx": 0, "rng_state": {"bit_generator": "PCG64", "state": {"state": 305702843431827399941012250571887074994, "inc": 319170006889470250209362588441616495209}, "has_uint32": 0, "uinteger": 69549507}, "batch_size": 8, "prefetch_size": 64}, "scheduler": {"base_lrs": [0.001], "last_epoch": 200000, "verbose": false, "_step_count": 200001, "_get_lr_called_within_step": false, "_last_lr": [0.001], "lr_lambdas": [{}]}}
|
0000200000/train_state_00003.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"step": 200000, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 1342, "it_state": {"it_state": {"root_dir": "data", "sources": {"m1_python": 1.0}, "source_to_state": {"m1_python": {"file_path": "data/m1_python/m1.chunk.0.jsonl", "position": 29018902899, "block_size": 8, "offset": 3, "current_iter": 0}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 151794252599659359375122450662351937557, "inc": 148211758571781046255077612135386035203}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "bytes", "path": null}, "output_seq_len": 2048, "n_views": 2}, "seq_idx": 0, "rng_state": {"bit_generator": "PCG64", "state": {"state": 266528125865865975445604700062343295539, "inc": 115810872492597857501795428972873905393}, "has_uint32": 1, "uinteger": 2963635329}, "batch_size": 8, "prefetch_size": 64}, "scheduler": {"base_lrs": [0.001], "last_epoch": 200000, "verbose": false, "_step_count": 200001, "_get_lr_called_within_step": false, "_last_lr": [0.001], "lr_lambdas": [{}]}}
|
0000200000/train_state_00004.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"step": 200000, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 4586, "it_state": {"it_state": {"root_dir": "data", "sources": {"m1_python": 1.0}, "source_to_state": {"m1_python": {"file_path": "data/m1_python/m1.chunk.0.jsonl", "position": 29058626970, "block_size": 8, "offset": 4, "current_iter": 0}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 185900580391629526145992749286354150885, "inc": 186633262021180533256729114674950595327}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "bytes", "path": null}, "output_seq_len": 2048, "n_views": 2}, "seq_idx": 0, "rng_state": {"bit_generator": "PCG64", "state": {"state": 29703770416253557178631657187237319654, "inc": 303111205818808944921858206842105131807}, "has_uint32": 1, "uinteger": 258524144}, "batch_size": 8, "prefetch_size": 64}, "scheduler": {"base_lrs": [0.001], "last_epoch": 200000, "verbose": false, "_step_count": 200001, "_get_lr_called_within_step": false, "_last_lr": [0.001], "lr_lambdas": [{}]}}
|
0000200000/train_state_00005.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"step": 200000, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 1126, "it_state": {"it_state": {"root_dir": "data", "sources": {"m1_python": 1.0}, "source_to_state": {"m1_python": {"file_path": "data/m1_python/m1.chunk.0.jsonl", "position": 29037285996, "block_size": 8, "offset": 5, "current_iter": 0}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 229087085351381209439456053365606859311, "inc": 329233669073478483697346584247981015037}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "bytes", "path": null}, "output_seq_len": 2048, "n_views": 2}, "seq_idx": 0, "rng_state": {"bit_generator": "PCG64", "state": {"state": 124590788042326845519645168297407832085, "inc": 47382953940698287647753879262736142901}, "has_uint32": 1, "uinteger": 2356543744}, "batch_size": 8, "prefetch_size": 64}, "scheduler": {"base_lrs": [0.001], "last_epoch": 200000, "verbose": false, "_step_count": 200001, "_get_lr_called_within_step": false, "_last_lr": [0.001], "lr_lambdas": [{}]}}
|
0000200000/train_state_00006.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"step": 200000, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 5165, "it_state": {"it_state": {"root_dir": "data", "sources": {"m1_python": 1.0}, "source_to_state": {"m1_python": {"file_path": "data/m1_python/m1.chunk.0.jsonl", "position": 28928035428, "block_size": 8, "offset": 6, "current_iter": 0}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 323442135590335487289480792993066470867, "inc": 95963489890761403814531195999220475639}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "bytes", "path": null}, "output_seq_len": 2048, "n_views": 2}, "seq_idx": 0, "rng_state": {"bit_generator": "PCG64", "state": {"state": 193506317864134853952122039857157180385, "inc": 72545526324180839152750112646078969085}, "has_uint32": 0, "uinteger": 1999884170}, "batch_size": 8, "prefetch_size": 64}, "scheduler": {"base_lrs": [0.001], "last_epoch": 200000, "verbose": false, "_step_count": 200001, "_get_lr_called_within_step": false, "_last_lr": [0.001], "lr_lambdas": [{}]}}
|
0000200000/train_state_00007.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"step": 200000, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 5124, "it_state": {"it_state": {"root_dir": "data", "sources": {"m1_python": 1.0}, "source_to_state": {"m1_python": {"file_path": "data/m1_python/m1.chunk.0.jsonl", "position": 29055996285, "block_size": 8, "offset": 7, "current_iter": 0}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 295975044814721532446505432440210883668, "inc": 53245743019587277358203950863334653629}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "bytes", "path": null}, "output_seq_len": 2048, "n_views": 2}, "seq_idx": 0, "rng_state": {"bit_generator": "PCG64", "state": {"state": 81004841028754437653436458738363201675, "inc": 19761753544780285878460645500694854795}, "has_uint32": 0, "uinteger": 2212175585}, "batch_size": 8, "prefetch_size": 64}, "scheduler": {"base_lrs": [0.001], "last_epoch": 200000, "verbose": false, "_step_count": 200001, "_get_lr_called_within_step": false, "_last_lr": [0.001], "lr_lambdas": [{}]}}
|