sagadre committed on
Commit 4c44549
1 Parent(s): 6ade3a7
c4_original-open_lm_7b-1.0/checkpoints/epoch_17.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b04707ca718e40d012b9bb43b38c01b2f5684884d81144a744a4eaf7df43e138
+ size 27560991570
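Note: the checkpoint is stored as a Git LFS pointer, so the repo records only the LFS spec version, the object's sha256 oid, and its byte size (~27.6 GB); the weights themselves live in LFS storage. A minimal sketch for verifying a downloaded file against this pointer (the local filename is hypothetical):

```python
import hashlib
import os

def verify_lfs_object(local_path: str, expected_oid: str, expected_size: int) -> bool:
    """Check a downloaded file against a Git LFS pointer's sha256 oid and size."""
    if os.path.getsize(local_path) != expected_size:
        return False
    h = hashlib.sha256()
    with open(local_path, "rb") as f:
        # Stream in 1 MiB chunks; a ~27.6 GB checkpoint won't fit in memory.
        for chunk in iter(lambda: f.read(1 << 20), b""):
            h.update(chunk)
    return h.hexdigest() == expected_oid

# Values taken from the pointer file above; the path is a hypothetical download location.
ok = verify_lfs_object(
    "epoch_17.pt",
    "b04707ca718e40d012b9bb43b38c01b2f5684884d81144a744a4eaf7df43e138",
    27560991570,
)
```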
c4_original-open_lm_7b-1.0/params.txt ADDED
@@ -0,0 +1,123 @@
+ accum_freq: 1
+ attn_activation: None
+ attn_name: auto
+ attn_seq_scalar: None
+ attn_seq_scalar_alpha: None
+ average: None
+ average_coefficients: None
+ beta1: 0.9
+ beta2: 0.95
+ checkpoint_path: /admin/home-sy/dcnlp_logs/c4_original-open_lm_7b-1.0/checkpoints
+ copy_codebase: False
+ data_key: txt
+ dataset_manifest: None
+ dataset_resampled: False
+ dataset_type: auto
+ ddp_static_graph: False
+ debug: False
+ delete_previous_checkpoint: True
+ device: cuda:0
+ disable_buffer: False
+ dist_backend: nccl
+ dist_url: env://
+ distill_model: None
+ distill_pretrained: None
+ distributed: False
+ epochs: 5
+ epochs_cooldown: None
+ eps: 1e-08
+ experimental_meta_device: False
+ ffn_type: swiglu
+ force_distributed: False
+ force_min_lr: 0.0
+ fsdp: False
+ fsdp_amp: False
+ fsdp_backward_prefetch: False
+ fsdp_checkpoint: False
+ fsdp_cpu_offload: False
+ fsdp_hybrid: False
+ fsdp_hybrid_o2: False
+ fsdp_limit_all_gathers: False
+ fsdp_pure_bf16: False
+ fsdp_use_orig_params: False
+ global_batch_size: 1
+ global_val_batch_size: 1
+ grad_checkpointing: False
+ grad_clip_norm: 1.0
+ hf_fsdp_block: None
+ hf_model: None
+ hf_seq_len: None
+ ignore_parse_errors: False
+ load_pretrained_state: False
+ local_rank: 0
+ log_every_n_steps: 20
+ log_level: 20
+ log_local: False
+ log_logit_mean: False
+ log_path: /admin/home-sy/dcnlp_logs/c4_original-open_lm_7b-1.0/out.log
+ logs: /admin/home-sy/dcnlp_logs
+ lr: 0.0003
+ lr_cooldown_end: 3e-05
+ lr_cooldown_power: 1.0
+ lr_scheduler: cosine
+ model: open_lm_7b
+ model_norm: lp_layer_norm
+ moe_capacity_factor: 1.25
+ moe_expert_model_parallelism: False
+ moe_freq: 0
+ moe_loss_weight: 0.1
+ moe_num_experts: None
+ moe_top_k: 2
+ moe_weight_parallelism: False
+ multiple_data_passes: False
+ name: c4_original-open_lm_7b-1.0
+ no_set_device_rank: False
+ optimizer: adamw
+ per_gpu_batch_size: 1
+ per_gpu_val_batch_size: 1
+ positional_embedding_type: rotary
+ precision: amp_bfloat16
+ pretrained: None
+ qk_norm: True
+ rank: 0
+ remote_sync: s3://dcnlp-west/dcnlp_experiments_v3
+ remote_sync_frequency: 300
+ remote_sync_protocol: s3
+ report_to:
+ resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-open_lm_7b-1.0/checkpoints/epoch_17.pt
+ save_frequency: 1
+ save_most_recent: False
+ seed: 124
+ seq_len: 2048
+ skip_scheduler: False
+ squash_mask_left: True
+ target_mask_individual: 50400
+ target_mask_left: 50300
+ tensorboard: False
+ tensorboard_path:
+ torchcompile: False
+ torchscript: False
+ trace: False
+ train_data: None
+ train_data_mix_weights: None
+ train_data_upsampling_factors: None
+ train_num_samples: None
+ use_bn_sync: False
+ use_bnb_linear: None
+ val_data: ['training/eval_data/val_tok_mult/de-en/val_de-en_000.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_010.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_020.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_030.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_040.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_050.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_060.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_070.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_080.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_090.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_100.tar']
+ val_data_key: ['json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz']
+ val_frequency: 5
+ val_iter_ci: 10000
+ val_max_pop_ci: 300000
+ val_num_samples: None
+ val_seq_ci: True
+ val_tok_ci: True
+ vocab_size: 50432
+ wandb: False
+ wandb_notes:
+ wandb_project_name: open-lm
+ warmup: 5000
+ wd: 0.33
+ workers: 2
+ world_size: 1
+ z_loss_coefficient: 0.0001
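Note: each params.txt is a flat dump of the open_lm training arguments, one `key: value` pair per line with values rendered by Python's str(). A minimal sketch for reading such a file back into a dict (the ast.literal_eval round-trip is an assumption about the formatting, not an open_lm API):

```python
import ast

def load_params(path: str) -> dict:
    """Parse a params.txt of "key: value" lines into a dict.

    Values are str()-rendered Python objects (None, True, 3e-05,
    lists, ...); anything that does not parse is kept as a raw
    string (paths, URLs, run names).
    """
    params = {}
    with open(path) as f:
        for line in f:
            if ":" not in line:
                continue  # skip blank or malformed lines
            key, _, raw = line.partition(":")
            raw = raw.strip()
            try:
                params[key.strip()] = ast.literal_eval(raw)
            except (ValueError, SyntaxError):
                params[key.strip()] = raw
    return params

params = load_params("c4_original-open_lm_7b-1.0/params.txt")
assert params["seq_len"] == 2048 and params["model"] == "open_lm_7b"
```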
rpj-open_lm_7b-1.0/checkpoints/epoch_39.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c1c1a2008f6f48a3a406b047a4f3ba689fe4c00c50362477c416c2807cdca19f
+ size 27560991506
rpj-open_lm_7b-1.0/params.txt ADDED
@@ -0,0 +1,100 @@
+ accum_freq: 4
+ average: None
+ average_coefficients: None
+ batch_size: 16
+ beta1: 0.9
+ beta2: 0.95
+ checkpoint_path: ./logs/vaishaal_open_lm_7b_cc1_without_replacement_137B_tokens_h100_rpj_only/checkpoints
+ copy_codebase: False
+ data_key: json
+ dataset_manifest: ['s3://permanent-813987666268/users/vaishaal/mlr/open_lm/rpj_tokenized_upsampled_eleutherai/manifest.jsonl']
+ dataset_resampled: False
+ dataset_type: auto
+ ddp_static_graph: False
+ debug: False
+ delete_previous_checkpoint: False
+ device: cuda:0
+ disable_buffer: False
+ dist_backend: nccl
+ dist_url: env://
+ distill_model: None
+ distill_pretrained: None
+ distributed: True
+ epochs: 64
+ epochs_cooldown: None
+ eps: 1e-08
+ ffn_type: swiglu
+ force_min_lr: 0.0
+ fsdp: True
+ fsdp_amp: False
+ fsdp_backward_prefetch: False
+ fsdp_checkpoint: False
+ fsdp_cpu_offload: False
+ fsdp_hybrid: False
+ fsdp_hybrid_o2: False
+ fsdp_limit_all_gathers: True
+ fsdp_pure_bf16: True
+ fsdp_use_orig_params: False
+ grad_checkpointing: False
+ grad_clip_norm: 1.0
+ hf_fsdp_block: None
+ hf_model: None
+ hf_seq_len: None
+ ignore_parse_errors: True
+ load_pretrained_state: False
+ local_rank: 0
+ log_every_n_steps: 20
+ log_level: 20
+ log_local: False
+ log_logit_mean: False
+ log_path: ./logs/vaishaal_open_lm_7b_cc1_without_replacement_137B_tokens_h100_rpj_only/out.log
+ logs: ./logs/
+ lr: 0.0003
+ lr_cooldown_end: 3e-05
+ lr_cooldown_power: 1.0
+ lr_scheduler: cosine
+ model: open_lm_7b
+ model_norm: lp_layer_norm
+ name: vaishaal_open_lm_7b_cc1_without_replacement_137B_tokens_h100_rpj_only
+ no_set_device_rank: False
+ optimizer: adamw
+ positional_embedding_type: rotary
+ precision: amp_bfloat16
+ pretrained: None
+ qk_norm: True
+ rank: 0
+ remote_sync: s3://permanent-813987666268/users/vaishaal/mlr/open_lm/checkpoints
+ remote_sync_frequency: 300
+ remote_sync_protocol: s3
+ report_to: wandb
+ resume: s3://permanent-813987666268/users/vaishaal/mlr/open_lm/checkpoints/vaishaal_open_lm_7b_cc1_without_replacement_137B_tokens_h100_rpj_only/checkpoints/epoch_23.pt
+ save_frequency: 1
+ save_most_recent: False
+ seed: 124
+ seq_len: 2048
+ skip_scheduler: False
+ target_mask_individual: None
+ target_mask_left: None
+ tensorboard: False
+ tensorboard_path:
+ torchcompile: False
+ torchscript: False
+ trace: False
+ train_data: None
+ train_data_mix_weights: None
+ train_data_upsampling_factors: None
+ train_num_samples: 1052856
+ use_bn_sync: False
+ use_bnb_linear: None
+ val_data: None
+ val_frequency: 1
+ val_num_samples: None
+ vocab_size: 50432
+ wandb: True
+ wandb_notes:
+ wandb_project_name: open_lm
+ warmup: 5000
+ wd: 0.1
+ workers: 4
+ world_size: 64
+ z_loss_coefficient: 0.0001
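Note: the token budget follows directly from these settings. Reading train_num_samples as the number of sequences consumed per "epoch" (open_lm's checkpointing interval, an interpretation consistent with the run name rather than stated in this file), the run covers train_num_samples × seq_len × epochs tokens:

```python
train_num_samples = 1_052_856  # sequences per "epoch" / checkpoint interval (assumed)
seq_len = 2048                 # tokens per sequence
epochs = 64                    # checkpoint intervals in the full run

tokens_per_epoch = train_num_samples * seq_len  # ~2.16B tokens
total_tokens = tokens_per_epoch * epochs        # ~138B tokens
print(f"{total_tokens / 1e9:.1f}B")             # matches the "137B_tokens" run name
```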
rw_original-open_lm_7b-1.0/checkpoints/epoch_47.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:169754a84143f9b9956684e0596a67a8379576beee50e186220abe277d0dd422
+ size 27560991506
rw_original-open_lm_7b-1.0/params.txt ADDED
@@ -0,0 +1,102 @@
+ accum_freq: 4
+ average: None
+ average_coefficients: None
+ batch_size: 16
+ beta1: 0.9
+ beta2: 0.95
+ checkpoint_path: ./logs/refined_web_7b_CC1_experiment_16_nodes/checkpoints
+ copy_codebase: False
+ data_key: json.gz
+ dataset_manifest: ['s3://permanent-813987666268/users/vaishaal/mlr/refined_web_tokenized/manifest.jsonl']
+ dataset_resampled: False
+ dataset_type: auto
+ ddp_static_graph: False
+ debug: False
+ delete_previous_checkpoint: False
+ device: cuda:0
+ disable_buffer: False
+ dist_backend: nccl
+ dist_url: env://
+ distill_model: None
+ distill_pretrained: None
+ distributed: True
+ epochs: 64
+ epochs_cooldown: None
+ eps: 1e-08
+ ffn_type: swiglu
+ force_min_lr: 0.0
+ fsdp: True
+ fsdp_amp: False
+ fsdp_backward_prefetch: False
+ fsdp_checkpoint: False
+ fsdp_cpu_offload: False
+ fsdp_hybrid: False
+ fsdp_hybrid_o2: False
+ fsdp_limit_all_gathers: True
+ fsdp_pure_bf16: True
+ fsdp_use_orig_params: False
+ grad_checkpointing: False
+ grad_clip_norm: 1.0
+ hf_fsdp_block: None
+ hf_model: None
+ hf_seq_len: None
+ ignore_parse_errors: True
+ load_pretrained_state: False
+ local_rank: 0
+ log_every_n_steps: 20
+ log_level: 20
+ log_local: False
+ log_logit_mean: False
+ log_path: ./logs/refined_web_7b_CC1_experiment_16_nodes/out.log
+ logs: ./logs/
+ lr: 0.0003
+ lr_cooldown_end: 3e-05
+ lr_cooldown_power: 1.0
+ lr_scheduler: cosine
+ model: open_lm_7b
+ model_norm: lp_layer_norm
+ name: refined_web_7b_CC1_experiment_16_nodes
+ no_set_device_rank: False
+ optimizer: adamw
+ positional_embedding_type: rotary
+ precision: amp_bfloat16
+ pretrained: None
+ qk_norm: True
+ rank: 0
+ remote_sync: s3://permanent-813987666268/users/vaishaal/mlr/open_lm/checkpoints
+ remote_sync_frequency: 300
+ remote_sync_protocol: s3
+ report_to: wandb
+ resume: None
+ save_frequency: 1
+ save_most_recent: False
+ seed: 124
+ seq_len: 2048
+ skip_scheduler: False
+ target_mask_individual: None
+ target_mask_left: None
+ tensorboard: False
+ tensorboard_path:
+ torchcompile: False
+ torchscript: False
+ trace: False
+ train_data: None
+ train_data_mix_weights: None
+ train_data_upsampling_factors: None
+ train_num_samples: 1052856
+ use_bn_sync: False
+ use_bnb_linear: None
+ val_batch_size: None
+ val_data: None
+ val_data_key: txt
+ val_frequency: 1
+ val_num_samples: None
+ vocab_size: 50432
+ wandb: True
+ wandb_notes:
+ wandb_project_name: open_lm
+ warmup: 5000
+ wd: 0.1
+ workers: 4
+ world_size: 128
+ z_loss_coefficient: 0.0001
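Note: unlike the single-process c4 dump above, the two FSDP runs state throughput per rank, so the effective global batch is batch_size × accum_freq × world_size. A quick sketch of that arithmetic (reading batch_size as per-GPU, which is an assumption about the argument naming rather than something this file states):

```python
batch_size = 16    # sequences per GPU per forward pass (assumed per-GPU)
accum_freq = 4     # gradient-accumulation steps per optimizer step
world_size = 128   # GPUs in the run
seq_len = 2048     # tokens per sequence

global_batch = batch_size * accum_freq * world_size  # 8192 sequences
tokens_per_step = global_batch * seq_len             # ~16.8M tokens per optimizer step
```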